# Initialize Spark session (BERDL JupyterHub — no import needed)
spark = get_spark_session()
spark.sql('SELECT 1 AS ok').show()
print('Spark session ready.')

+---+
| ok|
+---+
|  1|
+---+

Spark session ready.

import pandas as pd
import numpy as np
import os
import re
import json
import cobra
from collections import defaultdict

DATA_DIR = '../data'
MODEL_DIR = f'{DATA_DIR}/models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Load genome manifest from NB01
manifest = pd.read_csv(f'{DATA_DIR}/genome_manifest.tsv', sep='\t')
manifest['has_pangenome'] = manifest['gtdb_species_clade_id'].notna()
print(f'Loaded manifest: {len(manifest)} genomes')
print(f'  With pangenome match: {manifest["has_pangenome"].sum()}')
print(f'  Without pangenome match: {(~manifest["has_pangenome"]).sum()}')

# Load carbon source experiments
fb_exps = pd.read_csv(f'{DATA_DIR}/fb_carbon_experiments.tsv', sep='\t')
print(f'Carbon source experiments: {len(fb_exps)}')

manifest[['orgId', 'full_name', 'n_carbon_sources', 'has_pangenome']].head()

Loaded manifest: 14 genomes
  With pangenome match: 14
  Without pangenome match: 0
Carbon source experiments: 717

# Load ModelSEED biochemistry
ms_rxns = pd.read_csv(f'{DATA_DIR}/modelseed_biochem/reactions.tsv', sep='\t', low_memory=False)
ms_cpds = pd.read_csv(f'{DATA_DIR}/modelseed_biochem/compounds.tsv', sep='\t', low_memory=False)

print(f'ModelSEED reactions: {len(ms_rxns)}')
print(f'ModelSEED compounds: {len(ms_cpds)}')
print(f'Reactions with EC: {ms_rxns["ec_numbers"].notna().sum()}')

# Build EC -> reaction mapping (explode pipe-separated EC numbers)
ec_rows = []
for _, row in ms_rxns[ms_rxns['ec_numbers'].notna()].iterrows():
    for ec in str(row['ec_numbers']).split('|'):
        ec = ec.strip()
        if ec and re.match(r'\d+\.\d+\.\d+\.\d+', ec):
            ec_rows.append({'ec': ec, 'rxn_id': row['id']})

ec_to_rxn = pd.DataFrame(ec_rows)
print(f'\nEC -> reaction mappings: {len(ec_to_rxn)}')
print(f'Unique EC numbers: {ec_to_rxn["ec"].nunique()}')
print(f'Unique reactions: {ec_to_rxn["rxn_id"].nunique()}')

# Load GramNegative template
tmpl_rxns = pd.read_csv(f'{DATA_DIR}/modelseed_templates/Reactions.tsv', sep='\t')
tmpl_biomass_cpds = pd.read_csv(f'{DATA_DIR}/modelseed_templates/BiomassCompounds.tsv', sep='\t')
print(f'\nGramNeg template reactions: {len(tmpl_rxns)}')
print(f'Biomass compounds: {len(tmpl_biomass_cpds)}')

ModelSEED reactions: 43774
ModelSEED compounds: 33992
Reactions with EC: 25767

EC -> reaction mappings: 25796
Unique EC numbers: 6628
Unique reactions: 21925

GramNeg template reactions: 8671
Biomass compounds: 78

# === Source 1: BERDL pangenome annotations (bakta EC) ===
# For organisms with has_pangenome=True, get EC numbers from bakta annotations
pangenome_orgs = manifest[manifest['has_pangenome'] == True].copy()
print(f'Organisms with pangenome match: {len(pangenome_orgs)}')

# Get bakta EC annotations for each matched species clade
pangenome_ecs = {}
for _, row in pangenome_orgs.iterrows():
    clade_id = row['gtdb_species_clade_id']
    orgId = row['orgId']
    
    # Get EC numbers from bakta annotations for this clade's gene clusters
    ec_df = spark.sql(f"""
        SELECT DISTINCT ba.ec
        FROM kbase_ke_pangenome.bakta_annotations ba
        JOIN kbase_ke_pangenome.gene_cluster gc 
            ON ba.gene_cluster_id = gc.gene_cluster_id
        WHERE gc.gtdb_species_clade_id = '{clade_id}'
        AND ba.ec IS NOT NULL AND ba.ec != ''
    """).toPandas()
    
    # Parse EC numbers (may be comma-separated)
    ecs = set()
    for ec_str in ec_df['ec'].dropna():
        for ec in str(ec_str).split(','):
            ec = ec.strip()
            if re.match(r'\d+\.\d+\.\d+\.\d+', ec):
                ecs.add(ec)
    
    pangenome_ecs[orgId] = ecs
    print(f'  {orgId}: {len(ecs)} EC numbers (bakta)')

print(f'\nTotal organisms with pangenome EC: {len(pangenome_ecs)}')

Organisms with pangenome match: 14

  Btheta: 647 EC numbers (bakta)

  Koxy: 1185 EC numbers (bakta)

  Smeli: 945 EC numbers (bakta)

  Phaeo: 683 EC numbers (bakta)

  Keio: 936 EC numbers (bakta)

  HerbieS: 713 EC numbers (bakta)

  Cup4G11: 892 EC numbers (bakta)

  Dino: 657 EC numbers (bakta)

  Marino: 659 EC numbers (bakta)

# Also get eggNOG EC annotations to supplement bakta
for _, row in pangenome_orgs.iterrows():
    clade_id = row['gtdb_species_clade_id']
    orgId = row['orgId']
    
    eggnog_df = spark.sql(f"""
        SELECT DISTINCT e.EC
        FROM kbase_ke_pangenome.eggnog_mapper_annotations e
        JOIN kbase_ke_pangenome.gene_cluster gc
            ON e.query_name = gc.gene_cluster_id
        WHERE gc.gtdb_species_clade_id = '{clade_id}'
        AND e.EC IS NOT NULL AND e.EC != '' AND e.EC != '-'
    """).toPandas()
    
    new_ecs = set()
    for ec_str in eggnog_df['EC'].dropna():
        for ec in str(ec_str).split(','):
            ec = ec.strip()
            if re.match(r'\d+\.\d+\.\d+\.\d+', ec):
                new_ecs.add(ec)
    
    before = len(pangenome_ecs[orgId])
    pangenome_ecs[orgId] |= new_ecs
    added = len(pangenome_ecs[orgId]) - before
    if added > 0:
        print(f'  {orgId}: +{added} from eggNOG → {len(pangenome_ecs[orgId])} total')

  Btheta: +485 from eggNOG → 1132 total

  Koxy: +555 from eggNOG → 1740 total

  Smeli: +663 from eggNOG → 1608 total

  Phaeo: +443 from eggNOG → 1126 total

  Keio: +306 from eggNOG → 1242 total

  HerbieS: +448 from eggNOG → 1161 total

  Cup4G11: +524 from eggNOG → 1416 total

  Dino: +386 from eggNOG → 1043 total

  Marino: +410 from eggNOG → 1069 total

  MR1: +289 from eggNOG → 975 total

# === Source 2: FB gene descriptions (extract EC numbers) ===
# For organisms WITHOUT pangenome match
no_pangenome_orgs = manifest[manifest['has_pangenome'] == False].copy()
print(f'Organisms without pangenome match: {len(no_pangenome_orgs)}')

fb_ecs = {}
for _, row in no_pangenome_orgs.iterrows():
    orgId = row['orgId']
    
    # Get gene descriptions from FB
    gene_df = spark.sql(f"""
        SELECT locusId, desc
        FROM kescience_fitnessbrowser.gene
        WHERE orgId = '{orgId}'
        AND type = '1'
    """).toPandas()
    
    # Extract EC numbers from descriptions using regex
    # Common patterns: "EC 1.2.3.4", "(EC 1.2.3.4)", "EC:1.2.3.4", or standalone "1.2.3.4"
    ecs = set()
    ec_pattern = re.compile(r'(?:EC[:\s]?)?([1-7]\.[0-9-]+\.[0-9-]+\.[0-9-]+)')
    for desc in gene_df['desc'].dropna():
        for m in ec_pattern.finditer(str(desc)):
            ec = m.group(1)
            # Only keep fully specified EC numbers (no dashes)
            if '-' not in ec:
                ecs.add(ec)
    
    fb_ecs[orgId] = ecs
    print(f'  {orgId}: {len(ecs)} EC numbers (from gene desc, {len(gene_df)} genes)')

print(f'\nTotal organisms with FB-derived EC: {len(fb_ecs)}')

Organisms without pangenome match: 0

Total organisms with FB-derived EC: 0

# Combine all EC annotations
all_ecs = {}
all_ecs.update(pangenome_ecs)
all_ecs.update(fb_ecs)

# Summary
ec_summary = []
for orgId in manifest['orgId']:
    ecs = all_ecs.get(orgId, set())
    source = 'pangenome' if orgId in pangenome_ecs else 'fb_desc'
    ec_summary.append({'orgId': orgId, 'n_ec': len(ecs), 'source': source})

ec_summary_df = pd.DataFrame(ec_summary).sort_values('n_ec', ascending=False)
print('EC annotation summary:')
print(ec_summary_df.to_string())
print(f'\nMean EC per organism: {ec_summary_df["n_ec"].mean():.0f}')
print(f'By source:')
print(ec_summary_df.groupby('source')['n_ec'].agg(['mean', 'min', 'max']))

# Save EC annotations
ec_rows_out = []
for orgId, ecs in all_ecs.items():
    source = 'pangenome' if orgId in pangenome_ecs else 'fb_desc'
    for ec in sorted(ecs):
        ec_rows_out.append({'orgId': orgId, 'ec': ec, 'source': source})
ec_out_df = pd.DataFrame(ec_rows_out)
ec_out_df.to_csv(f'{DATA_DIR}/ec_annotations.tsv', sep='\t', index=False)
print(f'\nSaved {len(ec_out_df)} EC annotations to data/ec_annotations.tsv')

EC annotation summary:
       orgId  n_ec     source
1       Koxy  1740  pangenome
2      Smeli  1608  pangenome
6    Cup4G11  1416  pangenome
4       Keio  1242  pangenome
10    azobra  1202  pangenome
5    HerbieS  1161  pangenome
0     Btheta  1132  pangenome
3      Phaeo  1126  pangenome
8     Marino  1069  pangenome
7       Dino  1043  pangenome
11       PV4  1013  pangenome
9        MR1   975  pangenome
12     Korea   949  pangenome
13  Dyella79   938  pangenome

Mean EC per organism: 1187
By source:
                  mean  min   max
source                           
pangenome  1186.714286  938  1740

Saved 16614 EC annotations to data/ec_annotations.tsv

# Download FB protein sequences and split into per-organism FASTAs
import urllib.request
from pathlib import Path

PROTEOME_DIR = Path(DATA_DIR) / 'proteomes'
PROTEOME_DIR.mkdir(exist_ok=True)
aaseqs_path = Path(DATA_DIR) / 'fb_aaseqs_all.fasta'

# Download if not cached
if not aaseqs_path.exists():
    print("Downloading FB protein sequences from fit.genomics.lbl.gov...")
    urllib.request.urlretrieve('https://fit.genomics.lbl.gov/cgi_data/aaseqs', str(aaseqs_path))
    n_total = sum(1 for line in open(aaseqs_path) if line.startswith('>'))
    print(f"Downloaded {n_total:,} protein sequences")
else:
    n_total = sum(1 for line in open(aaseqs_path) if line.startswith('>'))
    print(f"Using cached download: {n_total:,} protein sequences")

# Split into per-organism FASTAs (only our target organisms)
target_orgs = set(manifest['orgId'])
org_counts = {}

current_org = None
current_fh = None

with open(aaseqs_path) as f:
    for line in f:
        if line.startswith('>'):
            # Header format: >orgId:locusId
            header = line[1:].strip()
            parts = header.split(':', 1)
            orgId = parts[0]
            locusId = parts[1] if len(parts) > 1 else parts[0]
            
            if orgId in target_orgs:
                if orgId != current_org:
                    if current_fh is not None:
                        current_fh.close()
                    fasta_path = PROTEOME_DIR / f'{orgId}.fasta'
                    current_fh = open(fasta_path, 'w')
                    current_org = orgId
                    org_counts[orgId] = 0
                
                # Write with just locusId as header (MSGenome needs unique IDs)
                current_fh.write(f'>{locusId}\n')
                org_counts[orgId] = org_counts.get(orgId, 0) + 1
            else:
                if current_fh is not None:
                    current_fh.close()
                    current_fh = None
                current_org = None
        elif current_org is not None and current_fh is not None:
            current_fh.write(line)

if current_fh is not None:
    current_fh.close()

print(f"\nSplit FASTAs for {len(org_counts)} target organisms:")
for orgId in manifest['orgId']:
    n = org_counts.get(orgId, 0)
    status = 'OK' if n > 0 else 'MISSING'
    print(f"  {orgId}: {n:,} proteins [{status}]")

Using cached download: 272,076 protein sequences

Split FASTAs for 14 target organisms:
  Btheta: 4,816 proteins [OK]
  Koxy: 5,309 proteins [OK]
  Smeli: 6,218 proteins [OK]
  Phaeo: 3,875 proteins [OK]
  Keio: 4,146 proteins [OK]
  HerbieS: 4,715 proteins [OK]
  Cup4G11: 7,358 proteins [OK]
  Dino: 4,192 proteins [OK]
  Marino: 4,410 proteins [OK]
  MR1: 4,467 proteins [OK]
  azobra: 5,488 proteins [OK]
  PV4: 3,859 proteins [OK]
  Korea: 4,149 proteins [OK]
  Dyella79: 4,375 proteins [OK]

# RAST-annotate each genome using ModelSEEDpy's RastClient
# Sends protein sequences to tutorial.theseed.org for functional annotation
import pickle
from modelseedpy.core.msgenome import MSGenome
from modelseedpy.core.rast_client import RastClient

rast = RastClient()
genomes = {}  # orgId → MSGenome (annotated)
rast_cache = Path(DATA_DIR) / 'rast_genomes.pkl'

if rast_cache.exists():
    print("Loading cached RAST annotations...")
    with open(rast_cache, 'rb') as f:
        genomes = pickle.load(f)
    print(f"Loaded {len(genomes)} annotated genomes from cache")
else:
    for i, orgId in enumerate(manifest['orgId']):
        fasta_path = PROTEOME_DIR / f'{orgId}.fasta'
        if not fasta_path.exists():
            print(f"  [{i+1}/{len(manifest)}] {orgId}: SKIPPED (no FASTA)")
            continue
        
        genome = MSGenome.from_fasta(str(fasta_path), split=" ")
        n_features = len(genome.features)
        
        print(f"  [{i+1}/{len(manifest)}] {orgId}: annotating {n_features} proteins with RAST...", 
              end='', flush=True)
        try:
            rast.annotate_genome(genome)
            n_annotated = sum(1 for f in genome.features 
                            if f.ontology_terms.get('RAST'))
            genomes[orgId] = genome
            print(f" done ({n_annotated}/{n_features} annotated)")
        except Exception as e:
            print(f" ERROR: {e}")
    
    # Cache results
    with open(rast_cache, 'wb') as f:
        pickle.dump(genomes, f)
    print(f"\nCached {len(genomes)} annotated genomes to {rast_cache}")

# Summary
print(f"\nRAST annotation summary:")
for orgId in manifest['orgId']:
    if orgId in genomes:
        g = genomes[orgId]
        n_feat = len(g.features)
        n_rast = sum(1 for f in g.features if f.ontology_terms.get('RAST'))
        n_terms = sum(len(f.ontology_terms.get('RAST', [])) for f in g.features)
        print(f"  {orgId}: {n_feat} features, {n_rast} with RAST annotation, {n_terms} total terms")
    else:
        print(f"  {orgId}: NOT ANNOTATED")

modelseedpy 0.4.2

Loading cached RAST annotations...
Loaded 14 annotated genomes from cache

RAST annotation summary:
  Btheta: 4816 features, 4760 with RAST annotation, 4917 total terms
  Koxy: 5309 features, 5045 with RAST annotation, 5298 total terms
  Smeli: 6218 features, 6118 with RAST annotation, 6549 total terms
  Phaeo: 3875 features, 3679 with RAST annotation, 3832 total terms
  Keio: 4146 features, 4093 with RAST annotation, 4299 total terms
  HerbieS: 4715 features, 4579 with RAST annotation, 4757 total terms
  Cup4G11: 7358 features, 6471 with RAST annotation, 6733 total terms
  Dino: 4192 features, 4159 with RAST annotation, 4307 total terms
  Marino: 4410 features, 3646 with RAST annotation, 3793 total terms
  MR1: 4467 features, 4303 with RAST annotation, 4450 total terms
  azobra: 5488 features, 4506 with RAST annotation, 4668 total terms
  PV4: 3859 features, 3497 with RAST annotation, 3644 total terms
  Korea: 4149 features, 3260 with RAST annotation, 3383 total terms
  Dyella79: 4375 features, 3542 with RAST annotation, 3665 total terms

# Build a mapping from FB carbon source names to ModelSEED compound IDs
# Strategy: normalize names and do fuzzy matching

# Get unique carbon sources from our experiments
carbon_sources = fb_exps['condition_1'].dropna().unique()
print(f'Unique carbon sources to map: {len(carbon_sources)}')

# Normalize compound names for matching
def normalize_name(name):
    """Normalize compound name for matching."""
    s = str(name).lower().strip()
    for suffix in [' monohydrate', ' dihydrate', ' pentahydrate', ' hexahydrate',
                   ' hydrochloride', ' hcl', ' sodium salt', ' disodium salt',
                   ' dipotassium salt', ' monopotassium salt', ' dibasic',
                   ' tribasic', ' hydrate', ' anhydrous']:
        s = s.replace(suffix, '')
    for prefix in ['sodium ', 'potassium ', 'trisodium ', 'disodium ']:
        if s.startswith(prefix):
            s = s[len(prefix):]
    s = s.strip()
    return s

# Build ModelSEED compound lookup
ms_cpds['name_norm'] = ms_cpds['name'].apply(normalize_name)
cpd_by_name = {}
cpd_by_norm = {}
for _, row in ms_cpds.iterrows():
    name_lower = str(row['name']).lower().strip()
    cpd_by_name[name_lower] = row['id']
    cpd_by_norm[row['name_norm']] = row['id']

# Manual mapping for common FB carbon source names
# Verified against ModelSEED compounds.tsv
manual_map = {
    'D-Glucose': 'cpd00027',
    'D-Fructose': 'cpd00082',
    'D-Galactose': 'cpd00108',
    'D-Mannose': 'cpd00138',
    'D-Xylose': 'cpd00154',
    'L-Arabinose': 'cpd00224',
    'D-Ribose': 'cpd00105',
    'Glycerol': 'cpd00100',
    'D-Sorbitol': 'cpd00588',
    'D-Maltose monohydrate': 'cpd00179',
    'D-Trehalose dihydrate': 'cpd00794',
    'D-Cellobiose': 'cpd00158',
    'D-Raffinose pentahydrate': 'cpd30716',  # cpd03601 is Stizolobinate (wrong match)
    'Beta-Lactose': 'cpd00208',
    'Sucrose': 'cpd00076',
    'L-Fucose': 'cpd00751',
    'D-Galacturonic Acid monohydrate': 'cpd00280',
    'D-Glucuronic Acid': 'cpd00164',
    'D-Gluconic Acid sodium salt': 'cpd00222',
    'D-Glucosamine Hydrochloride': 'cpd00276',  # D-Glucosamine (GLUM)
    'N-Acetyl-D-Glucosamine': 'cpd00122',
    'Sodium pyruvate': 'cpd00020',
    'Sodium D,L-Lactate': 'cpd00159',
    'Sodium D-Lactate': 'cpd00221',
    'Sodium L-Lactate': 'cpd00159',
    'Sodium succinate dibasic hexahydrate': 'cpd00036',
    'Sodium Fumarate dibasic': 'cpd00106',
    'L-Malic acid disodium salt monohydrate': 'cpd00130',
    'Trisodium citrate dihydrate': 'cpd00137',
    'Potassium acetate': 'cpd00029',
    'a-Ketoglutaric acid disodium salt hydrate': 'cpd00024',
    'L-Glutamine': 'cpd00053',
    'L-Glutamic acid monopotassium salt monohydrate': 'cpd00023',
    'L-Proline': 'cpd00129',
    'L-Alanine': 'cpd00035',
    'L-Asparagine': 'cpd00132',
    'L-Aspartic acid': 'cpd00041',  # L-Aspartate
    'L-Serine': 'cpd00054',
    'L-Threonine': 'cpd00161',
    'Ethanol': 'cpd00363',
    'D-Mannitol': 'cpd00314',
    'm-Inositol': 'cpd00121',
    'myo-Inositol': 'cpd00121',
    'Acetoin': 'cpd00853',
    'D-Alanine': 'cpd00117',
    'Glycine': 'cpd00033',
    'L-Rhamnose monohydrate': 'cpd00396',
    'Adenosine': 'cpd00182',
    'Cytidine': 'cpd00367',
    'Uridine': 'cpd00249',
    'Inosine': 'cpd00246',
    'Thymidine': 'cpd00184',
    "2'-Deoxyinosine": 'cpd03279',      # Deoxyinosine (din)
    "2'-Deoxyadenosine": 'cpd00438',    # Deoxyadenosine (dad-2)
    "2'-Deoxycytidine": 'cpd00654',     # Deoxycytidine (dcyt)
    "2'-Deoxyuridine": 'cpd00412',      # Deoxyuridine (duri)
    'D-Glucaric acid potassium salt': 'cpd00652',
    '2-Deoxy-D-Ribose': 'cpd01242',    # Thyminose = 2-Deoxy-D-Ribose (drib)
    'Putrescine dihydrochloride': 'cpd00118',
    'Sodium butyrate': 'cpd00211',           # Butyrate (C4H7O2)
    'Sodium propionate': 'cpd00141',         # Propionate (C3H5O2)
    'D-Glucose-6-Phosphate sodium salt': 'cpd00079',  # D-glucose-6-phosphate (C6H11O9P)
    'a-Cyclodextrin': 'cpd24451',
    'L-Leucine': 'cpd00107',
    'Tween 20': None,
    'casamino acids': None,
    'Gelatin': None,
}

# Map all carbon sources
carbon_map = {}
mapped_count = 0
unmapped = []

for cs in carbon_sources:
    if cs in manual_map:
        carbon_map[cs] = manual_map[cs]
        if manual_map[cs] is not None:
            mapped_count += 1
        continue
    
    cs_lower = cs.lower().strip()
    if cs_lower in cpd_by_name:
        carbon_map[cs] = cpd_by_name[cs_lower]
        mapped_count += 1
        continue
    
    cs_norm = normalize_name(cs)
    if cs_norm in cpd_by_norm:
        carbon_map[cs] = cpd_by_norm[cs_norm]
        mapped_count += 1
        continue
    
    carbon_map[cs] = None
    unmapped.append(cs)

print(f'Carbon source mapping: {mapped_count}/{len(carbon_sources)} mapped')
print(f'Unmapped ({len(unmapped)}):')
for u in sorted(unmapped):
    print(f'  {u}')

Unique carbon sources to map: 109

Carbon source mapping: 75/109 mapped
Unmapped (31):
  (+)-Arabinogalactan
  1,4-B-D-Galactobiose
  2-Deoxy-D-ribonic acid lithium salt
  2-Deoxyadenosine 5-monophosphate
  2-Deoxyadenosine monohydrate
  4-Hydroxybenzoic Acid
  6-O-Acetyl-D-glucose
  Agro_defined_trehalose
  Amylose from potato
  Chondroitin sulfate A sodium salt from bovine trachea
  D-Leucrose
  D-Salicin
  Fructooligosaccharides (FOS)
  Gly-DL-Asp
  Glycolic Acid
  Heparin sodium salt from porcine intestinal mucosa
  Hyaluronic acid sodium salt from Streptococcus equi
  Isomaltose
  L-Citrulline
  L-Ornithine
  Laminaribiose
  Levan - from Erwinia herbicola
  Maltitol
  Mannan from Saccharomyces cerevisiae
  Red Arabinan from sugar-beet
  Rhamnogalacturonan - from potato
  Stachyose - 70%
  Supernatant; Agrobacterium rhizogenes K599 grown in Agro_defined_trehalose (~7.5 mM 3-keto-trehalose)
  amylopectin from maize
  dextran, Mw ~200,000
  polygalacturonic acid

# Verify compound mappings — correct any remaining fuzzy-match errors
corrections_applied = 0
for cs_name, cpd_id in list(carbon_map.items()):
    if cpd_id == 'cpd03601':
        carbon_map[cs_name] = 'cpd30716'
        corrections_applied += 1
    elif cpd_id == 'cpd30649':
        carbon_map[cs_name] = 'cpd00211'
        corrections_applied += 1
    elif cpd_id == 'cpd30710':
        carbon_map[cs_name] = 'cpd00141'
        corrections_applied += 1
    elif cpd_id == 'cpd26836':
        carbon_map[cs_name] = 'cpd00079'
        corrections_applied += 1

if corrections_applied:
    print(f"Corrected {corrections_applied} compound mappings:")
    print(f"  cpd03601 (Stizolobinate) → cpd30716 (D-Raffinose pentahydrate)")
    print(f"  cpd30649 (sodium butyrate salt) → cpd00211 (Butyrate)")
    print(f"  cpd30710 (Sodium propionate salt) → cpd00141 (Propionate)")
    print(f"  cpd26836 (G6P duplicate) → cpd00079 (D-glucose-6-phosphate)")
print(f"Mapped carbon sources: {sum(1 for v in carbon_map.values() if v is not None)}/{len(carbon_map)}")

Mapped carbon sources: 75/109

def parse_ms_equation(equation, rxn_id):
    """Parse a ModelSEED equation string into metabolite:coefficient dict.
    
    ModelSEED format: '(1) cpd00001[0] + (1) cpd00002[0] <=> (1) cpd00003[0]'
    [0] = cytoplasm, [1] = extracellular
    """
    if pd.isna(equation) or not equation.strip():
        return {}
    
    # Split by reaction arrow
    for arrow in [' <=> ', ' => ', ' <= ', ' -> ', ' <- ']:
        if arrow in equation:
            left, right = equation.split(arrow, 1)
            break
    else:
        return {}
    
    metabolites = {}
    # Parse left side (reactants, negative coefficients)
    for part in left.split(' + '):
        part = part.strip()
        if not part:
            continue
        m = re.match(r'\(([\d.]+)\)\s+(cpd\d+)\[(\d)\]', part)
        if m:
            coeff = -float(m.group(1))
            cpd_id = m.group(2)
            comp = 'c' if m.group(3) == '0' else 'e'
            met_id = f'{cpd_id}_{comp}'
            metabolites[met_id] = coeff
    
    # Parse right side (products, positive coefficients)
    for part in right.split(' + '):
        part = part.strip()
        if not part:
            continue
        m = re.match(r'\(([\d.]+)\)\s+(cpd\d+)\[(\d)\]', part)
        if m:
            coeff = float(m.group(1))
            cpd_id = m.group(2)
            comp = 'c' if m.group(3) == '0' else 'e'
            met_id = f'{cpd_id}_{comp}'
            metabolites[met_id] = metabolites.get(met_id, 0) + coeff
    
    return metabolites

# Test with a sample reaction
test_eq = ms_rxns.iloc[0]['equation']
print(f'Test equation: {test_eq}')
print(f'Parsed: {parse_ms_equation(test_eq, "test")}')

Test equation: (1) cpd00001[0] + (1) cpd00012[0] <=> (2) cpd00009[0] + (1) cpd00067[0]
Parsed: {'cpd00001_c': -1.0, 'cpd00012_c': -1.0, 'cpd00009_c': 2.0, 'cpd00067_c': 1.0}

# Pre-parse all ModelSEED reactions we might need
# Get the set of all reaction IDs we'll use
all_ec_set = set()
for ecs in all_ecs.values():
    all_ec_set |= ecs

needed_rxn_ids = set(ec_to_rxn[ec_to_rxn['ec'].isin(all_ec_set)]['rxn_id'])
# Also include all template reactions (for core metabolism + biomass)
tmpl_rxn_ids = set(tmpl_rxns['id'])
all_needed_rxns = needed_rxn_ids | tmpl_rxn_ids

print(f'Reaction IDs from organism EC annotations: {len(needed_rxn_ids)}')
print(f'Template reaction IDs: {len(tmpl_rxn_ids)}')
print(f'Total unique reaction IDs needed: {len(all_needed_rxns)}')

# Parse equations for all needed reactions
rxn_equations = {}
rxn_reversibility = {}
ms_rxns_needed = ms_rxns[ms_rxns['id'].isin(all_needed_rxns)]

for _, row in ms_rxns_needed.iterrows():
    mets = parse_ms_equation(row['equation'], row['id'])
    if mets:
        rxn_equations[row['id']] = mets
        rxn_reversibility[row['id']] = row.get('reversibility', '=')

print(f'Successfully parsed equations: {len(rxn_equations)}')

Reaction IDs from organism EC annotations: 11623
Template reaction IDs: 8671
Total unique reaction IDs needed: 17150

Successfully parsed equations: 17141

# Build models using ModelSEEDpy's MSBuilder
from modelseedpy.core.msbuilder import MSBuilder
from modelseedpy.core.mstemplate import MSTemplateBuilder
from modelseedpy.helpers import get_template

# Load GramNeg template (all 14 organisms are Gram-negative)
template_data = get_template('template_gram_neg')
template = MSTemplateBuilder.from_dict(template_data).build()
print(f"Template: {template.name}")
print(f"  Reactions: {len(template.reactions)}")
print(f"  Compounds: {len(template.compounds)}")
print(f"  Biomasses: {len(template.biomasses)}")
print(f"  Complexes: {len(template.complexes)}")
print(f"  Roles: {len(template.roles)}")

Template: GramNegative.modeltemplate
  Reactions: 9434
  Compounds: 6847
  Biomasses: 1
  Complexes: 9197
  Roles: 20389

# Build models for all 14 organisms using MSBuilder
models = {}
model_stats = []

for i, (_, row) in enumerate(manifest.iterrows()):
    orgId = row['orgId']
    
    if orgId not in genomes:
        print(f"  [{i+1}/{len(manifest)}] {orgId}: SKIPPED (no RAST annotation)")
        continue
    
    genome = genomes[orgId]
    
    builder = MSBuilder(genome, template=template, ontology_term='RAST')
    model = builder.build(
        orgId,
        index='0',
        annotate_with_rast=False,  # already annotated in previous cell
        biomass_gc=0.5,
        allow_all_non_grp_reactions=True
    )
    
    # Test baseline growth on complete media (all exchanges open)
    try:
        sol = model.optimize()
        growth = sol.objective_value if sol.status == 'optimal' else 0.0
    except Exception:
        growth = 0.0
    
    models[orgId] = model
    
    n_with_gpr = sum(1 for r in model.reactions if r.gene_reaction_rule)
    
    model_stats.append({
        'orgId': orgId,
        'n_reactions': len(model.reactions),
        'n_metabolites': len(model.metabolites),
        'n_genes': len(model.genes),
        'n_with_gpr': n_with_gpr,
        'n_exchanges': sum(1 for r in model.reactions if r.id.startswith('EX_')),
        'baseline_growth': round(growth, 4)
    })
    
    print(f"  [{i+1}/{len(manifest)}] {orgId}: {len(model.reactions)} rxns, "
          f"{len(model.metabolites)} mets, {len(model.genes)} genes, "
          f"{n_with_gpr} with GPR, growth={growth:.4f}")

model_stats_df = pd.DataFrame(model_stats)
print(f"\nBuilt {len(models)} models")
print(f"Mean reactions: {model_stats_df['n_reactions'].mean():.0f}")
print(f"Mean metabolites: {model_stats_df['n_metabolites'].mean():.0f}")
print(f"Mean genes: {model_stats_df['n_genes'].mean():.0f}")
print(f"Mean with GPR: {model_stats_df['n_with_gpr'].mean():.0f}")
print(f"Models with growth > 0: {(model_stats_df['baseline_growth'] > 0).sum()}")

  [1/14] Btheta: 989 rxns, 1010 mets, 769 genes, 903 with GPR, growth=0.0000

  [2/14] Koxy: 1789 rxns, 1492 mets, 1637 genes, 1604 with GPR, growth=0.0000

  [3/14] Smeli: 1309 rxns, 1243 mets, 1429 genes, 1216 with GPR, growth=0.0000

  [4/14] Phaeo: 1188 rxns, 1172 mets, 1044 genes, 1105 with GPR, growth=0.0000

  [5/14] Keio: 1739 rxns, 1451 mets, 1302 genes, 1545 with GPR, growth=0.0000

  [6/14] HerbieS: 1312 rxns, 1264 mets, 1178 genes, 1195 with GPR, growth=0.0000

  [7/14] Cup4G11: 1424 rxns, 1343 mets, 1562 genes, 1290 with GPR, growth=0.0000

  [8/14] Dino: 1216 rxns, 1196 mets, 1017 genes, 1125 with GPR, growth=0.0000

  [9/14] Marino: 1211 rxns, 1151 mets, 953 genes, 1104 with GPR, growth=0.0000

  [10/14] MR1: 1256 rxns, 1140 mets, 894 genes, 1156 with GPR, growth=0.0000

# Stage 1: Gapfill on complete media
from modelseedpy.core.msgapfill import MSGapfill
from modelseedpy.core.msmedia import MSMedia

complete_media = MSMedia.from_dict({})
complete_media.id = "Complete"
complete_media.name = "Complete"

complete_gf_stats = []
complete_gf_reactions = {}

for orgId, model in models.items():
    gapfiller = MSGapfill(
        model,
        default_gapfill_templates=[template],
        default_target="bio1",
        minimum_obj=0.01,
    )
    
    solution = gapfiller.run_gapfilling(media=complete_media, target="bio1")
    
    if solution:
        gapfiller.integrate_gapfill_solution(solution)
        new_rxns = list(solution.get("new", {}).keys())
        reversed_rxns = list(solution.get("reversed", {}).keys())
        complete_gf_reactions[orgId] = {
            'new': dict(solution.get("new", {})),
            'reversed': dict(solution.get("reversed", {}))
        }
    else:
        new_rxns = []
        reversed_rxns = []
        complete_gf_reactions[orgId] = {'new': {}, 'reversed': {}}
    
    sol = model.optimize()
    growth = sol.objective_value if sol.status == 'optimal' else 0.0
    
    complete_gf_stats.append({
        'orgId': orgId,
        'n_new': len(new_rxns),
        'n_reversed': len(reversed_rxns),
        'n_rxns_after': len(model.reactions),
        'n_exchanges_after': sum(1 for r in model.reactions if r.id.startswith('EX_')),
        'growth_complete': round(growth, 4)
    })
    
    print(f"  {orgId}: +{len(new_rxns)} new, {len(reversed_rxns)} reversed, "
          f"{len(model.reactions)} total rxns, growth={growth:.4f}")

gf_stats_df = pd.DataFrame(complete_gf_stats)
print(f"\nComplete media gapfill summary:")
print(f"  Models with growth > 0: {(gf_stats_df['growth_complete'] > 0).sum()}/{len(gf_stats_df)}")
print(f"  Mean new reactions: {gf_stats_df['n_new'].mean():.1f}")
print(f"  Mean exchanges after: {gf_stats_df['n_exchanges_after'].mean():.0f}")

  Btheta: +81 new, 0 reversed, 1070 total rxns, growth=0.0000

  Koxy: +11 new, 0 reversed, 1800 total rxns, growth=127.0807

  Smeli: +54 new, 0 reversed, 1363 total rxns, growth=42.8327

  Phaeo: +62 new, 0 reversed, 1250 total rxns, growth=37.7815

  Keio: +5 new, 0 reversed, 1744 total rxns, growth=122.9214

  HerbieS: +44 new, 0 reversed, 1356 total rxns, growth=22.5168

  Cup4G11: +50 new, 0 reversed, 1474 total rxns, growth=46.1738

  Dino: +62 new, 0 reversed, 1278 total rxns, growth=0.0000

  Marino: +55 new, 0 reversed, 1266 total rxns, growth=23.7323

  MR1: +31 new, 0 reversed, 1287 total rxns, growth=53.3576

def run_fba_carbon_source(model, cpd_id, uptake_rate=10.0):
    """Run FBA with a specific carbon source.
    
    Sets minimal media: closes all carbon exchange reactions,
    then opens the specified carbon source.
    Returns growth rate or 0 if infeasible.
    
    ModelSEEDpy models use _e0 compartment suffix (e.g., cpd00027_e0).
    """
    with model:  # context manager preserves original bounds
        # Define minimal inorganic media - always available
        always_open_cpds = {
            'cpd00001',  # H2O
            'cpd00009',  # Phosphate
            'cpd00013',  # NH3
            'cpd00048',  # Sulfate
            'cpd00007',  # O2 (aerobic)
            'cpd00011',  # CO2
            'cpd00067',  # H+
            'cpd00099',  # Cl-
            'cpd00058',  # Cu2+
            'cpd00034',  # Zn2+
            'cpd10516',  # fe3
            'cpd00030',  # Mn2+
            'cpd00254',  # Mg
            'cpd00063',  # Ca2+
            'cpd00205',  # K+
            'cpd00971',  # Na+
            'cpd00149',  # Co2+
            'cpd10515',  # Fe2+
            'cpd00528',  # Ni2+
        }
        
        # Close all exchange reactions first (set lower_bound = 0, no uptake)
        for rxn in model.reactions:
            if rxn.id.startswith('EX_'):
                met_id = rxn.id[3:]  # strip 'EX_' prefix
                cpd = met_id.split('_')[0]  # get just cpd ID
                if cpd in always_open_cpds:
                    rxn.lower_bound = -1000
                else:
                    rxn.lower_bound = 0
                rxn.upper_bound = 1000
        
        # Open the carbon source exchange (try both _e0 and _e formats)
        ex_id_e0 = f'EX_{cpd_id}_e0'
        ex_id_e = f'EX_{cpd_id}_e'
        
        ex_found = False
        for ex_id in [ex_id_e0, ex_id_e]:
            if ex_id in model.reactions:
                model.reactions.get_by_id(ex_id).lower_bound = -uptake_rate
                ex_found = True
                break
        
        if not ex_found:
            return 0.0, 'no_exchange'
        
        # Run FBA
        try:
            sol = model.optimize()
            if sol.status == 'optimal' and sol.objective_value > 1e-6:
                return round(sol.objective_value, 6), 'optimal'
            else:
                return 0.0, sol.status
        except Exception as e:
            return 0.0, f'error:{str(e)[:50]}'

print('FBA runner function defined.')

FBA runner function defined.

# Add missing transport and exchange reactions for carbon source compounds.
# MSBuilder only adds reactions with GPR from RAST annotation. Transporters are
# poorly annotated, so most carbon sources lack exchange/transport reactions.
# Without these, the gapfiller cannot route substrate into the model.
from cobra import Metabolite, Reaction

carbon_cpd_ids = set(cpd_id for cpd_id in carbon_map.values() if cpd_id is not None)
transport_stats = []

for orgId, model in models.items():
    n_added_ex = 0
    n_added_transport = 0
    
    for cpd_id in carbon_cpd_ids:
        met_c0_id = f'{cpd_id}_c0'
        met_e0_id = f'{cpd_id}_e0'
        ex_id = f'EX_{cpd_id}_e0'
        transport_id = f'transport_{cpd_id}'
        
        # Skip if exchange already exists
        if ex_id in model.reactions:
            continue
        
        # Create extracellular metabolite if missing
        if met_e0_id not in model.metabolites:
            # Get name from cytoplasmic form if it exists
            name = cpd_id
            if met_c0_id in model.metabolites:
                name = model.metabolites.get_by_id(met_c0_id).name or cpd_id
            met_e0 = Metabolite(met_e0_id, name=f'{name} [e0]', compartment='e0')
            model.add_metabolites([met_e0])
        
        met_e0 = model.metabolites.get_by_id(met_e0_id)
        
        # Add exchange reaction
        ex_rxn = Reaction(ex_id, name=f'Exchange for {met_e0.name}',
                          lower_bound=-1000, upper_bound=1000)
        ex_rxn.add_metabolites({met_e0: -1})
        model.add_reactions([ex_rxn])
        n_added_ex += 1
        
        # Add simple diffusion transport if no transport exists
        if met_c0_id in model.metabolites:
            met_c0 = model.metabolites.get_by_id(met_c0_id)
            # Check if any transport reaction already connects e0 and c0
            c_rxns = set(r.id for r in met_c0.reactions)
            e_rxns = set(r.id for r in met_e0.reactions)
            has_transport = bool(c_rxns & e_rxns - {ex_id})
            
            if not has_transport and transport_id not in model.reactions:
                tr_rxn = Reaction(transport_id, name=f'Diffusion transport {cpd_id}',
                                  lower_bound=-1000, upper_bound=1000)
                tr_rxn.add_metabolites({met_e0: -1, met_c0: 1})
                model.add_reactions([tr_rxn])
                n_added_transport += 1
    
    transport_stats.append({
        'orgId': orgId, 'exchanges_added': n_added_ex, 'transports_added': n_added_transport,
        'total_rxns': len(model.reactions)
    })
    print(f"  {orgId}: +{n_added_ex} exchanges, +{n_added_transport} transports → {len(model.reactions)} rxns")

ts_df = pd.DataFrame(transport_stats)
print(f"\nMean exchanges added: {ts_df['exchanges_added'].mean():.1f}")
print(f"Mean transports added: {ts_df['transports_added'].mean():.1f}")

  Btheta: +58 exchanges, +35 transports → 1163 rxns
  Koxy: +20 exchanges, +6 transports → 1826 rxns
  Smeli: +62 exchanges, +44 transports → 1469 rxns
  Phaeo: +56 exchanges, +33 transports → 1339 rxns
  Keio: +19 exchanges, +5 transports → 1768 rxns
  HerbieS: +46 exchanges, +31 transports → 1433 rxns
  Cup4G11: +42 exchanges, +21 transports → 1537 rxns
  Dino: +49 exchanges, +29 transports → 1356 rxns
  Marino: +50 exchanges, +24 transports → 1340 rxns
  MR1: +47 exchanges, +22 transports → 1356 rxns

  azobra: +51 exchanges, +32 transports → 1375 rxns
  PV4: +46 exchanges, +22 transports → 1373 rxns
  Korea: +55 exchanges, +34 transports → 1160 rxns
  Dyella79: +42 exchanges, +25 transports → 1321 rxns

Mean exchanges added: 45.9
Mean transports added: 25.9

# Stage 2: Gapfill on each carbon source media
# Only gapfill for carbon sources where the organism is observed to grow
# but the model (after complete-media gapfill) still can't grow

observed_growth_set = set(zip(
    fb_exps['orgId'], fb_exps['condition_1']
))

base_media_cpds = {
    "cpd00001": 1000,  # H2O
    "cpd00067": 1000,  # H+
    "cpd00009": 100,   # Phosphate
    "cpd00013": 100,   # NH3
    "cpd00048": 100,   # Sulfate
    "cpd00007": 100,   # O2
    "cpd00011": 100,   # CO2
    "cpd00099": 100,   # Cl-
    "cpd00058": 100,   # Cu2+
    "cpd00034": 100,   # Zn2+
    "cpd10516": 100,   # Fe3+
    "cpd00030": 100,   # Mn2+
    "cpd00254": 100,   # Mg
    "cpd00063": 100,   # Ca2+
    "cpd00205": 100,   # K+
    "cpd00971": 100,   # Na+
    "cpd00149": 100,   # Co2+
    "cpd10515": 100,   # Fe2+
    "cpd00528": 100,   # Ni2+
}

carbon_panel = pd.read_csv(f'{DATA_DIR}/carbon_source_panel.tsv', sep='\t')
top_carbons = carbon_panel[carbon_panel['n_organisms'] >= 3]['carbon_source'].tolist()

cs_gapfill_results = []
cs_gapfill_reactions = {}

for orgId, model in models.items():
    n_already = 0
    n_gapfilled = 0
    n_failed = 0
    
    for cs in top_carbons:
        cpd_id = carbon_map.get(cs)
        if cpd_id is None:
            continue
        
        observed = (orgId, cs) in observed_growth_set
        
        # Test if model already grows on this carbon source
        growth_before, status = run_fba_carbon_source(model, cpd_id)
        
        if growth_before > 1e-6:
            n_already += 1
            cs_gapfill_results.append({
                'orgId': orgId, 'carbon_source': cs, 'cpd_id': cpd_id,
                'observed_growth': observed, 'status': 'already_grows',
                'growth_after_gf': round(growth_before, 4),
                'n_gapfilled_rxns': 0, 'gapfilled_rxns': ''
            })
            continue
        
        if not observed:
            cs_gapfill_results.append({
                'orgId': orgId, 'carbon_source': cs, 'cpd_id': cpd_id,
                'observed_growth': False, 'status': 'not_observed',
                'growth_after_gf': 0.0,
                'n_gapfilled_rxns': 0, 'gapfilled_rxns': ''
            })
            continue
        
        # Gapfill for this carbon source
        cs_media_cpds = dict(base_media_cpds)
        cs_media_cpds[cpd_id] = 10
        cs_media = MSMedia.from_dict(cs_media_cpds)
        cs_media.id = cs
        cs_media.name = cs
        
        gapfiller = MSGapfill(
            model,
            default_gapfill_templates=[template],
            default_target="bio1",
            minimum_obj=0.01,
        )
        
        try:
            solution = gapfiller.run_gapfilling(media=cs_media, target="bio1")
        except Exception as e:
            solution = None
            print(f"  {orgId}/{cs}: gapfill error: {str(e)[:80]}")
        
        if solution:
            gapfiller.integrate_gapfill_solution(solution)
            rxns_new = list(solution.get("new", {}).keys())
            rxns_rev = list(solution.get("reversed", {}).keys())
            all_rxns = rxns_new + rxns_rev
            
            key = f"{orgId}:{cs}"
            cs_gapfill_reactions[key] = {
                'new': dict(solution.get("new", {})),
                'reversed': dict(solution.get("reversed", {}))
            }
            
            growth_after, _ = run_fba_carbon_source(model, cpd_id)
            n_gapfilled += 1
            
            cs_gapfill_results.append({
                'orgId': orgId, 'carbon_source': cs, 'cpd_id': cpd_id,
                'observed_growth': True, 'status': 'gapfilled',
                'growth_after_gf': round(growth_after, 4),
                'n_gapfilled_rxns': len(all_rxns),
                'gapfilled_rxns': ';'.join(all_rxns)
            })
        else:
            n_failed += 1
            cs_gapfill_results.append({
                'orgId': orgId, 'carbon_source': cs, 'cpd_id': cpd_id,
                'observed_growth': True, 'status': 'gapfill_failed',
                'growth_after_gf': 0.0,
                'n_gapfilled_rxns': 0, 'gapfilled_rxns': ''
            })
    
    print(f"  {orgId}: {n_already} already grow, {n_gapfilled} gapfilled, "
          f"{n_failed} failed, {len(model.reactions)} total rxns")

cs_gf_df = pd.DataFrame(cs_gapfill_results)
print(f"\nCarbon source gapfill summary:")
print(f"  Total conditions tested: {len(cs_gf_df)}")
print(cs_gf_df['status'].value_counts().to_string())

# Save gapfill results
cs_gf_df.to_csv(f'{DATA_DIR}/carbon_source_gapfill_results.tsv', sep='\t', index=False)
print(f"\nSaved to data/carbon_source_gapfill_results.tsv")

  Btheta: 33 already grow, 2 gapfilled, 0 failed, 1178 total rxns

  Koxy: 40 already grow, 1 gapfilled, 0 failed, 1833 total rxns

  Smeli: 40 already grow, 1 gapfilled, 0 failed, 1485 total rxns

  Phaeo: 31 already grow, 7 gapfilled, 0 failed, 1362 total rxns

  Keio: 40 already grow, 1 gapfilled, 0 failed, 1776 total rxns

  HerbieS: 36 already grow, 5 gapfilled, 0 failed, 1451 total rxns

  Cup4G11: 26 already grow, 3 gapfilled, 0 failed, 1551 total rxns

  Dino: 39 already grow, 2 gapfilled, 0 failed, 1379 total rxns

  Marino: 40 already grow, 1 gapfilled, 0 failed, 1353 total rxns

  MR1: 39 already grow, 2 gapfilled, 0 failed, 1365 total rxns

# Save gapfilled models as SBML
for orgId, model in models.items():
    cobra.io.write_sbml_model(model, f'{MODEL_DIR}/{orgId}.xml')

print(f'Saved {len(models)} gapfilled SBML models to {MODEL_DIR}/')

# Save complete-media gapfill stats
gf_stats_df.to_csv(f'{DATA_DIR}/complete_gapfill_stats.tsv', sep='\t', index=False)

# Save gapfill reaction details
import json as _json
with open(f'{DATA_DIR}/gapfill_reactions.json', 'w') as f:
    _json.dump({
        'complete_media': complete_gf_reactions,
        'carbon_source': cs_gapfill_reactions
    }, f, indent=2)
print(f'Saved gapfill reaction details to data/gapfill_reactions.json')

# Updated model stats after gapfilling
gf_model_stats = []
for orgId, model in models.items():
    n_with_gpr = sum(1 for r in model.reactions if r.gene_reaction_rule)
    sol = model.optimize()
    growth = sol.objective_value if sol.status == 'optimal' else 0.0
    gf_model_stats.append({
        'orgId': orgId,
        'n_reactions': len(model.reactions),
        'n_metabolites': len(model.metabolites),
        'n_genes': len(model.genes),
        'n_with_gpr': n_with_gpr,
        'n_exchanges': sum(1 for r in model.reactions if r.id.startswith('EX_')),
        'growth_complete_media': round(growth, 4)
    })
gf_model_stats_df = pd.DataFrame(gf_model_stats)
print(f'\nPost-gapfill model stats:')
print(gf_model_stats_df.to_string())

Saved 14 gapfilled SBML models to ../data/models/
Saved gapfill reaction details to data/gapfill_reactions.json

Post-gapfill model stats:
       orgId  n_reactions  n_metabolites  n_genes  n_with_gpr  n_exchanges  growth_complete_media
0     Btheta         1178           1193      769         903          125                 0.4216
1       Koxy         1833           1597     1637        1604          181                10.8267
2      Smeli         1485           1428     1429        1216          138                 3.2989
3      Phaeo         1362           1352     1044        1105          123                11.6254
4       Keio         1776           1541     1302        1545          190                 3.2933
5    HerbieS         1451           1428     1178        1195          142                 4.1738
6    Cup4G11         1551           1481     1562        1290          157                11.5203
7       Dino         1379           1373     1017        1125          126                 1.6876
8     Marino         1353           1304      953        1104          139                 3.2825
9        MR1         1365           1282      894        1156          127                 2.5376
10    azobra         1396           1344     1143        1109          147                 3.1409
11       PV4         1381           1305      883        1172          127                 2.5376
12     Korea         1182           1179      781         906          112                11.6254
13  Dyella79         1342           1306      829        1095          131                11.4943

# Run FBA for all organism × carbon source pairs
fba_results = []

# Build observed growth matrix from FB experiments
# If organism has an experiment for a carbon source, it grew (FB only sequences successful cultures)
observed_growth = fb_exps.groupby(['orgId', 'condition_1']).size().reset_index(name='n_experiments')
observed_set = set(zip(observed_growth['orgId'], observed_growth['condition_1']))

# Get the top carbon sources (tested in >= 3 organisms)
carbon_panel = pd.read_csv(f'{DATA_DIR}/carbon_source_panel.tsv', sep='\t')
top_carbons = carbon_panel[carbon_panel['n_organisms'] >= 3]['carbon_source'].tolist()
print(f'Testing {len(top_carbons)} carbon sources across {len(models)} organisms')

for orgId, model in models.items():
    n_tested = 0
    n_predicted_growth = 0
    
    for cs in top_carbons:
        cpd_id = carbon_map.get(cs)
        if cpd_id is None:
            continue  # unmappable carbon source
        
        # Run FBA
        growth_rate, status = run_fba_carbon_source(model, cpd_id)
        
        # Observed: did FB have an experiment for this organism + carbon source?
        observed = (orgId, cs) in observed_set
        predicted = growth_rate > 1e-6
        
        fba_results.append({
            'orgId': orgId,
            'carbon_source': cs,
            'cpd_id': cpd_id,
            'growth_rate': growth_rate,
            'fba_status': status,
            'predicted_growth': predicted,
            'observed_growth': observed,
            'correct': predicted == observed
        })
        
        n_tested += 1
        if predicted:
            n_predicted_growth += 1
    
    print(f'{orgId}: {n_predicted_growth}/{n_tested} predicted growth')

fba_df = pd.DataFrame(fba_results)
print(f'\nTotal FBA results: {len(fba_df)}')

Testing 42 carbon sources across 14 organisms

Btheta: 41/41 predicted growth

Koxy: 41/41 predicted growth

Smeli: 41/41 predicted growth

Phaeo: 41/41 predicted growth

Keio: 41/41 predicted growth

HerbieS: 41/41 predicted growth

Cup4G11: 41/41 predicted growth

Dino: 41/41 predicted growth

Marino: 41/41 predicted growth

# Overall confusion matrix
tp = fba_df[(fba_df['predicted_growth']) & (fba_df['observed_growth'])].shape[0]
fn = fba_df[(~fba_df['predicted_growth']) & (fba_df['observed_growth'])].shape[0]
fp = fba_df[(fba_df['predicted_growth']) & (~fba_df['observed_growth'])].shape[0]
tn = fba_df[(~fba_df['predicted_growth']) & (~fba_df['observed_growth'])].shape[0]

print('Overall Confusion Matrix:')
print(f'                     Observed Growth  Observed No-Growth*')
print(f'Predicted Growth:     TP = {tp:>5d}       FP = {fp:>5d}')
print(f'Predicted No-Growth:  FN = {fn:>5d}       TN = {tn:>5d}')
print(f'\n*No-growth = no FB experiment for this condition (weak negative evidence)')

total = tp + fn + fp + tn
accuracy = (tp + tn) / total if total > 0 else 0
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0

print(f'\nAccuracy:    {accuracy:.3f}')
print(f'Sensitivity: {sensitivity:.3f} (TP / (TP + FN)) — how many observed-growth cases we predict')
print(f'Specificity: {specificity:.3f} (TN / (TN + FP))')
print(f'Precision:   {precision:.3f} (TP / (TP + FP))')

print(f'\nFalse negatives (FN = {fn}): These are the ANNOTATION GAP CANDIDATES')
print(f'  Organism grew on this carbon source but model predicts no growth')
print(f'  → Missing reactions/genes needed for gapfilling in NB03')

Overall Confusion Matrix:
                     Observed Growth  Observed No-Growth*
Predicted Growth:     TP =   244       FP =   330
Predicted No-Growth:  FN =     0       TN =     0

*No-growth = no FB experiment for this condition (weak negative evidence)

Accuracy:    0.425
Sensitivity: 1.000 (TP / (TP + FN)) — how many observed-growth cases we predict
Specificity: 0.000 (TN / (TN + FP))
Precision:   0.425 (TP / (TP + FP))

False negatives (FN = 0): These are the ANNOTATION GAP CANDIDATES
  Organism grew on this carbon source but model predicts no growth
  → Missing reactions/genes needed for gapfilling in NB03

# Per-organism accuracy
org_accuracy = fba_df.groupby('orgId').agg(
    n_tested=('correct', 'count'),
    n_correct=('correct', 'sum'),
    n_observed_growth=('observed_growth', 'sum'),
    n_predicted_growth=('predicted_growth', 'sum'),
    n_fn=('correct', lambda x: ((~fba_df.loc[x.index, 'predicted_growth']) & 
                                 (fba_df.loc[x.index, 'observed_growth'])).sum())
).reset_index()
org_accuracy['accuracy'] = (org_accuracy['n_correct'] / org_accuracy['n_tested']).round(3)
org_accuracy['sensitivity'] = (org_accuracy['n_predicted_growth'].clip(lower=0) / 
                                org_accuracy['n_observed_growth'].clip(lower=1)).clip(upper=1).round(3)
org_accuracy = org_accuracy.sort_values('n_fn', ascending=False)

print('Per-organism FBA accuracy:')
print(org_accuracy.to_string())
print(f'\nMean accuracy: {org_accuracy["accuracy"].mean():.3f}')
print(f'Mean sensitivity: {org_accuracy["sensitivity"].mean():.3f}')

Per-organism FBA accuracy:
       orgId  n_tested  n_correct  n_observed_growth  n_predicted_growth  n_fn  accuracy  sensitivity
0     Btheta        41         16                 16                  41     0     0.390          1.0
1    Cup4G11        41         18                 18                  41     0     0.439          1.0
2       Dino        41         17                 17                  41     0     0.415          1.0
3   Dyella79        41         10                 10                  41     0     0.244          1.0
4    HerbieS        41         24                 24                  41     0     0.585          1.0
5       Keio        41         23                 23                  41     0     0.561          1.0
6      Korea        41         10                 10                  41     0     0.244          1.0
7       Koxy        41         27                 27                  41     0     0.659          1.0
8        MR1        41          6                  6                  41     0     0.146          1.0
9     Marino        41         15                 15                  41     0     0.366          1.0
10       PV4        41         11                 11                  41     0     0.268          1.0
11     Phaeo        41         26                 26                  41     0     0.634          1.0
12     Smeli        41         26                 26                  41     0     0.634          1.0
13    azobra        41         15                 15                  41     0     0.366          1.0

Mean accuracy: 0.425
Mean sensitivity: 1.000

# Per-carbon-source accuracy
cs_accuracy = fba_df.groupby('carbon_source').agg(
    n_organisms=('orgId', 'nunique'),
    n_observed_growth=('observed_growth', 'sum'),
    n_predicted_growth=('predicted_growth', 'sum'),
    n_fn=('correct', lambda x: ((~fba_df.loc[x.index, 'predicted_growth']) & 
                                 (fba_df.loc[x.index, 'observed_growth'])).sum()),
    sensitivity=('correct', lambda x: (
        fba_df.loc[x.index, 'predicted_growth'] & fba_df.loc[x.index, 'observed_growth']
    ).sum() / max(fba_df.loc[x.index, 'observed_growth'].sum(), 1))
).reset_index().sort_values('n_fn', ascending=False)

print('Top carbon sources by false negatives (annotation gap candidates):')
print(cs_accuracy.head(20).to_string())

Top carbon sources by false negatives (annotation gap candidates):
                        carbon_source  n_organisms  n_observed_growth  n_predicted_growth  n_fn  sensitivity
0                        D-Cellobiose           14                  5                  14     0          1.0
1                          D-Fructose           14                  9                  14     0          1.0
2                         D-Galactose           14                  5                  14     0          1.0
3     D-Galacturonic Acid monohydrate           14                  3                  14     0          1.0
4         D-Gluconic Acid sodium salt           14                  5                  14     0          1.0
5         D-Glucosamine Hydrochloride           14                  4                  14     0          1.0
6                           D-Glucose           14                  9                  14     0          1.0
7   D-Glucose-6-Phosphate sodium salt           14                  3                  14     0          1.0
8               D-Maltose monohydrate           14                  8                  14     0          1.0
9                           D-Mannose           14                  5                  14     0          1.0
10           D-Raffinose pentahydrate           14                  4                  14     0          1.0
11                           D-Ribose           14                  5                  14     0          1.0
12                         D-Sorbitol           14                  5                  14     0          1.0
13              D-Trehalose dihydrate           14                  7                  14     0          1.0
14                           D-Xylose           14                  7                  14     0          1.0
15                            Ethanol           14                  3                  14     0          1.0
16                           Glycerol           14                  8                  14     0          1.0
17                          L-Alanine           14                  3                  14     0          1.0
18                        L-Arabinose           14                  3                  14     0          1.0
19                       L-Asparagine           14                  3                  14     0          1.0

# Extract false negative cases — these go to gapfilling
false_negatives = fba_df[
    (~fba_df['predicted_growth']) & (fba_df['observed_growth'])
].copy()

print(f'False negative cases (observed growth, predicted no-growth): {len(false_negatives)}')
print(f'  Organisms affected: {false_negatives["orgId"].nunique()}')
print(f'  Carbon sources affected: {false_negatives["carbon_source"].nunique()}')
print(f'\nTop organism × carbon source gaps:')
false_negatives[['orgId', 'carbon_source', 'cpd_id', 'fba_status']].head(20)

False negative cases (observed growth, predicted no-growth): 0
  Organisms affected: 0
  Carbon sources affected: 0

Top organism × carbon source gaps:

# Save FBA results
fba_df.to_csv(f'{DATA_DIR}/baseline_fba_results.tsv', sep='\t', index=False)
print(f'Saved FBA results: {len(fba_df)} rows')

# Save false negatives (gapfill targets)
false_negatives.to_csv(f'{DATA_DIR}/false_negative_targets.tsv', sep='\t', index=False)
print(f'Saved false negative targets: {len(false_negatives)} rows')

# Save model stats
model_stats_df.to_csv(f'{DATA_DIR}/model_stats.tsv', sep='\t', index=False)
print(f'Saved model stats: {len(model_stats_df)} rows')

# Save carbon source mapping
cs_map_df = pd.DataFrame([
    {'carbon_source': k, 'cpd_id': v} for k, v in carbon_map.items()
])
cs_map_df.to_csv(f'{DATA_DIR}/carbon_source_mapping.tsv', sep='\t', index=False)
print(f'Saved carbon source mapping: {len(cs_map_df)} rows')

print('\nAll NB02 outputs saved to data/')

Saved FBA results: 574 rows
Saved false negative targets: 0 rows
Saved model stats: 14 rows
Saved carbon source mapping: 109 rows

All NB02 outputs saved to data/

# Final summary
print('=' * 60)
print('NB02 SUMMARY: Model Building, Gapfilling, and FBA')
print('=' * 60)
print(f'Models built:              {len(models)}')
print(f'Mean reactions (pre-gf):    {model_stats_df["n_reactions"].mean():.0f}')
print(f'Mean reactions (post-gf):   {gf_model_stats_df["n_reactions"].mean():.0f}')
print(f'Mean metabolites (post-gf): {gf_model_stats_df["n_metabolites"].mean():.0f}')
print(f'Mean genes per model:       {gf_model_stats_df["n_genes"].mean():.0f}')
print(f'Mean rxns with GPR:         {gf_model_stats_df["n_with_gpr"].mean():.0f}')
print(f'Models with growth > 0:     {(gf_model_stats_df["growth_complete_media"] > 0).sum()}')
print(f'---')
print(f'Gapfilling:')
print(f'  Complete media avg new rxns: {gf_stats_df["n_new"].mean():.1f}')
gf_observed = cs_gf_df[cs_gf_df['observed_growth'] == True]
print(f'  Carbon source conditions:    {len(gf_observed)} observed-growth')
print(f'    Already grew:              {(gf_observed["status"] == "already_grows").sum()}')
print(f'    Successfully gapfilled:    {(gf_observed["status"] == "gapfilled").sum()}')
print(f'    Gapfill failed:            {(gf_observed["status"] == "gapfill_failed").sum()}')
print(f'---')
print(f'Post-gapfill FBA:')
print(f'  Tests run:               {len(fba_df)}')
print(f'  True positives (TP):     {tp}')
print(f'  False negatives (FN):    {fn}')
print(f'  Sensitivity:             {sensitivity:.3f}')
print(f'  Accuracy:                {accuracy:.3f}')
print(f'---')
print(f'Carbon sources mapped:     {mapped_count}/{len(carbon_sources)}')
print('=' * 60)

============================================================
NB02 SUMMARY: Model Building, Gapfilling, and FBA
============================================================
Models built:              14
Mean reactions (pre-gf):    1293
Mean reactions (post-gf):   1431
Mean metabolites (post-gf): 1365
Mean genes per model:       1102
Mean rxns with GPR:         1180
Models with growth > 0:     14
---
Gapfilling:
  Complete media avg new rxns: 50.1
  Carbon source conditions:    244 observed-growth
    Already grew:              206
    Successfully gapfilled:    38
    Gapfill failed:            0
---
Post-gapfill FBA:
  Tests run:               574
  True positives (TP):     244
  False negatives (FN):    0
  Sensitivity:             1.000
  Accuracy:                0.425
---
Carbon sources mapped:     75/109
============================================================

02 Model Building Fba

NB02: Model Building and Baseline FBA¶

Outputs¶

1. Load ModelSEED Biochemistry¶

2. Get EC Annotations from BERDL¶

2b. Download FB Protein Sequences and Annotate with RAST¶

3. Map Carbon Source Names to ModelSEED Compounds¶

4. Build Metabolic Models with ModelSEEDpy¶

4b. Gapfill Models¶

5. Run FBA Across Carbon Sources (Post-Gapfill)¶

6. Evaluate Predictions¶

7. Identify False-Negative Cases for Gapfilling¶

8. Save Outputs¶

	orgId	full_name	n_carbon_sources	has_pangenome
0	Btheta	Bacteroides thetaiotaomicron VPI-5482	47	True
1	Koxy	Klebsiella michiganensis M5al	41	True
2	Smeli	Sinorhizobium meliloti 1021	33	True
3	Phaeo	Phaeobacter inhibens BS107	32	True
4	Keio	Escherichia coli BW25113	28	True