spark = get_spark_session()
spark.sql('SELECT 1 AS ok').show()

import pandas as pd
import numpy as np
import os
import json
from collections import defaultdict, Counter

DATA_DIR = '../data'
FIG_DIR = '../figures'
os.makedirs(FIG_DIR, exist_ok=True)

FIT_THRESHOLD = -2.0
T_THRESHOLD = 4.0
FIT_MODERATE = -1.0
T_MODERATE = 3.0

print('Spark session ready.')

+---+
| ok|
+---+
|  1|
+---+

Spark session ready.

# Load NB02 outputs
gf_results = pd.read_csv(f'{DATA_DIR}/carbon_source_gapfill_results.tsv', sep='\t')
manifest = pd.read_csv(f'{DATA_DIR}/genome_manifest.tsv', sep='\t')
fb_exps = pd.read_csv(f'{DATA_DIR}/fb_carbon_experiments.tsv', sep='\t')

gf_cases = gf_results[gf_results['status'] == 'gapfilled'].copy()
print(f'Gapfilled cases: {len(gf_cases)}')
print(f'  Organisms: {gf_cases["orgId"].nunique()}')
print(f'  Carbon sources: {gf_cases["carbon_source"].nunique()}')
print()
print(gf_cases[['orgId', 'carbon_source', 'n_gapfilled_rxns']].to_string(index=False))

Gapfilled cases: 38
  Organisms: 14
  Carbon sources: 18

   orgId               carbon_source  n_gapfilled_rxns
  Btheta                  D-Fructose                13
  Btheta      N-Acetyl-D-Glucosamine                 3
    Koxy             Sodium pyruvate                 7
   Smeli             Sodium pyruvate                16
   Phaeo             Sodium pyruvate                13
   Phaeo                 L-Glutamine                 1
   Phaeo      N-Acetyl-D-Glucosamine                 1
   Phaeo       D-Trehalose dihydrate                 1
   Phaeo Trisodium citrate dihydrate                 2
   Phaeo           Potassium acetate                 1
   Phaeo                   D-Mannose                 4
    Keio             Sodium pyruvate                 8
 HerbieS             Sodium pyruvate                 8
 HerbieS                 L-Glutamine                 1
 HerbieS                   D-Glucose                 2
 HerbieS                    D-Xylose                 2
 HerbieS                    L-Fucose                 5
 Cup4G11             Sodium pyruvate                 7
 Cup4G11          Sodium D,L-Lactate                 1
 Cup4G11             Sodium butyrate                 6
    Dino             Sodium pyruvate                22
    Dino                 L-Glutamine                 1
  Marino             Sodium pyruvate                13
     MR1             Sodium pyruvate                 7
     MR1          Sodium D,L-Lactate                 2
  azobra             Sodium pyruvate                21
     PV4             Sodium pyruvate                 6
     PV4          Sodium D,L-Lactate                 2
   Korea                 L-Glutamine                13
   Korea                  D-Fructose                 1
   Korea       D-Maltose monohydrate                 1
   Korea                    D-Xylose                 2
   Korea    D-Raffinose pentahydrate                 5
Dyella79             Sodium pyruvate                10
Dyella79                  D-Fructose                 1
Dyella79                   L-Proline                 1
Dyella79                 D-Galactose                 4
Dyella79              a-Cyclodextrin                 5

# Parse gapfilled reactions into flat table
rxn_records = []
for _, row in gf_cases.iterrows():
    if pd.isna(row['gapfilled_rxns']):
        continue
    for rxn_id in row['gapfilled_rxns'].split(';'):
        rxn_id = rxn_id.strip()
        base_id = rxn_id.replace('_c0', '').replace('_e0', '')
        if base_id.startswith('EX_'):
            rxn_type = 'exchange'
        elif base_id.startswith('DM_'):
            rxn_type = 'demand'
        else:
            rxn_type = 'metabolic'
        rxn_records.append({
            'orgId': row['orgId'],
            'carbon_source': row['carbon_source'],
            'rxn_id_full': rxn_id,
            'rxn_id_base': base_id,
            'rxn_type': rxn_type,
        })

rxn_df = pd.DataFrame(rxn_records)
print(f'Total gapfilled reaction instances: {len(rxn_df)}')
print(f'  Unique base reaction IDs: {rxn_df["rxn_id_base"].nunique()}')
print(f'  By type:')
print(rxn_df['rxn_type'].value_counts().to_string())

Total gapfilled reaction instances: 219
  Unique base reaction IDs: 94
  By type:
rxn_type
metabolic    201
exchange      12
demand         6

# Join to ModelSEED biochemistry for EC numbers and reaction names
ms_rxns = pd.read_csv(f'{DATA_DIR}/modelseed_biochem/reactions.tsv', sep='\t',
                       low_memory=False)

rxn_detail = rxn_df.merge(
    ms_rxns[['id', 'name', 'ec_numbers', 'is_transport', 'aliases']].rename(
        columns={'id': 'rxn_id_base'}),
    on='rxn_id_base', how='left'
)

# Reclassify transport reactions from ModelSEED
rxn_detail.loc[rxn_detail['is_transport'].astype(str) == '1', 'rxn_type'] = 'transport'

# Separate enzymatic from infrastructure
enzymatic = rxn_detail[rxn_detail['rxn_type'] == 'metabolic'].copy()
infrastructure = rxn_detail[rxn_detail['rxn_type'] != 'metabolic']

# Explode multi-valued EC numbers (separated by | or ;)
ec_rows = []
for _, row in enzymatic.iterrows():
    if pd.notna(row['ec_numbers']) and str(row['ec_numbers']).strip():
        for ec in str(row['ec_numbers']).replace('|', ';').split(';'):
            ec = ec.strip()
            if ec:
                ec_rows.append({**row.to_dict(), 'ec_number': ec})
    else:
        ec_rows.append({**row.to_dict(), 'ec_number': None})

rxn_ec = pd.DataFrame(ec_rows)

print(f'Enzymatic reactions: {len(enzymatic)}')
print(f'  With EC: {enzymatic["ec_numbers"].notna().sum()}')
print(f'  Without EC: {enzymatic["ec_numbers"].isna().sum()}')
print(f'Infrastructure (excluded): {len(infrastructure)}')
print(f'Reaction x EC pairs: {len(rxn_ec)}')
print(f'  Unique EC numbers: {rxn_ec["ec_number"].dropna().nunique()}')

# Save reaction details
rxn_detail[['orgId', 'carbon_source', 'rxn_id_full', 'rxn_id_base',
            'rxn_type', 'name', 'ec_numbers', 'is_transport']].to_csv(
    f'{DATA_DIR}/gapfill_reaction_details.tsv', sep='\t', index=False)
print(f'\nSaved gapfill_reaction_details.tsv: {len(rxn_detail)} rows')

Enzymatic reactions: 201
  With EC: 151
  Without EC: 50
Infrastructure (excluded): 18
Reaction x EC pairs: 244
  Unique EC numbers: 84

Saved gapfill_reaction_details.tsv: 219 rows

# Strategy 1: KEGG three-table join for EC annotations
org_list = manifest['orgId'].tolist()
org_in = "','" .join(org_list)

kegg_ec_genes = spark.sql(f"""
    SELECT bk.orgId, bk.locusId,
           km.kgroup,
           ke.ecnum AS ec_number
    FROM kescience_fitnessbrowser.besthitkegg bk
    JOIN kescience_fitnessbrowser.keggmember km
        ON bk.keggOrg = km.keggOrg AND bk.keggId = km.keggId
    JOIN kescience_fitnessbrowser.kgroupec ke
        ON km.kgroup = ke.kgroup
    WHERE bk.orgId IN ('{org_in}')
""").toPandas()

print(f'KEGG EC-gene pairs: {len(kegg_ec_genes):,}')
print(f'  Unique genes: {kegg_ec_genes[["orgId","locusId"]].drop_duplicates().shape[0]:,}')
print(f'  Unique EC numbers: {kegg_ec_genes["ec_number"].nunique()}')

# Strategy 2: Extract EC from FB gene descriptions
import re

fb_genes = spark.sql(f"""
    SELECT orgId, locusId, gene, desc AS gene_desc, type
    FROM kescience_fitnessbrowser.gene
    WHERE orgId IN ('{org_in}')
      AND type = '1'
""").toPandas()

print(f'\nFB protein-coding genes: {len(fb_genes):,}')

# Parse EC from gene descriptions (pattern: EC X.X.X.X or (EC X.X.X.X))
ec_pattern = re.compile(r'(?:EC[:\s]*)?(\d+\.\d+\.\d+\.\d+)')
desc_ec_rows = []
for _, row in fb_genes.iterrows():
    if row['gene_desc'] and isinstance(row['gene_desc'], str):
        ecs = ec_pattern.findall(row['gene_desc'])
        for ec in ecs:
            desc_ec_rows.append({
                'orgId': row['orgId'],
                'locusId': row['locusId'],
                'ec_number': ec,
                'source': 'gene_desc'
            })

desc_ec_genes = pd.DataFrame(desc_ec_rows)
print(f'Gene-description EC pairs: {len(desc_ec_genes):,}')

# Combine KEGG + description ECs (deduplicate)
kegg_ec_genes['source'] = 'kegg'
all_ec_genes = pd.concat([
    kegg_ec_genes[['orgId', 'locusId', 'ec_number', 'source']],
    desc_ec_genes
], ignore_index=True).drop_duplicates(subset=['orgId', 'locusId', 'ec_number'])

print(f'\nCombined EC-gene pairs (deduplicated): {len(all_ec_genes):,}')
print(f'  From KEGG only: {(all_ec_genes["source"] == "kegg").sum():,}')
print(f'  From gene desc: {(all_ec_genes["source"] == "gene_desc").sum():,}')

KEGG EC-gene pairs: 16,318
  Unique genes: 15,419
  Unique EC numbers: 1316

FB protein-coding genes: 67,318

Gene-description EC pairs: 257

Combined EC-gene pairs (deduplicated): 16,360
  From KEGG only: 16,310
  From gene desc: 50

# Match gapfilled reaction ECs to FB genes in the same organism
target_ecs = rxn_ec[rxn_ec['ec_number'].notna()][
    ['orgId', 'ec_number', 'rxn_id_base', 'name', 'carbon_source']
].drop_duplicates()

ec_matches = target_ecs.merge(
    all_ec_genes[['orgId', 'locusId', 'ec_number', 'source']],
    on=['orgId', 'ec_number'],
    how='left'
)

matched = ec_matches[ec_matches['locusId'].notna()].copy()
unmatched_rxns = ec_matches[ec_matches['locusId'].isna()][
    ['orgId', 'rxn_id_base', 'name', 'ec_number', 'carbon_source']
].drop_duplicates()

n_rxn_with_match = matched[['orgId', 'rxn_id_base']].drop_duplicates().shape[0]
n_rxn_no_match = unmatched_rxns[['orgId', 'rxn_id_base']].drop_duplicates().shape[0]

print(f'Gapfilled reaction-EC targets: {len(target_ecs)}')
print(f'  Reactions with FB gene match: {n_rxn_with_match}')
print(f'  Reactions without FB gene match: {n_rxn_no_match}')
print(f'  Total gene-reaction matches: {len(matched)}')
print()

# Also check reactions with NO EC at all
no_ec_rxns = rxn_ec[rxn_ec['ec_number'].isna()][
    ['orgId', 'rxn_id_base', 'name', 'carbon_source']
].drop_duplicates()
print(f'Reactions with no EC number (dark reactions): {len(no_ec_rxns)}')
if len(no_ec_rxns) > 0:
    print(no_ec_rxns[['rxn_id_base', 'name']].drop_duplicates().to_string(index=False))

Gapfilled reaction-EC targets: 194
  Reactions with FB gene match: 51
  Reactions without FB gene match: 128
  Total gene-reaction matches: 107

Reactions with no EC number (dark reactions): 50
rxn_id_base                                                                  name
   rxn05229                                     NCAIR synthetase and NCAIR mutase
   rxn12376 transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)
   rxn09310                                          thiazole phosphate synthesis
   rxn12239                                4-hydroxy-benzyl-alcohol dehydrogenase
   rxn19346                                                   CYSTATHIONASE-RXN.c
   rxn08474                                                             RXN0-6.cp
   rxn11285                                                 uridine phosphorylase
   rxn10348                                                         alpha-amylase
   rxn05740                                     1,4-alpha-glucan-branching enzyme
   rxn10030                                       b-ketoacyl synthetase (n-C18:1)
   rxn25278                                           2-methylaconitate isomerase
   rxn09989                                    Maltodextrin glucosidase (dextrin)
   rxn00796                                                       biotin synthase
   rxn30633                                                          CYSTAGLY-RXN
   rxn30604                                                       ALDOSE1EPIM-RXN
   rxn30623                                                    BETAGALACTOSID-RXN

# Map gapfilled cases to FB experiment names
case_exps = fb_exps.merge(
    gf_cases[['orgId', 'carbon_source']],
    left_on=['orgId', 'condition_1'],
    right_on=['orgId', 'carbon_source'],
    how='inner'
)

print(f'Gapfilled cases with experiments: {case_exps[["orgId","carbon_source"]].drop_duplicates().shape[0]}')
print(f'Total experiments to query: {len(case_exps)}')

# Build lookup
case_exp_map = case_exps.groupby(['orgId', 'carbon_source'])['expName'].apply(list).to_dict()

Gapfilled cases with experiments: 38
Total experiments to query: 78

# Query fitness for candidate genes under relevant experiments
# Batched per organism to keep genefitness queries targeted
candidate_loci = matched.groupby('orgId')['locusId'].apply(set).to_dict()

fitness_records = []
for orgId in gf_cases['orgId'].unique():
    if orgId not in candidate_loci or not candidate_loci[orgId]:
        print(f'  {orgId}: no gene candidates, skipping fitness query')
        continue

    org_case_exps = case_exps[case_exps['orgId'] == orgId]
    exp_list = org_case_exps['expName'].unique().tolist()
    loci = list(candidate_loci[orgId])

    # Chunk locusIds if too many (SQL IN clause limit)
    chunk_size = 500
    for i in range(0, len(loci), chunk_size):
        loci_chunk = loci[i:i+chunk_size]
        loci_in = "','".join(loci_chunk)
        exp_in = "','".join(exp_list)

        fit_df = spark.sql(f"""
            SELECT orgId, locusId, expName,
                   CAST(fit AS DOUBLE) AS fit,
                   CAST(t AS DOUBLE) AS t
            FROM kescience_fitnessbrowser.genefitness
            WHERE orgId = '{orgId}'
              AND expName IN ('{exp_in}')
              AND locusId IN ('{loci_in}')
        """).toPandas()
        fitness_records.append(fit_df)

    print(f'  {orgId}: {len(loci)} candidates, {len(exp_list)} experiments')

if fitness_records:
    fitness_df = pd.concat(fitness_records, ignore_index=True)
    print(f'\nTotal fitness measurements: {len(fitness_df):,}')
else:
    fitness_df = pd.DataFrame(columns=['orgId', 'locusId', 'expName', 'fit', 't'])
    print('No fitness measurements found')

  Btheta: 2 candidates, 4 experiments

  Koxy: 7 candidates, 2 experiments

  Smeli: 10 candidates, 1 experiments

  Phaeo: 9 candidates, 18 experiments

  Keio: 10 candidates, 2 experiments

  HerbieS: 18 candidates, 11 experiments

  Cup4G11: 7 candidates, 6 experiments

  Dino: 8 candidates, 4 experiments

  Marino: 10 candidates, 3 experiments

  MR1: 4 candidates, 3 experiments

# Aggregate fitness per gene x carbon source
fitness_with_cs = fitness_df.merge(
    case_exps[['orgId', 'expName', 'carbon_source']].drop_duplicates(),
    on=['orgId', 'expName'],
    how='inner'
)

if len(fitness_with_cs) > 0:
    fitness_agg = fitness_with_cs.groupby(['orgId', 'locusId', 'carbon_source']).agg(
        mean_fit=('fit', 'mean'),
        min_fit=('fit', 'min'),
        max_abs_t=('t', lambda x: x.abs().max()),
        n_experiments=('expName', 'nunique'),
    ).reset_index()

    fitness_agg['fitness_class'] = 'non_significant'
    fitness_agg.loc[
        (fitness_agg['min_fit'] < FIT_MODERATE) & (fitness_agg['max_abs_t'] > T_MODERATE),
        'fitness_class'] = 'moderate'
    fitness_agg.loc[
        (fitness_agg['min_fit'] < FIT_THRESHOLD) & (fitness_agg['max_abs_t'] > T_THRESHOLD),
        'fitness_class'] = 'strong'

    print(f'Gene x carbon source fitness profiles: {len(fitness_agg)}')
    print(fitness_agg['fitness_class'].value_counts().to_string())
else:
    fitness_agg = pd.DataFrame(columns=['orgId', 'locusId', 'carbon_source',
                                         'mean_fit', 'min_fit', 'max_abs_t',
                                         'n_experiments', 'fitness_class'])
    print('No fitness data to aggregate')

Gene x carbon source fitness profiles: 188
fitness_class
non_significant    145
strong              33
moderate            10

# Find putatively essential genes (absent from genefitness = no viable transposon mutants)
essential_genes = spark.sql(f"""
    SELECT g.orgId, g.locusId, g.desc AS gene_desc
    FROM kescience_fitnessbrowser.gene g
    LEFT JOIN (
        SELECT DISTINCT orgId, locusId
        FROM kescience_fitnessbrowser.genefitness
        WHERE orgId IN ('{org_in}')
    ) gf ON g.orgId = gf.orgId AND g.locusId = gf.locusId
    WHERE g.type = '1'
      AND g.orgId IN ('{org_in}')
      AND gf.locusId IS NULL
""").toPandas()

print(f'Putative essential genes: {len(essential_genes):,}')
print(f'  Per organism: {essential_genes.groupby("orgId").size().describe().to_string()}')

# Match essential genes to target ECs
essential_with_ec = essential_genes.merge(
    all_ec_genes[['orgId', 'locusId', 'ec_number']],
    on=['orgId', 'locusId'],
    how='inner'
)

essential_matches = essential_with_ec.merge(
    target_ecs[['orgId', 'ec_number', 'rxn_id_base', 'name', 'carbon_source']].drop_duplicates(),
    on=['orgId', 'ec_number'],
    how='inner'
)

print(f'Essential genes matching gapfilled reaction ECs: {len(essential_matches)}')
if len(essential_matches) > 0:
    print(essential_matches[['orgId', 'locusId', 'ec_number', 'name']].drop_duplicates().head(10).to_string(index=False))

Putative essential genes: 12,077
  Per organism: count      14.000000
mean      862.642857
std       183.894454
min       561.000000
25%       766.750000
50%       814.500000
75%       954.000000
max      1310.000000
Essential genes matching gapfilled reaction ECs: 36
  orgId        locusId ec_number                                                  name
 Marino         GFF236   4.1.1.-  carboxyspermidine carboxy-lyase (spermidine-forming)
  Phaeo         GFF341  2.3.1.51 stearoyl-1-acylglycerol-3-phosphate O-acyltransferase
    PV4        5210715   4.1.1.-  carboxyspermidine carboxy-lyase (spermidine-forming)
    MR1         199697   4.1.1.-  carboxyspermidine carboxy-lyase (spermidine-forming)
  Korea Ga0059261_2296  2.3.1.51 stearoyl-1-acylglycerol-3-phosphate O-acyltransferase
Cup4G11   RR42_RS05440   2.2.1.6         2-Acetolactate pyruvate-lyase (carboxylating)
HerbieS  HSERO_RS10925   2.3.1.-          propanoyl-CoA:formate C-propanoyltransferase
 azobra  AZOBR_RS06190   4.1.1.-  carboxyspermidine carboxy-lyase (spermidine-forming)
HerbieS  HSERO_RS19870   4.1.1.-  carboxyspermidine carboxy-lyase (spermidine-forming)
   Dino        3608378   2.2.1.6         2-Acetolactate pyruvate-lyase (carboxylating)

# Query pangenome gene clusters with EC annotations for target clades
clades = manifest[['orgId', 'gtdb_species_clade_id']].copy()
clade_list = clades['gtdb_species_clade_id'].dropna().unique().tolist()
clade_in = "','" .join(clade_list)

pangenome_ec = spark.sql(f"""
    SELECT gc.gtdb_species_clade_id,
           gc.gene_cluster_id,
           gc.is_core,
           gc.is_auxiliary,
           gc.is_singleton,
           e.EC AS eggnog_ec
    FROM kbase_ke_pangenome.gene_cluster gc
    JOIN kbase_ke_pangenome.eggnog_mapper_annotations e
        ON gc.gene_cluster_id = e.query_name
    WHERE gc.gtdb_species_clade_id IN ('{clade_in}')
      AND e.EC IS NOT NULL
      AND e.EC != ''
      AND e.EC != '-'
""").toPandas()

print(f'Gene clusters with EC annotations in target clades: {len(pangenome_ec):,}')
print(f'  Clades found: {pangenome_ec["gtdb_species_clade_id"].nunique()}')

Gene clusters with EC annotations in target clades: 32,858
  Clades found: 14

# Parse eggNOG EC (comma-separated) and match to target ECs
target_ec_set = set(rxn_ec['ec_number'].dropna().unique())

pg_ec_rows = []
for _, row in pangenome_ec.iterrows():
    for ec in str(row['eggnog_ec']).split(','):
        ec = ec.strip()
        if ec and ec != '-' and ec in target_ec_set:
            pg_ec_rows.append({
                'gtdb_species_clade_id': row['gtdb_species_clade_id'],
                'gene_cluster_id': row['gene_cluster_id'],
                'is_core': row['is_core'],
                'is_auxiliary': row['is_auxiliary'],
                'is_singleton': row['is_singleton'],
                'ec_number': ec,
            })

pg_target = pd.DataFrame(pg_ec_rows)
print(f'Gene clusters matching target ECs: {len(pg_target)}')

if len(pg_target) > 0:
    pg_target = pg_target.merge(clades, on='gtdb_species_clade_id', how='left')

    conservation = pg_target.groupby(['orgId', 'ec_number']).agg(
        n_clusters=('gene_cluster_id', 'nunique'),
        n_core=('is_core', lambda x: (x == True).sum()),
        n_auxiliary=('is_auxiliary', lambda x: (x == True).sum()),
        n_singleton=('is_singleton', lambda x: (x == True).sum()),
    ).reset_index()

    conservation['conservation_class'] = 'absent'
    conservation.loc[conservation['n_core'] > 0, 'conservation_class'] = 'core'
    conservation.loc[
        (conservation['n_core'] == 0) & (conservation['n_auxiliary'] > 0),
        'conservation_class'] = 'accessory'
    conservation.loc[
        (conservation['n_core'] == 0) & (conservation['n_auxiliary'] == 0) &
        (conservation['n_singleton'] > 0),
        'conservation_class'] = 'singleton'

    print(f'\nConservation of gapfilled reaction ECs:')
    print(conservation['conservation_class'].value_counts().to_string())
else:
    conservation = pd.DataFrame(columns=['orgId', 'ec_number', 'n_clusters',
                                          'n_core', 'conservation_class'])
    print('No matching gene clusters found in pangenome')

Gene clusters matching target ECs: 1459

Conservation of gapfilled reaction ECs:
conservation_class
core         530
accessory     41

# Map carbon sources to GapMind pathway names
CS_TO_GAPMIND = {
    'D-Fructose': 'fructose',
    'D-Glucose': 'glucose',
    'D-Galactose': 'galactose',
    'D-Mannose': 'mannose',
    'D-Xylose': 'xylose',
    'D-Trehalose dihydrate': 'trehalose',
    'N-Acetyl-D-Glucosamine': 'NAG',
    'Sodium pyruvate': 'pyruvate',
    'Trisodium citrate dihydrate': 'citrate',
    'Potassium acetate': 'acetate',
    'Sodium D,L-Lactate': 'lactate',
    'Sodium butyrate': 'butyrate',
    'L-Glutamine': None,
    'L-Proline': None,
    'L-Fucose': 'fucose',
    'D-Maltose monohydrate': 'maltose',
    'D-Raffinose pentahydrate': None,
    'a-Cyclodextrin': None,
}

gf_with_gapmind = gf_cases.copy()
gf_with_gapmind['gapmind_pathway'] = gf_with_gapmind['carbon_source'].map(CS_TO_GAPMIND)
has_gapmind = gf_with_gapmind[gf_with_gapmind['gapmind_pathway'].notna()]
print(f'Gapfilled cases with GapMind pathway: {len(has_gapmind)} of {len(gf_cases)}')

# Extract genome accessions (strip RS_/GB_ prefix for GapMind)
manifest['genome_accession'] = manifest['gtdb_species_clade_id'].apply(
    lambda x: x.split('--')[1].replace('RS_', '').replace('GB_', '') if '--' in str(x) else None
)

genome_list = manifest['genome_accession'].dropna().tolist()
genome_in = "','" .join(genome_list)

# Query GapMind for focal genomes
gapmind_raw = spark.sql(f"""
    SELECT genome_id, pathway, metabolic_category,
           MAX(CAST(score_simplified AS DOUBLE)) AS score_simplified
    FROM kbase_ke_pangenome.gapmind_pathways
    WHERE genome_id IN ('{genome_in}')
      AND metabolic_category = 'carbon'
    GROUP BY genome_id, pathway, metabolic_category
""").toPandas()

print(f'GapMind predictions for focal genomes: {len(gapmind_raw)}')
print(f'  Genomes found: {gapmind_raw["genome_id"].nunique()}')

# Also query clade-level completeness
gapmind_clade = spark.sql(f"""
    SELECT clade_name, genome_id, pathway,
           MAX(CAST(score_simplified AS DOUBLE)) AS score_simplified
    FROM kbase_ke_pangenome.gapmind_pathways
    WHERE clade_name IN ('{clade_in}')
      AND metabolic_category = 'carbon'
    GROUP BY clade_name, genome_id, pathway
""").toPandas()

print(f'GapMind clade predictions: {len(gapmind_clade):,}')

Gapfilled cases with GapMind pathway: 31 of 38

GapMind predictions for focal genomes: 744
  Genomes found: 12

GapMind clade predictions: 67,890

# Cross-reference GapMind with gapfilled cases
genome_to_org = manifest.set_index('genome_accession')['orgId'].to_dict()

if len(gapmind_raw) > 0:
    gapmind_raw['orgId'] = gapmind_raw['genome_id'].map(genome_to_org)
    gapmind_match = has_gapmind.merge(
        gapmind_raw[['orgId', 'pathway', 'score_simplified']],
        left_on=['orgId', 'gapmind_pathway'],
        right_on=['orgId', 'pathway'],
        how='left'
    )
    print(f'GapMind cross-validation:')
    print(f'  With data: {gapmind_match["score_simplified"].notna().sum()}')
    print(f'  Incomplete (supports gap): {(gapmind_match["score_simplified"] == 0).sum()}')
    print(f'  Complete (contradicts gap): {(gapmind_match["score_simplified"] == 1).sum()}')
else:
    gapmind_match = has_gapmind.copy()
    gapmind_match['score_simplified'] = np.nan
    print('No GapMind data for focal genomes')

# Clade-level completeness
if len(gapmind_clade) > 0:
    clade_completeness = gapmind_clade.groupby(['clade_name', 'pathway']).agg(
        n_genomes=('genome_id', 'nunique'),
        n_complete=('score_simplified', lambda x: (x == 1.0).sum()),
    ).reset_index()
    clade_completeness['pct_complete'] = (
        100 * clade_completeness['n_complete'] / clade_completeness['n_genomes']
    )
    clade_to_org = manifest.set_index('gtdb_species_clade_id')['orgId'].to_dict()
    clade_completeness['orgId'] = clade_completeness['clade_name'].map(clade_to_org)
    print(f'\nClade pathway completeness entries: {len(clade_completeness)}')
else:
    clade_completeness = pd.DataFrame(columns=['clade_name', 'pathway', 'n_genomes',
                                                 'n_complete', 'pct_complete', 'orgId'])
    print('No clade-level GapMind data')

GapMind cross-validation:
  With data: 25
  Incomplete (supports gap): 3
  Complete (contradicts gap): 22

Clade pathway completeness entries: 868

# Build master candidate table
candidates = matched[['orgId', 'carbon_source', 'rxn_id_base', 'name',
                       'ec_number', 'locusId', 'source']].copy()
candidates = candidates.rename(columns={'source': 'ec_source'})

# Add gene descriptions
candidates = candidates.merge(
    fb_genes[['orgId', 'locusId', 'gene_desc']],
    on=['orgId', 'locusId'],
    how='left'
)

# Add fitness evidence
if len(fitness_agg) > 0:
    candidates = candidates.merge(
        fitness_agg[['orgId', 'locusId', 'carbon_source',
                      'mean_fit', 'min_fit', 'max_abs_t', 'fitness_class']],
        on=['orgId', 'locusId', 'carbon_source'],
        how='left'
    )
else:
    candidates['mean_fit'] = np.nan
    candidates['min_fit'] = np.nan
    candidates['max_abs_t'] = np.nan
    candidates['fitness_class'] = None
candidates['fitness_class'] = candidates['fitness_class'].fillna('no_data')

# Add pangenome conservation
if len(conservation) > 0:
    candidates = candidates.merge(
        conservation[['orgId', 'ec_number', 'conservation_class', 'n_core']],
        on=['orgId', 'ec_number'],
        how='left'
    )
else:
    candidates['conservation_class'] = None
    candidates['n_core'] = 0
candidates['conservation_class'] = candidates['conservation_class'].fillna('not_checked')

# Add GapMind evidence (case-level)
gapmind_lookup = gf_with_gapmind[['orgId', 'carbon_source', 'gapmind_pathway']].drop_duplicates()
candidates = candidates.merge(gapmind_lookup, on=['orgId', 'carbon_source'], how='left')

if len(gapmind_match) > 0 and 'score_simplified' in gapmind_match.columns:
    gm_scores = gapmind_match[['orgId', 'carbon_source', 'score_simplified']].rename(
        columns={'score_simplified': 'gapmind_score'}
    ).drop_duplicates()
    candidates = candidates.merge(gm_scores, on=['orgId', 'carbon_source'], how='left')
else:
    candidates['gapmind_score'] = np.nan

if len(clade_completeness) > 0:
    candidates = candidates.merge(
        clade_completeness[['orgId', 'pathway', 'pct_complete']].rename(
            columns={'pathway': 'gapmind_pathway', 'pct_complete': 'clade_pct_complete'}
        ),
        on=['orgId', 'gapmind_pathway'],
        how='left'
    )
else:
    candidates['clade_pct_complete'] = np.nan

# Add essential gene candidates
if len(essential_matches) > 0:
    ess = essential_matches[['orgId', 'locusId', 'ec_number', 'rxn_id_base',
                             'name', 'carbon_source', 'gene_desc']].copy()
    ess['ec_source'] = 'kegg'
    ess['fitness_class'] = 'essential'
    ess['mean_fit'] = np.nan
    ess['min_fit'] = np.nan
    ess['max_abs_t'] = np.nan

    # Add conservation
    if len(conservation) > 0:
        ess = ess.merge(
            conservation[['orgId', 'ec_number', 'conservation_class', 'n_core']],
            on=['orgId', 'ec_number'], how='left'
        )
    else:
        ess['conservation_class'] = 'not_checked'
        ess['n_core'] = 0
    ess['conservation_class'] = ess['conservation_class'].fillna('not_checked')

    ess = ess.merge(gapmind_lookup, on=['orgId', 'carbon_source'], how='left')
    ess['gapmind_score'] = np.nan
    ess['clade_pct_complete'] = np.nan

    # Avoid duplicates
    existing = set(zip(candidates['orgId'], candidates['locusId'],
                       candidates['rxn_id_base'], candidates['carbon_source']))
    ess_new = ess[
        ~ess.apply(lambda r: (r['orgId'], r['locusId'],
                              r['rxn_id_base'], r['carbon_source']) in existing, axis=1)
    ]
    candidates = pd.concat([candidates, ess_new], ignore_index=True)
    print(f'Added {len(ess_new)} essential gene candidates')

print(f'Total candidate entries: {len(candidates)}')
print(f'  Unique genes: {candidates[["orgId","locusId"]].drop_duplicates().shape[0]}')

Added 0 essential gene candidates
Total candidate entries: 107
  Unique genes: 107

# Score confidence levels
def score_confidence(row):
    has_fitness = row['fitness_class'] in ('strong', 'moderate')
    is_essential = row['fitness_class'] == 'essential'
    has_conservation = row.get('conservation_class') == 'core'
    gapmind_supports = (pd.notna(row.get('gapmind_score')) and
                        row.get('gapmind_score') == 0.0)

    if row['fitness_class'] == 'strong' and (has_conservation or gapmind_supports):
        return 'high'
    if is_essential and has_conservation:
        return 'high'
    if has_fitness:
        return 'medium'
    if is_essential:
        return 'medium'
    if has_conservation and gapmind_supports:
        return 'medium'
    if has_conservation:
        return 'low'
    return 'low'

candidates['confidence'] = candidates.apply(score_confidence, axis=1)

# Rank within each case
conf_order = {'high': 0, 'medium': 1, 'low': 2}
candidates['_conf_rank'] = candidates['confidence'].map(conf_order)
candidates['_fit_mag'] = candidates['min_fit'].fillna(0).abs()

candidates = candidates.sort_values(
    ['orgId', 'carbon_source', 'rxn_id_base', '_conf_rank', '_fit_mag'],
    ascending=[True, True, True, True, False]
)
candidates['candidate_rank'] = candidates.groupby(
    ['orgId', 'carbon_source', 'rxn_id_base']
).cumcount() + 1

candidates = candidates.drop(columns=['_conf_rank', '_fit_mag'])

best = candidates[candidates['candidate_rank'] == 1]
print(f'Confidence distribution (all candidates):')
print(candidates['confidence'].value_counts().to_string())
print(f'\nBest candidates per reaction: {len(best)}')
print(f'  High: {(best["confidence"] == "high").sum()}')
print(f'  Medium: {(best["confidence"] == "medium").sum()}')
print(f'  Low: {(best["confidence"] == "low").sum()}')

Confidence distribution (all candidates):
confidence
low       84
high      12
medium    11

Best candidates per reaction: 51
  High: 10
  Medium: 6
  Low: 35

# Top high-confidence candidates
display_cols = ['orgId', 'carbon_source', 'rxn_id_base', 'name', 'ec_number',
                'locusId', 'gene_desc', 'mean_fit', 'max_abs_t', 'fitness_class',
                'conservation_class', 'confidence']

top = candidates[candidates['confidence'] == 'high'].sort_values(
    'min_fit', ascending=True
).head(20)

print('Top 20 High-Confidence Annotation Gap Candidates:')
if len(top) > 0:
    print(top[display_cols].to_string(index=False))
else:
    print('  (none at high confidence)')
    top_med = candidates[candidates['confidence'] == 'medium'].sort_values(
        'min_fit', ascending=True
    ).head(20)
    print(f'\nTop 20 Medium-Confidence instead:')
    print(top_med[display_cols].to_string(index=False))

# Reactions with no gene candidates at all
all_enzymatic_keys = set(zip(enzymatic['orgId'], enzymatic['carbon_source'],
                             enzymatic['rxn_id_base']))
resolved_keys = set(zip(candidates['orgId'], candidates['carbon_source'],
                         candidates['rxn_id_base']))
unresolved_keys = all_enzymatic_keys - resolved_keys

print(f'\n--- Unresolved annotation gaps (no gene candidates): {len(unresolved_keys)} ---')
if unresolved_keys:
    unresolved = pd.DataFrame(list(unresolved_keys),
                              columns=['orgId', 'carbon_source', 'rxn_id_base'])
    unresolved = unresolved.merge(
        enzymatic[['orgId', 'carbon_source', 'rxn_id_base', 'name', 'ec_numbers']].drop_duplicates(),
        on=['orgId', 'carbon_source', 'rxn_id_base'], how='left'
    )
    print(unresolved.to_string(index=False))

Top 20 High-Confidence Annotation Gap Candidates:
   orgId   carbon_source rxn_id_base                                                                                                   name ec_number        locusId                                               gene_desc  mean_fit  max_abs_t fitness_class conservation_class confidence
 HerbieS Sodium pyruvate    rxn02185                                                          2-Acetolactate pyruvate-lyase (carboxylating)   2.2.1.6  HSERO_RS08650                                   acetolactate synthase -7.091511   8.244066        strong               core       high
   Smeli Sodium pyruvate    rxn02185                                                          2-Acetolactate pyruvate-lyase (carboxylating)   2.2.1.6       SMc01431               acetolactate synthase 3 catalytic subunit -7.618500   7.430802        strong               core       high
 HerbieS Sodium pyruvate    rxn03436                                      (S)-2-Aceto-2-hydroxybutanoate:NADP+ oxidoreductase (isomerizing)  1.1.1.86  HSERO_RS08660                             ketol-acid reductoisomerase -7.348724   5.069962        strong               core       high
    Koxy Sodium pyruvate    rxn03436                                      (S)-2-Aceto-2-hydroxybutanoate:NADP+ oxidoreductase (isomerizing)  1.1.1.86  BWI76_RS00990                             ketol-acid reductoisomerase -6.052952   8.618473        strong               core       high
   Phaeo Sodium pyruvate    rxn02185                                                          2-Acetolactate pyruvate-lyase (carboxylating)   2.2.1.6        GFF1987           acetolactate synthase isozyme 3 large subunit -4.620879   9.358506        strong               core       high
  Marino Sodium pyruvate    rxn02185                                                          2-Acetolactate pyruvate-lyase (carboxylating)   2.2.1.6         GFF544              acetolactate synthase 3 regulatory subunit -3.159043   6.227283        strong               core       high
    Keio Sodium pyruvate    rxn03436                                      (S)-2-Aceto-2-hydroxybutanoate:NADP+ oxidoreductase (isomerizing)  1.1.1.86          17826                      ketol-acid reductoisomerase (NCBI) -3.857710  18.131366        strong               core       high
  Marino Sodium pyruvate    rxn02185                                                          2-Acetolactate pyruvate-lyase (carboxylating)   2.2.1.6         GFF543 acetolactate synthase, large subunit, biosynthetic type -3.349700  10.208585        strong               core       high
    Keio Sodium pyruvate    rxn02185                                                          2-Acetolactate pyruvate-lyase (carboxylating)   2.2.1.6        1936210          acetolactate synthase III large subunit (NCBI) -3.695227  13.866260        strong               core       high
    Keio Sodium pyruvate    rxn02185                                                          2-Acetolactate pyruvate-lyase (carboxylating)   2.2.1.6          14224              acetolactate synthase small subunit (NCBI) -3.487028  11.986621        strong               core       high
Dyella79 Sodium pyruvate    rxn19764            glycine:H-protein-lipoyllysine oxidoreductase (decarboxylating, acceptor-amino-methylating)   1.4.4.2 N515DRAFT_0055                                   glycine dehydrogenase -2.506133  16.149485        strong               core       high
Dyella79 Sodium pyruvate    rxn19766 [protein]-8-S-aminomethyldihydrolipoyllysine:tetrahydrofolate aminomethyltransferase (ammonia-forming)  2.1.2.10 N515DRAFT_4331                                  aminomethyltransferase -2.233723   7.590537        strong               core       high

--- Unresolved annotation gaps (no gene candidates): 150 ---
   orgId               carbon_source rxn_id_base                                                                                              name        ec_numbers
  azobra             Sodium pyruvate    rxn00796                                                                                   biotin synthase               NaN
  azobra             Sodium pyruvate    rxn02914                                                   3-Phosphoserine:2-oxoglutarate aminotransferase          2.6.1.52
   Phaeo             Sodium pyruvate    rxn15947                                                            carboxyspermidine:NADP+ oxidoreductase  1.5.1.-|1.5.1.43
     MR1             Sodium pyruvate    rxn05464                                        trans-Octodec-2-enoyl-ACP:NAD+ oxidoreductase (A-specific)           1.3.1.0
   Korea                 L-Glutamine    rxn01790                                                               (R)-Pantoate:NADP+ 2-oxidoreductase         1.1.1.169
   Korea                 L-Glutamine    rxn05153                                                                       sulfate-transporting ATPase          3.6.3.25
   Phaeo             Sodium pyruvate    rxn05464                                        trans-Octodec-2-enoyl-ACP:NAD+ oxidoreductase (A-specific)           1.3.1.0
Dyella79                 D-Galactose    rxn15121                                                              ATP:D-galactose 1-phosphotransferase           2.7.1.6
  Marino             Sodium pyruvate    rxn00551                                           diphosphate:D-fructose-6-phosphate 1-phosphotransferase          2.7.1.90
 HerbieS                   D-Glucose    rxn15270                                                     D-mannose-6-phosphate aldose-ketose-isomerase           5.3.1.8
   Smeli             Sodium pyruvate    rxn20627                                                                        protoporphyrinogen oxidase           1.3.3.4
   Smeli             Sodium pyruvate    rxn02312                                   S-Adenosyl-L-methionine:8-amino-7-oxononanoate aminotransferase          2.6.1.62
Dyella79                 D-Galactose    rxn30604                                                                                   ALDOSE1EPIM-RXN               NaN
  azobra             Sodium pyruvate    rxn00957                                                         4-hydroxybenzaldehyde:NAD+ oxidoreductase 1.2.1.28|1.2.1.64
   Korea    D-Raffinose pentahydrate    rxn26127                                                                        maltodextrin phosphorylase           2.4.1.1
   Phaeo                 L-Glutamine    rxn19346                                                                               CYSTATHIONASE-RXN.c               NaN
    Koxy             Sodium pyruvate    rxn12239                                                            4-hydroxy-benzyl-alcohol dehydrogenase               NaN
   Korea                 L-Glutamine    rxn12239                                                            4-hydroxy-benzyl-alcohol dehydrogenase               NaN
  azobra             Sodium pyruvate    rxn02159                                                                  L-Histidinol:NAD+ oxidoreductase          1.1.1.23
     PV4             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
  Marino             Sodium pyruvate    rxn00656                                                        L-Alanine:3-oxopropanoate aminotransferase          2.6.1.18
 Cup4G11             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
Dyella79             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
  Marino             Sodium pyruvate    rxn20627                                                                        protoporphyrinogen oxidase           1.3.3.4
  Btheta                  D-Fructose    rxn05464                                        trans-Octodec-2-enoyl-ACP:NAD+ oxidoreductase (A-specific)           1.3.1.0
    Dino             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
 HerbieS                   D-Glucose    rxn15060 D-fructose-6-phosphate D-erythrose-4-phosphate-lyase (adding phosphate; acetyl-phosphate-forming)          4.1.2.22
   Korea                 L-Glutamine    rxn00958                                                        4-Hydroxybenzaldehyde:NADP+ oxidoreductase           1.2.1.7
   Smeli             Sodium pyruvate    rxn09149                                             Phospholipase A2 (phosphatidate, n-C18:0) (periplasm)           3.1.1.4
    Koxy             Sodium pyruvate    rxn01520                                           5,10-Methylenetetrahydrofolate:dUMP C-methyltransferase          2.1.1.45
  azobra             Sodium pyruvate    rxn16573                                             pimelyl-[acyl-carrier protein] methyl ester hydrolase          3.1.1.85
Dyella79                 D-Galactose    rxn00355                                             UTP:alpha-D-galactose-1-phosphate uridylyltransferase          2.7.7.10
    Dino             Sodium pyruvate    rxn02277                                                  7,8-Diaminononanoate:carbon-dioxide cyclo-ligase           6.3.3.3
  Btheta                  D-Fructose    rxn02303                                         Coproporphyrinogen:oxygen oxidoreductase(decarboxylating)           1.3.3.3
     MR1             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
   Smeli             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
    Keio             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
  Btheta                  D-Fructose    rxn10204                                              acyl-CoA:sn-glycerol-3-phosphate 1-O-acyltransferase          2.3.1.15
   Smeli             Sodium pyruvate    rxn21864                                                                                       RXN-11484.c          2.3.1.47
 HerbieS             Sodium pyruvate    rxn10030                                                                   b-ketoacyl synthetase (n-C18:1)               NaN
 Cup4G11             Sodium butyrate    rxn06096                                     1,4-alpha-D-Glucan:orthophosphate alpha-D-glucosyltransferase           2.4.1.1
   Smeli             Sodium pyruvate    rxn16573                                             pimelyl-[acyl-carrier protein] methyl ester hydrolase          3.1.1.85
    Koxy             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
    Dino             Sodium pyruvate    rxn10030                                                                   b-ketoacyl synthetase (n-C18:1)               NaN
   Korea                 L-Glutamine    rxn09310                                                                      thiazole phosphate synthesis               NaN
   Phaeo                   D-Mannose    rxn05740                                                                 1,4-alpha-glucan-branching enzyme               NaN
 Cup4G11          Sodium D,L-Lactate    rxn01057                                                                                  lactate racemase           5.1.2.1
  azobra             Sodium pyruvate    rxn00863                                                                  L-histidinal:NAD+ oxidoreductase          1.1.1.23
  azobra             Sodium pyruvate    rxn16341                                                 hydrogen-sulfide:flavocytochrome c oxidoreductase           1.8.2.3
     PV4          Sodium D,L-Lactate    rxn17391                                                  2-hydroxybutane-1,2,3-tricarboxylate hydro-lyase          4.2.1.79
  azobra             Sodium pyruvate    rxn00693                                       5-Methyltetrahydrofolate:L-homocysteine S-methyltransferase          2.1.1.13
   Korea    D-Raffinose pentahydrate    rxn10348                                                                                     alpha-amylase               NaN
    Dino             Sodium pyruvate    rxn20627                                                                        protoporphyrinogen oxidase           1.3.3.4
 HerbieS                    L-Fucose    rxn25279                                    2-methylcitrate dehydratase (2-methyl-trans-aconitate forming)         4.2.1.117
  azobra             Sodium pyruvate    rxn00224                                                    protoheme ferro-lyase (protoporphyrin-forming)          4.99.1.1
  Btheta                  D-Fructose    rxn09310                                                                      thiazole phosphate synthesis               NaN
    Koxy             Sodium pyruvate    rxn05464                                        trans-Octodec-2-enoyl-ACP:NAD+ oxidoreductase (A-specific)           1.3.1.0
   Korea                 L-Glutamine    rxn05464                                        trans-Octodec-2-enoyl-ACP:NAD+ oxidoreductase (A-specific)           1.3.1.0
   Phaeo                   D-Mannose    rxn09399                                                                 1,4-alpha-glucan branching enzyme          2.4.1.18
Dyella79              a-Cyclodextrin    rxn26127                                                                        maltodextrin phosphorylase           2.4.1.1
   Korea    D-Raffinose pentahydrate    rxn05740                                                                 1,4-alpha-glucan-branching enzyme               NaN
  azobra             Sodium pyruvate    rxn41452                                                                                               NaN         2.1.1.197
  azobra             Sodium pyruvate    rxn12239                                                            4-hydroxy-benzyl-alcohol dehydrogenase               NaN
 HerbieS                    L-Fucose    rxn25278                                                                       2-methylaconitate isomerase               NaN
   Korea                    D-Xylose    rxn01753                                                                            D-Xylonate hydro-lyase          4.2.1.82
   Phaeo      N-Acetyl-D-Glucosamine    rxn00892                                                   ATP:N-acetyl-D-glucosamine 6-phosphotransferase          2.7.1.59
   Phaeo             Sodium pyruvate    rxn20627                                                                        protoporphyrinogen oxidase           1.3.3.4
   Phaeo       D-Trehalose dihydrate    rxn00007                                                              alpha,alpha-trehalose glucohydrolase          3.2.1.28
   Korea       D-Maltose monohydrate    rxn00607                                                      Maltose-6'-phosphate 6-phosphoglucohydrolase         3.2.1.122
  Btheta                  D-Fructose    rxn00350                                                          glutathione gamma-glutamylaminopeptidase 2.3.2.2|3.4.19.13
   Phaeo Trisodium citrate dihydrate    rxn08474                                                                                         RXN0-6.cp               NaN
 HerbieS             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
    Dino             Sodium pyruvate    rxn01927                                     3-Hydroxyanthranilate:oxygen 3,4-oxidoreductase (decyclizing)         1.13.11.6
 HerbieS                    D-Xylose    rxn01751                                                                  D-Xylonolactone lactonohydrolase          3.1.1.68
   Korea    D-Raffinose pentahydrate    rxn09399                                                                 1,4-alpha-glucan branching enzyme          2.4.1.18
Dyella79                 D-Galactose    rxn30623                                                                                BETAGALACTOSID-RXN               NaN
   Smeli             Sodium pyruvate    rxn41452                                                                                               NaN         2.1.1.197
   Phaeo Trisodium citrate dihydrate    rxn00068                                                                          Fe2+:NAD+ oxidoreductase          1.16.1.7
 Cup4G11             Sodium pyruvate    rxn41185                                                                                               NaN          1.2.99.-
Dyella79             Sodium pyruvate    rxn05211                                                               Transport of citrate, extracellular         2.A.1.6.-
  Btheta                  D-Fructose    rxn20627                                                                        protoporphyrinogen oxidase           1.3.3.4
   Korea                 L-Glutamine    rxn00704                                                     alpha-D-Glucose 1-phosphate 1,6-phosphomutase   5.4.2.2|5.4.2.5
    Dino             Sodium pyruvate    rxn16573                                             pimelyl-[acyl-carrier protein] methyl ester hydrolase          3.1.1.85
     PV4             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
   Phaeo             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
  Marino             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
 Cup4G11             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
  azobra             Sodium pyruvate    rxn20627                                                                        protoporphyrinogen oxidase           1.3.3.4
  azobra             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
 HerbieS                    L-Fucose    rxn00541                                                  L-threonine acetaldehyde-lyase (glycine-forming)           4.1.2.5
    Dino             Sodium pyruvate    rxn01438                                        L-Kynurenine,NADPH:oxygen oxidoreductase (3-hydroxylating)         1.14.13.9
Dyella79             Sodium pyruvate    rxn20627                                                                        protoporphyrinogen oxidase           1.3.3.4
   Phaeo           Potassium acetate    rxn11285                                                                             uridine phosphorylase               NaN
 Cup4G11             Sodium butyrate    rxn09952                                                                                     alpha-amylase           3.2.1.1
   Smeli             Sodium pyruvate    rxn17731                                                                                   biotin synthase           2.8.1.6
  Marino             Sodium pyruvate    rxn00467                                                           L-Ornithine:2-oxo-acid aminotransferase          2.6.1.13
    Dino             Sodium pyruvate    rxn02312                                   S-Adenosyl-L-methionine:8-amino-7-oxononanoate aminotransferase          2.6.1.62
   Korea                 L-Glutamine    rxn00520                                                                Succinate:CoA ligase (IDP-forming)           6.2.1.4
  azobra             Sodium pyruvate    rxn02277                                                  7,8-Diaminononanoate:carbon-dioxide cyclo-ligase           6.3.3.3
Dyella79              a-Cyclodextrin    rxn10348                                                                                     alpha-amylase               NaN
 Cup4G11             Sodium butyrate    rxn09989                                                                Maltodextrin glucosidase (dextrin)               NaN
 Cup4G11             Sodium butyrate    rxn05740                                                                 1,4-alpha-glucan-branching enzyme               NaN
   Smeli             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
    Keio             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
 HerbieS             Sodium pyruvate    rxn15947                                                            carboxyspermidine:NADP+ oxidoreductase  1.5.1.-|1.5.1.43
   Korea                 L-Glutamine    rxn00656                                                        L-Alanine:3-oxopropanoate aminotransferase          2.6.1.18
   Smeli             Sodium pyruvate    rxn02277                                                  7,8-Diaminononanoate:carbon-dioxide cyclo-ligase           6.3.3.3
 Cup4G11             Sodium butyrate    rxn09399                                                                 1,4-alpha-glucan branching enzyme          2.4.1.18
     MR1          Sodium D,L-Lactate    rxn17391                                                  2-hydroxybutane-1,2,3-tricarboxylate hydro-lyase          4.2.1.79
   Korea                 L-Glutamine    rxn20627                                                                        protoporphyrinogen oxidase           1.3.3.4
    Dino             Sodium pyruvate    rxn41452                                                                                               NaN         2.1.1.197
Dyella79             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
   Smeli             Sodium pyruvate    rxn05464                                        trans-Octodec-2-enoyl-ACP:NAD+ oxidoreductase (A-specific)           1.3.1.0
    Dino             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
    Keio             Sodium pyruvate    rxn05464                                        trans-Octodec-2-enoyl-ACP:NAD+ oxidoreductase (A-specific)           1.3.1.0
    Dino             Sodium pyruvate    rxn21864                                                                                       RXN-11484.c          2.3.1.47
Dyella79              a-Cyclodextrin    rxn09399                                                                 1,4-alpha-glucan branching enzyme          2.4.1.18
    Dino                 L-Glutamine    rxn11285                                                                             uridine phosphorylase               NaN
     MR1             Sodium pyruvate    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
  Btheta                  D-Fructose    rxn00623                                                             hydrogen-sulfide:NADP+ oxidoreductase   1.8.1.2|1.8.2.2
  Marino             Sodium pyruvate    rxn05464                                        trans-Octodec-2-enoyl-ACP:NAD+ oxidoreductase (A-specific)           1.3.1.0
    Koxy             Sodium pyruvate    rxn00957                                                         4-hydroxybenzaldehyde:NAD+ oxidoreductase 1.2.1.28|1.2.1.64
  azobra             Sodium pyruvate    rxn00747                                              D-glyceraldehyde-3-phosphate aldose-ketose-isomerase           5.3.1.1
     MR1             Sodium pyruvate    rxn00467                                                           L-Ornithine:2-oxo-acid aminotransferase          2.6.1.13
    Dino             Sodium pyruvate    rxn17731                                                                                   biotin synthase           2.8.1.6
    Dino             Sodium pyruvate    rxn02402                     Nicotinate-nucleotide:pyrophosphate phosphoribosyltransferase (carboxylating)          2.4.2.19
  Btheta                  D-Fructose    rxn00224                                                    protoheme ferro-lyase (protoporphyrin-forming)          4.99.1.1
   Phaeo             Sodium pyruvate    rxn15949                                              carboxyspermidine carboxy-lyase (spermidine-forming)  4.1.1.-|4.1.1.96
 HerbieS             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
 HerbieS                 L-Glutamine    rxn00194                                            L-glutamate 1-carboxy-lyase (4-aminobutanoate-forming)          4.1.1.15
   Korea                  D-Fructose    rxn01492                                                   ATP:D-fructose-1-phosphate 6-phosphotransferase          2.7.1.56
    Dino             Sodium pyruvate    rxn00340                                                          L-aspartate:ammonia ligase (AMP-forming)   6.3.1.1|6.3.5.4
  Marino             Sodium pyruvate    rxn02160                                                           L-Histidinol-phosphate phosphohydrolase          3.1.3.15
  Btheta                  D-Fructose    rxn12376                             transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)               NaN
  Btheta      N-Acetyl-D-Glucosamine    rxn15060 D-fructose-6-phosphate D-erythrose-4-phosphate-lyase (adding phosphate; acetyl-phosphate-forming)          4.1.2.22
Dyella79              a-Cyclodextrin    rxn05740                                                                 1,4-alpha-glucan-branching enzyme               NaN
 Cup4G11             Sodium pyruvate    rxn10030                                                                   b-ketoacyl synthetase (n-C18:1)               NaN
    Dino             Sodium pyruvate    rxn15947                                                            carboxyspermidine:NADP+ oxidoreductase  1.5.1.-|1.5.1.43
Dyella79                   L-Proline    rxn00203                                                                      2-Oxoglutarate carboxy-lyase          4.1.1.71
  azobra             Sodium pyruvate    rxn10030                                                                   b-ketoacyl synthetase (n-C18:1)               NaN
  Btheta      N-Acetyl-D-Glucosamine    rxn14178                                   2-oxobutanoate:ferredoxin 2-oxidoreductase (CoA-propanoylating)   1.2.7.1|1.2.7.2
Dyella79             Sodium pyruvate    rxn05464                                        trans-Octodec-2-enoyl-ACP:NAD+ oxidoreductase (A-specific)           1.3.1.0
  azobra             Sodium pyruvate    rxn30633                                                                                      CYSTAGLY-RXN               NaN
   Phaeo                   D-Mannose    rxn10348                                                                                     alpha-amylase               NaN
   Phaeo             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
  Marino             Sodium pyruvate    rxn09310                                                                      thiazole phosphate synthesis               NaN
  Btheta                  D-Fructose    rxn05229                                                                 NCAIR synthetase and NCAIR mutase               NaN
   Phaeo             Sodium pyruvate    rxn00340                                                          L-aspartate:ammonia ligase (AMP-forming)   6.3.1.1|6.3.5.4
  Btheta                  D-Fructose    rxn02288                                                                Uroporphyrinogen-III carboxy-lyase          4.1.1.37
     PV4             Sodium pyruvate    rxn05463                                                Octodecanoyl-ACP:NADP+ oxidoreductase (A-specific)           1.3.1.0

# Per-case summary
case_summary = []
for _, case in gf_cases.iterrows():
    orgId, cs = case['orgId'], case['carbon_source']
    n_enz = len(enzymatic[(enzymatic['orgId'] == orgId) & (enzymatic['carbon_source'] == cs)])
    case_cands = best[(best['orgId'] == orgId) & (best['carbon_source'] == cs)]

    case_summary.append({
        'orgId': orgId,
        'carbon_source': cs,
        'n_gapfilled_rxns': case['n_gapfilled_rxns'],
        'n_enzymatic': n_enz,
        'n_with_candidates': len(case_cands),
        'n_high': (case_cands['confidence'] == 'high').sum(),
        'n_medium': (case_cands['confidence'] == 'medium').sum(),
        'n_low': (case_cands['confidence'] == 'low').sum(),
    })

case_df = pd.DataFrame(case_summary)
case_df['pct_resolved'] = np.where(
    case_df['n_enzymatic'] > 0,
    100 * case_df['n_with_candidates'] / case_df['n_enzymatic'],
    0
).clip(0, 100)

print('Annotation Gap Resolution by Case:')
print(case_df.to_string(index=False))
print(f'\nMean % resolved: {case_df["pct_resolved"].mean():.1f}%')
print(f'Cases with high-confidence candidates: {(case_df["n_high"] > 0).sum()}')

Annotation Gap Resolution by Case:
   orgId               carbon_source  n_gapfilled_rxns  n_enzymatic  n_with_candidates  n_high  n_medium  n_low  pct_resolved
  Btheta                  D-Fructose                13           12                  1       0         0      1      8.333333
  Btheta      N-Acetyl-D-Glucosamine                 3            3                  1       0         0      1     33.333333
    Koxy             Sodium pyruvate                 7            7                  2       1         0      1     28.571429
   Smeli             Sodium pyruvate                16           14                  3       1         1      1     21.428571
   Phaeo             Sodium pyruvate                13           12                  5       1         3      1     41.666667
   Phaeo                 L-Glutamine                 1            1                  0       0         0      0      0.000000
   Phaeo      N-Acetyl-D-Glucosamine                 1            1                  0       0         0      0      0.000000
   Phaeo       D-Trehalose dihydrate                 1            1                  0       0         0      0      0.000000
   Phaeo Trisodium citrate dihydrate                 2            2                  0       0         0      0      0.000000
   Phaeo           Potassium acetate                 1            1                  0       0         0      0      0.000000
   Phaeo                   D-Mannose                 4            3                  0       0         0      0      0.000000
    Keio             Sodium pyruvate                 8            7                  4       2         0      2     57.142857
 HerbieS             Sodium pyruvate                 8            7                  3       2         0      1     42.857143
 HerbieS                 L-Glutamine                 1            1                  0       0         0      0      0.000000
 HerbieS                   D-Glucose                 2            2                  0       0         0      0      0.000000
 HerbieS                    D-Xylose                 2            2                  1       0         0      1     50.000000
 HerbieS                    L-Fucose                 5            5                  2       0         1      1     40.000000
 Cup4G11             Sodium pyruvate                 7            6                  2       0         0      2     33.333333
 Cup4G11          Sodium D,L-Lactate                 1            1                  0       0         0      0      0.000000
 Cup4G11             Sodium butyrate                 6            5                  0       0         0      0      0.000000
    Dino             Sodium pyruvate                22           20                  5       0         0      5     25.000000
    Dino                 L-Glutamine                 1            1                  0       0         0      0      0.000000
  Marino             Sodium pyruvate                13           12                  4       1         0      3     33.333333
     MR1             Sodium pyruvate                 7            6                  2       0         0      2     33.333333
     MR1          Sodium D,L-Lactate                 2            2                  1       0         0      1     50.000000
  azobra             Sodium pyruvate                21           21                  4       0         1      3     19.047619
     PV4             Sodium pyruvate                 6            5                  2       0         0      2     40.000000
     PV4          Sodium D,L-Lactate                 2            2                  1       0         0      1     50.000000
   Korea                 L-Glutamine                13           12                  2       0         0      2     16.666667
   Korea                  D-Fructose                 1            1                  0       0         0      0      0.000000
   Korea       D-Maltose monohydrate                 1            1                  0       0         0      0      0.000000
   Korea                    D-Xylose                 2            2                  1       0         0      1     50.000000
   Korea    D-Raffinose pentahydrate                 5            4                  0       0         0      0      0.000000
Dyella79             Sodium pyruvate                10            9                  4       2         0      2     44.444444
Dyella79                  D-Fructose                 1            1                  1       0         0      1    100.000000
Dyella79                   L-Proline                 1            1                  0       0         0      0      0.000000
Dyella79                 D-Galactose                 4            4                  0       0         0      0      0.000000
Dyella79              a-Cyclodextrin                 5            4                  0       0         0      0      0.000000

Mean % resolved: 21.5%
Cases with high-confidence candidates: 7

# Visualization
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: confidence distribution
conf_counts = candidates['confidence'].value_counts().reindex(['high', 'medium', 'low']).fillna(0)
conf_counts.plot(kind='bar', ax=axes[0], color=['#2ecc71', '#f39c12', '#e74c3c'])
axes[0].set_title('Candidate Gene Confidence Distribution')
axes[0].set_ylabel('Number of Candidates')
axes[0].set_xlabel('Confidence Level')
axes[0].tick_params(axis='x', rotation=0)

# Right: fitness class distribution
fit_counts = candidates['fitness_class'].value_counts()
fit_counts.plot(kind='bar', ax=axes[1], color='steelblue')
axes[1].set_title('Fitness Evidence Classification')
axes[1].set_ylabel('Number of Candidates')
axes[1].set_xlabel('Fitness Class')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(f'{FIG_DIR}/evidence_distribution.png', dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved: {FIG_DIR}/evidence_distribution.png')

# Heatmap of resolution by organism x carbon source
if len(case_df) > 3:
    pivot = case_df.pivot(index='orgId', columns='carbon_source', values='pct_resolved')
    fig, ax = plt.subplots(figsize=(max(12, len(pivot.columns)*0.8), max(6, len(pivot)*0.5)))
    sns.heatmap(pivot, annot=True, fmt='.0f', cmap='YlOrRd',
                linewidths=0.5, ax=ax, vmin=0, vmax=100,
                cbar_kws={'label': '% reactions with gene candidates'})
    ax.set_title('Annotation Gap Resolution')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'{FIG_DIR}/annotation_gap_resolution_heatmap.png', dpi=150, bbox_inches='tight')
    plt.show()
    print(f'Saved: {FIG_DIR}/annotation_gap_resolution_heatmap.png')

Saved: ../figures/evidence_distribution.png

Saved: ../figures/annotation_gap_resolution_heatmap.png

# Save full candidate table
save_cols = ['orgId', 'carbon_source', 'rxn_id_base', 'name', 'ec_number',
             'locusId', 'gene_desc', 'ec_source',
             'mean_fit', 'min_fit', 'max_abs_t', 'fitness_class',
             'conservation_class', 'n_core',
             'gapmind_pathway', 'gapmind_score', 'clade_pct_complete',
             'confidence', 'candidate_rank']

# Only include columns that exist
save_cols = [c for c in save_cols if c in candidates.columns]
candidates[save_cols].to_csv(
    f'{DATA_DIR}/annotation_gap_candidates.tsv', sep='\t', index=False
)
print(f'Saved annotation_gap_candidates.tsv: {len(candidates)} rows')

# Save case summary
case_df.to_csv(f'{DATA_DIR}/annotation_gap_case_summary.tsv', sep='\t', index=False)
print(f'Saved annotation_gap_case_summary.tsv: {len(case_df)} rows')

# Final summary
print()
print('=' * 60)
print('NB03 SUMMARY: Gene Candidate Identification')
print('=' * 60)
print(f'Gapfilled cases analyzed:        {len(gf_cases)}')
n_enz_unique = enzymatic[['orgId', 'rxn_id_base']].drop_duplicates().shape[0]
print(f'Enzymatic reactions investigated: {n_enz_unique}')
print(f'Unique EC numbers targeted:      {rxn_ec["ec_number"].dropna().nunique()}')
print(f'Gene candidates identified:      {len(candidates)}')
print(f'  High confidence:               {(candidates["confidence"] == "high").sum()}')
print(f'  Medium confidence:             {(candidates["confidence"] == "medium").sum()}')
print(f'  Low confidence:                {(candidates["confidence"] == "low").sum()}')
n_resolved = candidates[['orgId', 'rxn_id_base']].drop_duplicates().shape[0]
print(f'Reactions with candidates:       {n_resolved}')
print(f'Reactions without candidates:    {len(unresolved_keys)}')
ess_count = (candidates['fitness_class'] == 'essential').sum()
print(f'Essential gene candidates:       {ess_count}')
print()
print('Output files:')
print(f'  data/gapfill_reaction_details.tsv')
print(f'  data/annotation_gap_candidates.tsv')
print(f'  data/annotation_gap_case_summary.tsv')
print(f'  figures/evidence_distribution.png')
print(f'  figures/annotation_gap_resolution_heatmap.png')
print('=' * 60)

Saved annotation_gap_candidates.tsv: 107 rows
Saved annotation_gap_case_summary.tsv: 38 rows

============================================================
NB03 SUMMARY: Gene Candidate Identification
============================================================
Gapfilled cases analyzed:        38
Enzymatic reactions investigated: 201
Unique EC numbers targeted:      84
Gene candidates identified:      107
  High confidence:               12
  Medium confidence:             11
  Low confidence:                84
Reactions with candidates:       51
Reactions without candidates:    150
Essential gene candidates:       0

Output files:
  data/gapfill_reaction_details.tsv
  data/annotation_gap_candidates.tsv
  data/annotation_gap_case_summary.tsv
  figures/evidence_distribution.png
  figures/annotation_gap_resolution_heatmap.png
============================================================

03 Annotation Gap Candidates

NB03: Gene Candidate Identification for Annotation Gaps¶

2. Map EC Numbers to Fitness Browser Genes¶

3. Fitness Evidence for Gene Candidates¶

4. Pangenome Conservation Analysis¶

5. GapMind Cross-Validation¶

6. Assemble Evidence and Rank Candidates¶

7. Summary Statistics and Visualization¶

8. Save Outputs¶