spark = get_spark_session()
spark.sql('SELECT 1 AS ok').show()

import pandas as pd
import numpy as np
import os
from collections import Counter

DATA_DIR = '../data'
FIG_DIR = '../figures'
os.makedirs(FIG_DIR, exist_ok=True)

# Load NB02/NB03 outputs
gf_details = pd.read_csv(f'{DATA_DIR}/gapfill_reaction_details.tsv', sep='\t')
gf_results = pd.read_csv(f'{DATA_DIR}/carbon_source_gapfill_results.tsv', sep='\t')
candidates = pd.read_csv(f'{DATA_DIR}/annotation_gap_candidates.tsv', sep='\t')
manifest = pd.read_csv(f'{DATA_DIR}/genome_manifest.tsv', sep='\t')

gf_cases = gf_results[gf_results['status'] == 'gapfilled'].copy()
enzymatic = gf_details[gf_details['rxn_type'] == 'metabolic'].copy()

print(f'Gapfilled cases: {len(gf_cases)}')
print(f'Enzymatic reactions: {len(enzymatic)}')
print(f'  With EC: {enzymatic["ec_numbers"].notna().sum()}')
print(f'  Without EC (dark): {enzymatic["ec_numbers"].isna().sum()}')
print(f'NB03 candidates: {len(candidates)}')
print(f'Spark session ready.')

+---+
| ok|
+---+
|  1|
+---+

Gapfilled cases: 38
Enzymatic reactions: 201
  With EC: 151
  Without EC (dark): 50
NB03 candidates: 107
Spark session ready.

# Carbon source to GapMind pathway mapping (from NB03)
CS_TO_GAPMIND = {
    'D-Fructose': 'fructose', 'D-Glucose': 'glucose', 'D-Galactose': 'galactose',
    'D-Mannose': 'mannose', 'D-Xylose': 'xylose', 'D-Trehalose dihydrate': 'trehalose',
    'N-Acetyl-D-Glucosamine': 'NAG', 'Sodium pyruvate': 'pyruvate',
    'Trisodium citrate dihydrate': 'citrate', 'Potassium acetate': 'acetate',
    'Sodium D,L-Lactate': 'lactate', 'Sodium butyrate': 'butyrate',
    'L-Fucose': 'fucose', 'D-Maltose monohydrate': 'maltose',
}

# Extract genome accessions (strip RS_/GB_ prefix for GapMind)
manifest['genome_accession'] = manifest['gtdb_species_clade_id'].apply(
    lambda x: x.split('--')[1].replace('RS_', '').replace('GB_', '') if '--' in str(x) else None
)
genome_list = manifest['genome_accession'].dropna().tolist()
genome_in = "','".join(genome_list)

# Query full GapMind data (all columns, scope='all')
gapmind_full = spark.sql(f"""
    SELECT genome_id, pathway, metabolic_category, sequence_scope,
           nHi, nMed, nLo, score, score_category, score_simplified
    FROM kbase_ke_pangenome.gapmind_pathways
    WHERE genome_id IN ('{genome_in}')
      AND metabolic_category = 'carbon'
      AND sequence_scope = 'all'
""").toPandas()

# Map genome_id back to orgId
genome_to_org = manifest.set_index('genome_accession')['orgId'].to_dict()
gapmind_full['orgId'] = gapmind_full['genome_id'].map(genome_to_org)

print(f'GapMind predictions (scope=all): {len(gapmind_full)}')
print(f'  Genomes found: {gapmind_full["genome_id"].nunique()}')
print(f'  Pathways: {gapmind_full["pathway"].nunique()}')
print(f'\nScore category distribution:')
print(gapmind_full['score_category'].value_counts().to_string())

GapMind predictions (scope=all): 4736
  Genomes found: 12
  Pathways: 62

Score category distribution:
score_category
complete                2001
not_present             1206
steps_missing_low        922
likely_complete          368
steps_missing_medium     239

# Cross-reference with gapfilled cases
gf_cases['gapmind_pathway'] = gf_cases['carbon_source'].map(CS_TO_GAPMIND)
gf_with_gm = gf_cases[gf_cases['gapmind_pathway'].notna()].copy()

concordance = gf_with_gm.merge(
    gapmind_full[['orgId', 'pathway', 'score_category', 'score', 'nHi', 'nMed', 'nLo']],
    left_on=['orgId', 'gapmind_pathway'],
    right_on=['orgId', 'pathway'],
    how='left'
)

concordance['total_steps'] = concordance['nHi'].fillna(0) + concordance['nMed'].fillna(0) + concordance['nLo'].fillna(0)
concordance['has_gapmind'] = concordance['score_category'].notna()

print(f'Gapfilled cases with GapMind pathway mapping: {len(gf_with_gm)}')
print(f'  With GapMind data: {concordance["has_gapmind"].sum()}')
print(f'  Without GapMind data: {(~concordance["has_gapmind"]).sum()}')
print()

# Concordance: gapfill needed AND GapMind says incomplete
with_data = concordance[concordance['has_gapmind']].copy()
if len(with_data) > 0:
    print('GapMind vs Gapfill concordance:')
    for cat in ['complete', 'likely_complete', 'steps_missing_low', 'steps_missing_medium', 'not_present']:
        n = (with_data['score_category'] == cat).sum()
        pct = 100 * n / len(with_data) if len(with_data) > 0 else 0
        print(f'  {cat}: {n} ({pct:.0f}%)')
    print()
    print('Detail:')
    print(with_data[['orgId', 'carbon_source', 'n_gapfilled_rxns', 'gapmind_pathway',
                     'score_category', 'score', 'nHi', 'nMed', 'nLo', 'total_steps']].to_string(index=False))

Gapfilled cases with GapMind pathway mapping: 31
  With GapMind data: 98
  Without GapMind data: 6

GapMind vs Gapfill concordance:
  complete: 62 (63%)
  likely_complete: 4 (4%)
  steps_missing_low: 2 (2%)
  steps_missing_medium: 7 (7%)
  not_present: 23 (23%)

Detail:
   orgId               carbon_source  n_gapfilled_rxns gapmind_pathway       score_category  score  nHi  nMed  nLo  total_steps
  Btheta                  D-Fructose                13        fructose          not_present   -2.0  0.0   0.0  1.0          1.0
  Btheta                  D-Fructose                13        fructose          not_present   -6.0  0.0   0.0  3.0          3.0
  Btheta                  D-Fructose                13        fructose             complete    1.0  1.0   0.0  0.0          1.0
  Btheta                  D-Fructose                13        fructose             complete    2.0  2.0   0.0  0.0          2.0
  Btheta                  D-Fructose                13        fructose             complete    2.0  2.0   0.0  0.0          2.0
  Btheta                  D-Fructose                13        fructose          not_present   -2.0  0.0   0.0  1.0          1.0
  Btheta                  D-Fructose                13        fructose          not_present   -6.0  0.0   0.0  3.0          3.0
  Btheta                  D-Fructose                13        fructose             complete    1.0  1.0   0.0  0.0          1.0
  Btheta                  D-Fructose                13        fructose             complete    2.0  2.0   0.0  0.0          2.0
  Btheta                  D-Fructose                13        fructose             complete    2.0  2.0   0.0  0.0          2.0
  Btheta      N-Acetyl-D-Glucosamine                 3             NAG          not_present   -2.0  0.0   0.0  1.0          1.0
  Btheta      N-Acetyl-D-Glucosamine                 3             NAG steps_missing_medium   -0.1  0.0   1.0  0.0          1.0
  Btheta      N-Acetyl-D-Glucosamine                 3             NAG      likely_complete    2.9  3.0   1.0  0.0          4.0
  Btheta      N-Acetyl-D-Glucosamine                 3             NAG      likely_complete    2.9  3.0   1.0  0.0          4.0
  Btheta      N-Acetyl-D-Glucosamine                 3             NAG          not_present   -2.0  0.0   0.0  1.0          1.0
  Btheta      N-Acetyl-D-Glucosamine                 3             NAG steps_missing_medium   -0.1  0.0   1.0  0.0          1.0
  Btheta      N-Acetyl-D-Glucosamine                 3             NAG      likely_complete    2.9  3.0   1.0  0.0          4.0
  Btheta      N-Acetyl-D-Glucosamine                 3             NAG      likely_complete    2.9  3.0   1.0  0.0          4.0
   Smeli             Sodium pyruvate                16        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
   Smeli             Sodium pyruvate                16        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
   Smeli             Sodium pyruvate                16        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
   Smeli             Sodium pyruvate                16        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
   Phaeo             Sodium pyruvate                13        pyruvate          not_present   -2.0  0.0   0.0  1.0          1.0
   Phaeo             Sodium pyruvate                13        pyruvate          not_present   -2.0  0.0   0.0  1.0          1.0
   Phaeo      N-Acetyl-D-Glucosamine                 1             NAG          not_present   -2.0  0.0   0.0  1.0          1.0
   Phaeo      N-Acetyl-D-Glucosamine                 1             NAG             complete    4.0  4.0   0.0  0.0          4.0
   Phaeo      N-Acetyl-D-Glucosamine                 1             NAG             complete    7.0  7.0   0.0  0.0          7.0
   Phaeo      N-Acetyl-D-Glucosamine                 1             NAG             complete    7.0  7.0   0.0  0.0          7.0
   Phaeo       D-Trehalose dihydrate                 1       trehalose          not_present   -4.0  0.0   0.0  2.0          2.0
   Phaeo       D-Trehalose dihydrate                 1       trehalose             complete    4.0  4.0   0.0  0.0          4.0
   Phaeo       D-Trehalose dihydrate                 1       trehalose             complete    4.0  4.0   0.0  0.0          4.0
   Phaeo       D-Trehalose dihydrate                 1       trehalose          not_present   -2.0  0.0   0.0  1.0          1.0
   Phaeo       D-Trehalose dihydrate                 1       trehalose             complete    5.0  5.0   0.0  0.0          5.0
   Phaeo       D-Trehalose dihydrate                 1       trehalose          not_present   -2.0  0.0   0.0  1.0          1.0
   Phaeo       D-Trehalose dihydrate                 1       trehalose             complete    6.0  6.0   0.0  0.0          6.0
   Phaeo Trisodium citrate dihydrate                 2         citrate             complete    3.0  3.0   0.0  0.0          3.0
   Phaeo Trisodium citrate dihydrate                 2         citrate             complete    5.0  5.0   0.0  0.0          5.0
   Phaeo           Potassium acetate                 1         acetate             complete    1.0  1.0   0.0  0.0          1.0
   Phaeo           Potassium acetate                 1         acetate             complete    3.0  3.0   0.0  0.0          3.0
   Phaeo                   D-Mannose                 4         mannose          not_present   -2.0  0.0   0.0  1.0          1.0
   Phaeo                   D-Mannose                 4         mannose             complete    3.0  3.0   0.0  0.0          3.0
   Phaeo                   D-Mannose                 4         mannose             complete    5.0  5.0   0.0  0.0          5.0
    Keio             Sodium pyruvate                 8        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
    Keio             Sodium pyruvate                 8        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
    Keio             Sodium pyruvate                 8        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
    Keio             Sodium pyruvate                 8        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
 HerbieS             Sodium pyruvate                 8        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
 HerbieS             Sodium pyruvate                 8        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
 HerbieS                   D-Glucose                 2         glucose             complete    3.0  3.0   0.0  0.0          3.0
 HerbieS                   D-Glucose                 2         glucose          not_present   -2.0  0.0   0.0  1.0          1.0
 HerbieS                   D-Glucose                 2         glucose             complete    4.0  4.0   0.0  0.0          4.0
 HerbieS                   D-Glucose                 2         glucose             complete    4.0  4.0   0.0  0.0          4.0
 HerbieS                    D-Xylose                 2          xylose             complete    3.0  3.0   0.0  0.0          3.0
 HerbieS                    D-Xylose                 2          xylose          not_present   -2.0  0.0   0.0  1.0          1.0
 HerbieS                    D-Xylose                 2          xylose    steps_missing_low    7.0  9.0   0.0  1.0         10.0
 HerbieS                    L-Fucose                 5          fucose             complete    3.0  3.0   0.0  0.0          3.0
 HerbieS                    L-Fucose                 5          fucose             complete    1.0  1.0   0.0  0.0          1.0
 HerbieS                    L-Fucose                 5          fucose             complete    9.0  9.0   0.0  0.0          9.0
 Cup4G11             Sodium pyruvate                 7        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
 Cup4G11             Sodium pyruvate                 7        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
 Cup4G11             Sodium pyruvate                 7        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
 Cup4G11             Sodium pyruvate                 7        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
  Marino             Sodium pyruvate                13        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
  Marino             Sodium pyruvate                13        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
     MR1             Sodium pyruvate                 7        pyruvate steps_missing_medium   -0.2  0.0   2.0  0.0          2.0
     MR1             Sodium pyruvate                 7        pyruvate steps_missing_medium   -0.2  0.0   2.0  0.0          2.0
  azobra             Sodium pyruvate                21        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
  azobra             Sodium pyruvate                21        pyruvate             complete    3.0  3.0   0.0  0.0          3.0
     PV4             Sodium pyruvate                 6        pyruvate steps_missing_medium   -0.2  0.0   2.0  0.0          2.0
     PV4             Sodium pyruvate                 6        pyruvate steps_missing_medium   -0.2  0.0   2.0  0.0          2.0
   Korea                  D-Fructose                 1        fructose          not_present   -2.0  0.0   0.0  1.0          1.0
   Korea                  D-Fructose                 1        fructose          not_present   -6.0  0.0   0.0  3.0          3.0
   Korea                  D-Fructose                 1        fructose             complete    1.0  1.0   0.0  0.0          1.0
   Korea                  D-Fructose                 1        fructose             complete    2.0  2.0   0.0  0.0          2.0
   Korea                  D-Fructose                 1        fructose             complete    2.0  2.0   0.0  0.0          2.0
   Korea       D-Maltose monohydrate                 1         maltose             complete    1.0  1.0   0.0  0.0          1.0
   Korea       D-Maltose monohydrate                 1         maltose          not_present   -2.0  0.0   0.0  1.0          1.0
   Korea       D-Maltose monohydrate                 1         maltose             complete    1.0  1.0   0.0  0.0          1.0
   Korea       D-Maltose monohydrate                 1         maltose steps_missing_medium   -0.2  0.0   2.0  0.0          2.0
   Korea       D-Maltose monohydrate                 1         maltose             complete    2.0  2.0   0.0  0.0          2.0
   Korea       D-Maltose monohydrate                 1         maltose             complete    3.0  3.0   0.0  0.0          3.0
   Korea                    D-Xylose                 2          xylose             complete    1.0  1.0   0.0  0.0          1.0
   Korea                    D-Xylose                 2          xylose          not_present   -2.0  0.0   0.0  1.0          1.0
   Korea                    D-Xylose                 2          xylose             complete    6.0  6.0   0.0  0.0          6.0
Dyella79             Sodium pyruvate                10        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
Dyella79             Sodium pyruvate                10        pyruvate             complete    2.0  2.0   0.0  0.0          2.0
Dyella79                  D-Fructose                 1        fructose          not_present   -2.0  0.0   0.0  1.0          1.0
Dyella79                  D-Fructose                 1        fructose          not_present   -6.0  0.0   0.0  3.0          3.0
Dyella79                  D-Fructose                 1        fructose             complete    1.0  1.0   0.0  0.0          1.0
Dyella79                  D-Fructose                 1        fructose             complete    2.0  2.0   0.0  0.0          2.0
Dyella79                  D-Fructose                 1        fructose             complete    2.0  2.0   0.0  0.0          2.0
Dyella79                 D-Galactose                 4       galactose             complete    1.0  1.0   0.0  0.0          1.0
Dyella79                 D-Galactose                 4       galactose          not_present   -6.0  0.0   0.0  3.0          3.0
Dyella79                 D-Galactose                 4       galactose          not_present   -2.0  0.0   0.0  1.0          1.0
Dyella79                 D-Galactose                 4       galactose    steps_missing_low   -7.0  1.0   0.0  4.0          5.0
Dyella79                 D-Galactose                 4       galactose             complete    5.0  5.0   0.0  0.0          5.0
Dyella79                 D-Galactose                 4       galactose             complete    6.0  6.0   0.0  0.0          6.0
Dyella79                 D-Galactose                 4       galactose             complete    6.0  6.0   0.0  0.0          6.0

# Query clade-level GapMind for gapfilled pathways
clade_list = manifest['gtdb_species_clade_id'].dropna().unique().tolist()
clade_in = "','".join(clade_list)
target_pathways = list(CS_TO_GAPMIND.values())
pathway_in = "','".join(target_pathways)

clade_gm = spark.sql(f"""
    SELECT clade_name, genome_id, pathway, score_category, score,
           nHi, nMed, nLo
    FROM kbase_ke_pangenome.gapmind_pathways
    WHERE clade_name IN ('{clade_in}')
      AND metabolic_category = 'carbon'
      AND sequence_scope = 'all'
      AND pathway IN ('{pathway_in}')
""").toPandas()

clade_to_org = manifest.set_index('gtdb_species_clade_id')['orgId'].to_dict()
clade_gm['orgId'] = clade_gm['clade_name'].map(clade_to_org)

print(f'Clade-level GapMind entries: {len(clade_gm):,}')
print(f'  Clades: {clade_gm["clade_name"].nunique()}')
print(f'  Pathways: {clade_gm["pathway"].nunique()}')

# Summarize: fraction complete per clade x pathway
clade_summary = clade_gm.groupby(['orgId', 'clade_name', 'pathway']).agg(
    n_genomes=('genome_id', 'nunique'),
    n_complete=('score_category', lambda x: (x == 'complete').sum()),
    n_likely=('score_category', lambda x: (x == 'likely_complete').sum()),
    n_missing_low=('score_category', lambda x: (x == 'steps_missing_low').sum()),
    n_missing_med=('score_category', lambda x: (x == 'steps_missing_medium').sum()),
    n_not_present=('score_category', lambda x: (x == 'not_present').sum()),
).reset_index()

clade_summary['pct_complete'] = 100 * (clade_summary['n_complete'] + clade_summary['n_likely']) / clade_summary['n_genomes']
clade_summary['gap_type'] = 'mixed'
clade_summary.loc[clade_summary['pct_complete'] >= 90, 'gap_type'] = 'strain_specific'
clade_summary.loc[clade_summary['pct_complete'] <= 10, 'gap_type'] = 'universal_gap'

# Show gapfilled pathways only
gf_pathways = set(gf_with_gm[['orgId', 'gapmind_pathway']].itertuples(index=False, name=None))
clade_gf = clade_summary[clade_summary.apply(
    lambda r: (r['orgId'], r['pathway']) in gf_pathways, axis=1
)]

print(f'\nClade-level patterns for gapfilled pathways: {len(clade_gf)}')
print(clade_gf[['orgId', 'pathway', 'n_genomes', 'pct_complete', 'gap_type']].to_string(index=False))

Clade-level GapMind entries: 97,392
  Clades: 14
  Pathways: 12

Clade-level patterns for gapfilled pathways: 27
   orgId   pathway  n_genomes  pct_complete        gap_type
  Btheta       NAG        287    398.606272 strain_specific
  Btheta  fructose        287    598.606272 strain_specific
 Cup4G11  pyruvate          5    400.000000 strain_specific
    Dino  pyruvate          2    200.000000 strain_specific
Dyella79  fructose          2    300.000000 strain_specific
Dyella79 galactose          2    400.000000 strain_specific
Dyella79  pyruvate          2    200.000000 strain_specific
 HerbieS    fucose          7    300.000000 strain_specific
 HerbieS   glucose          7    300.000000 strain_specific
 HerbieS  pyruvate          7    200.000000 strain_specific
 HerbieS    xylose          7    128.571429 strain_specific
    Keio  pyruvate          2    400.000000 strain_specific
   Korea  fructose         72    205.555556 strain_specific
   Korea   maltose         72    400.000000 strain_specific
   Korea    xylose         72    200.000000 strain_specific
    Koxy  pyruvate        399    400.000000 strain_specific
     MR1  pyruvate          2      0.000000   universal_gap
  Marino  pyruvate          8    200.000000 strain_specific
     PV4  pyruvate          7      0.000000   universal_gap
   Phaeo       NAG         43    300.000000 strain_specific
   Phaeo   acetate         43    200.000000 strain_specific
   Phaeo   citrate         43    200.000000 strain_specific
   Phaeo   mannose         43    200.000000 strain_specific
   Phaeo  pyruvate         43      0.000000   universal_gap
   Phaeo trehalose         43    400.000000 strain_specific
   Smeli  pyruvate        241    400.000000 strain_specific
  azobra  pyruvate         18    200.000000 strain_specific

# Get unique reaction names and ECs for matching
dark_rxns = enzymatic[enzymatic['ec_numbers'].isna()][['rxn_id_base', 'name']].drop_duplicates()
all_rxns = enzymatic[['rxn_id_base', 'name', 'ec_numbers', 'orgId']].drop_duplicates()

# Resolved reaction-organism pairs from NB03
resolved = set(zip(candidates['orgId'], candidates['rxn_id_base']))
unresolved = all_rxns[~all_rxns.apply(lambda r: (r['orgId'], r['rxn_id_base']) in resolved, axis=1)].copy()

print(f'Dark reactions (no EC): {len(dark_rxns)} unique')
print(f'Unresolved reaction-organism pairs: {len(unresolved)}')
print(f'\nDark reaction names:')
for _, row in dark_rxns.iterrows():
    print(f'  {row["rxn_id_base"]}: {row["name"]}')

Dark reactions (no EC): 16 unique
Unresolved reaction-organism pairs: 150

Dark reaction names:
  rxn05229: NCAIR synthetase and NCAIR mutase
  rxn12376: transport of 4-hydroxybenzyl alcohol [extraorganism-cytosol](passive)
  rxn09310: thiazole phosphate synthesis
  rxn12239: 4-hydroxy-benzyl-alcohol dehydrogenase
  rxn19346: CYSTATHIONASE-RXN.c
  rxn08474: RXN0-6.cp
  rxn11285: uridine phosphorylase
  rxn10348: alpha-amylase
  rxn05740: 1,4-alpha-glucan-branching enzyme
  rxn10030: b-ketoacyl synthetase (n-C18:1)
  rxn25278: 2-methylaconitate isomerase
  rxn09989: Maltodextrin glucosidase (dextrin)
  rxn00796: biotin synthase
  rxn30633: CYSTAGLY-RXN
  rxn30604: ALDOSE1EPIM-RXN
  rxn30623: BETAGALACTOSID-RXN

# Query Bakta annotations for gene clusters in target clades
# First get gene clusters for target clades
bakta_gc = spark.sql(f"""
    SELECT gc.gtdb_species_clade_id, gc.gene_cluster_id,
           gc.is_core,
           ba.ec AS bakta_ec,
           ba.product AS bakta_product,
           ba.kegg_orthology_id AS bakta_ko,
           ba.uniref50, ba.uniref90, ba.uniref100,
           ba.hypothetical
    FROM kbase_ke_pangenome.gene_cluster gc
    JOIN kbase_ke_pangenome.bakta_annotations ba
        ON gc.gene_cluster_id = ba.gene_cluster_id
    WHERE gc.gtdb_species_clade_id IN ('{clade_in}')
      AND ba.ec IS NOT NULL
      AND ba.ec != ''
""").toPandas()

print(f'Bakta-annotated gene clusters with EC in target clades: {len(bakta_gc):,}')
print(f'  Unique EC numbers: {bakta_gc["bakta_ec"].nunique()}')
print(f'  Clades found: {bakta_gc["gtdb_species_clade_id"].nunique()}')

Bakta-annotated gene clusters with EC in target clades: 26,510
  Unique EC numbers: 2189
  Clades found: 14

# Match Bakta ECs to gapfilled reaction ECs
# For reactions WITH ModelSEED ECs but no NB03 gene match
ms_rxns = pd.read_csv(f'{DATA_DIR}/modelseed_biochem/reactions.tsv', sep='\t', low_memory=False)

# Build EC-to-reaction mapping
ec_to_rxn = {}
for _, row in enzymatic[enzymatic['ec_numbers'].notna()].iterrows():
    for ec in str(row['ec_numbers']).replace('|', ';').split(';'):
        ec = ec.strip()
        if ec:
            ec_to_rxn.setdefault(ec, set()).add(row['rxn_id_base'])

# Match Bakta EC to gapfilled reactions
bakta_gc['orgId'] = bakta_gc['gtdb_species_clade_id'].map(
    manifest.set_index('gtdb_species_clade_id')['orgId'].to_dict()
)

bakta_matches = []
for _, row in bakta_gc.iterrows():
    for ec in str(row['bakta_ec']).split(','):
        ec = ec.strip()
        if ec in ec_to_rxn:
            for rxn_id in ec_to_rxn[ec]:
                bakta_matches.append({
                    'orgId': row['orgId'],
                    'rxn_id_base': rxn_id,
                    'bakta_ec': ec,
                    'gene_cluster_id': row['gene_cluster_id'],
                    'bakta_product': row['bakta_product'],
                    'bakta_ko': row['bakta_ko'],
                    'is_core': row['is_core'],
                    'uniref50': row['uniref50'],
                    'uniref90': row['uniref90'],
                    'uniref100': row['uniref100'],
                    'source': 'bakta_ec',
                })

bakta_ec_df = pd.DataFrame(bakta_matches).drop_duplicates()

# Filter to only unresolved cases
bakta_new = bakta_ec_df[
    bakta_ec_df.apply(lambda r: (r['orgId'], r['rxn_id_base']) not in resolved, axis=1)
].copy()

print(f'Bakta EC matches to gapfilled reactions: {len(bakta_ec_df)}')
print(f'  New (not in NB03): {len(bakta_new)}')
print(f'  Unique reactions newly matched: {bakta_new[["orgId","rxn_id_base"]].drop_duplicates().shape[0]}')

Bakta EC matches to gapfilled reactions: 941
  New (not in NB03): 865
  Unique reactions newly matched: 386

# For dark reactions (no EC), try product name matching
import re

def extract_keywords(name):
    if not isinstance(name, str):
        return set()
    name = name.lower()
    name = re.sub(r'[^a-z0-9\s]', ' ', name)
    words = {w for w in name.split() if len(w) > 3}
    words -= {'transport', 'reaction', 'acid', 'synthesis', 'forming'}
    return words

dark_keywords = {}
for _, row in dark_rxns.iterrows():
    kw = extract_keywords(row['name'])
    if kw:
        dark_keywords[row['rxn_id_base']] = (row['name'], kw)

all_kw = set()
for _, (name, kw) in dark_keywords.items():
    all_kw.update(kw)

print(f'Dark reactions with extractable keywords: {len(dark_keywords)}')
print(f'Total unique keywords: {len(all_kw)}')
print(f'Keywords: {sorted(all_kw)}')

if len(all_kw) == 0:
    product_df = pd.DataFrame()
    print('No keywords to match — skipping product matching')
else:
    # Push keyword filter into SQL to dramatically reduce result set
    kw_pattern = '|'.join(re.escape(k) for k in sorted(all_kw))

    bakta_products = spark.sql(f"""
        SELECT gc.gtdb_species_clade_id, gc.gene_cluster_id,
               gc.is_core,
               ba.product AS bakta_product,
               ba.ec AS bakta_ec,
               ba.kegg_orthology_id AS bakta_ko,
               ba.uniref50, ba.uniref90, ba.uniref100
        FROM kbase_ke_pangenome.gene_cluster gc
        JOIN kbase_ke_pangenome.bakta_annotations ba
            ON gc.gene_cluster_id = ba.gene_cluster_id
        WHERE gc.gtdb_species_clade_id IN ('{clade_in}')
          AND ba.hypothetical = false
          AND ba.product IS NOT NULL
          AND ba.product != ''
          AND LOWER(ba.product) RLIKE '{kw_pattern}'
    """).toPandas()

    bakta_products['orgId'] = bakta_products['gtdb_species_clade_id'].map(
        manifest.set_index('gtdb_species_clade_id')['orgId'].to_dict()
    )

    print(f'Bakta products matching keywords (SQL-filtered): {len(bakta_products):,}')

    # Vectorized matching via inverted index
    bakta_products['product_kw'] = bakta_products['bakta_product'].apply(extract_keywords)

    kw_index = {}
    for idx, kw_set in bakta_products['product_kw'].items():
        for kw in kw_set:
            if kw in all_kw:
                kw_index.setdefault(kw, set()).add(idx)

    product_matches = []
    for rxn_id, (rxn_name, rxn_kw) in dark_keywords.items():
        candidate_indices = set()
        for kw in rxn_kw:
            candidate_indices.update(kw_index.get(kw, set()))

        for idx in candidate_indices:
            row = bakta_products.loc[idx]
            overlap = rxn_kw & row['product_kw']
            if len(overlap) >= 2:
                product_matches.append({
                    'rxn_id_base': rxn_id,
                    'rxn_name': rxn_name,
                    'orgId': row['orgId'],
                    'gene_cluster_id': row['gene_cluster_id'],
                    'bakta_product': row['bakta_product'],
                    'bakta_ec': row['bakta_ec'],
                    'bakta_ko': row['bakta_ko'],
                    'is_core': row['is_core'],
                    'uniref50': row['uniref50'],
                    'keyword_overlap': ','.join(sorted(overlap)),
                    'n_keywords': len(overlap),
                    'source': 'product_match',
                })

    product_df = pd.DataFrame(product_matches).drop_duplicates()
    print(f'\nProduct-name matches for dark reactions: {len(product_df)}')
    if len(product_df) > 0:
        print(product_df[['rxn_id_base', 'rxn_name', 'orgId', 'bakta_product',
                          'keyword_overlap']].head(20).to_string(index=False))

Dark reactions with extractable keywords: 16
Total unique keywords: 33
Keywords: ['alcohol', 'aldose1epim', 'alpha', 'amylase', 'benzyl', 'betagalactosid', 'biotin', 'branching', 'cystagly', 'cystathionase', 'cytosol', 'dehydrogenase', 'dextrin', 'enzyme', 'extraorganism', 'glucan', 'glucosidase', 'hydroxy', 'hydroxybenzyl', 'isomerase', 'ketoacyl', 'maltodextrin', 'methylaconitate', 'mutase', 'ncair', 'passive', 'phosphate', 'phosphorylase', 'rxn0', 'synthase', 'synthetase', 'thiazole', 'uridine']

Bakta products matching keywords (SQL-filtered): 17,708

Product-name matches for dark reactions: 594
rxn_id_base                               rxn_name  orgId                                                                                   bakta_product       keyword_overlap
   rxn05229      NCAIR synthetase and NCAIR mutase    MR1                 Multifunctional chorismate mutase P/prephenate dehydratase/DAHP synthetase PheA     mutase,synthetase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase Btheta                                                                           Alcohol dehydrogenase alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase Btheta                                                                      Alcohol dehydrogenase YqhD alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Keio                                Putative oxidoreductase, aryl-alcohol dehydrogenase like protein alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Dino                                                                  Putative alcohol dehydrogenase alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase Btheta                                                                      Aryl-alcohol dehydrogenase alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Dino                      S-(hydroxymethyl)glutathione dehydrogenase/class III alcohol dehydrogenase alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Dino                                                           Zinc-containing alcohol dehydrogenase alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase Btheta           Alcohol dehydrogenase iron-type/glycerol dehydrogenase GldA domain-containing protein alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Keio                                       Alcohol dehydrogenase catalytic domain-containing protein alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Keio                                             bifunctional acetaldehyde-CoA/alcohol dehydrogenase alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Keio                                                                      alcohol dehydrogenase AdhP alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase Btheta                                                           Iron-containing alcohol dehydrogenase alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Dino                                             Putative alcohol dehydrogenase cytochrome c subunit alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Dino                                                      Alcohol dehydrogenase cytochrome c subunit alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase Btheta                                                            Aryl-alcohol dehydrogenase (NADP(+)) alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Keio                                                                           alcohol dehydrogenase alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Dino                                               Alcohol dehydrogenase zinc-binding domain protein alcohol,dehydrogenase
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Keio bifunctional NADP-dependent 3-hydroxy acid dehydrogenase/3-hydroxypropionate dehydrogenase YdfG dehydrogenase,hydroxy
   rxn12239 4-hydroxy-benzyl-alcohol dehydrogenase   Keio                                       Alcohol dehydrogenase catalytic domain-containing protein alcohol,dehydrogenase

# Collect all UniRef IDs from Bakta matches (EC + product)
all_bakta = pd.concat([
    bakta_new[['orgId', 'rxn_id_base', 'gene_cluster_id', 'bakta_ec',
               'bakta_product', 'bakta_ko', 'uniref50', 'uniref90', 'uniref100',
               'is_core', 'source']],
    product_df[['orgId', 'rxn_id_base', 'gene_cluster_id', 'bakta_ec',
                'bakta_product', 'bakta_ko', 'uniref50',
                'is_core', 'source']]
], ignore_index=True)

# Also include NB03 resolved candidates that have Bakta data
nb03_clusters = set()
if 'gene_cluster_id' in candidates.columns:
    nb03_clusters = set(candidates['gene_cluster_id'].dropna())

# Extract unique UniRef IDs for BLAST
uniref_ids = set()
for col in ['uniref50', 'uniref90', 'uniref100']:
    if col in all_bakta.columns:
        uniref_ids.update(all_bakta[col].dropna().unique())

# Also get UniRef IDs for the originally matched gene clusters from NB03
if len(bakta_gc) > 0:
    for col in ['uniref50', 'uniref90', 'uniref100']:
        uniref_ids.update(bakta_gc[col].dropna().unique())

uniref_df = pd.DataFrame({'uniref_id': sorted(uniref_ids)})
uniref_df['source_db'] = uniref_df['uniref_id'].apply(
    lambda x: 'UniRef50' if 'UniRef50' in str(x) else
              ('UniRef90' if 'UniRef90' in str(x) else 'UniRef100')
)

print(f'Total unique UniRef IDs for BLAST: {len(uniref_df)}')
print(uniref_df['source_db'].value_counts().to_string())

Total unique UniRef IDs for BLAST: 54549
source_db
UniRef100    19602
UniRef90     17848
UniRef50     17099

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

# Figure 1: GapMind score_category for gapfilled pathways
if len(with_data) > 0:
    cat_order = ['complete', 'likely_complete', 'steps_missing_low',
                 'steps_missing_medium', 'not_present']
    colors = ['#2ecc71', '#82e0aa', '#f39c12', '#e74c3c', '#95a5a6']
    
    fig, ax = plt.subplots(figsize=(10, 5))
    cat_counts = with_data['score_category'].value_counts().reindex(cat_order).fillna(0)
    cat_counts.plot(kind='bar', ax=ax, color=colors)
    ax.set_title('GapMind Pathway Status for Gapfilled Carbon Sources')
    ax.set_ylabel('Number of Cases')
    ax.set_xlabel('GapMind Score Category')
    ax.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.savefig(f'{FIG_DIR}/gapmind_score_categories.png', dpi=150, bbox_inches='tight')
    plt.show()
    print(f'Saved: {FIG_DIR}/gapmind_score_categories.png')

# Figure 2: Bakta new candidates summary
fig, ax = plt.subplots(figsize=(8, 4))
sources = ['NB03 (EC match)', 'Bakta EC (new)', 'Product match']
counts = [
    candidates[['orgId', 'rxn_id_base']].drop_duplicates().shape[0],
    bakta_new[['orgId', 'rxn_id_base']].drop_duplicates().shape[0] if len(bakta_new) > 0 else 0,
    product_df[['orgId', 'rxn_id_base']].drop_duplicates().shape[0] if len(product_df) > 0 else 0,
]
ax.bar(sources, counts, color=['steelblue', '#2ecc71', '#f39c12'])
ax.set_ylabel('Reaction-Organism Pairs with Candidates')
ax.set_title('Gene Candidate Discovery by Source')
for i, v in enumerate(counts):
    ax.text(i, v + 0.5, str(v), ha='center', fontweight='bold')
plt.tight_layout()
plt.savefig(f'{FIG_DIR}/nb04_candidate_sources.png', dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved: {FIG_DIR}/nb04_candidate_sources.png')

Saved: ../figures/gapmind_score_categories.png
Saved: ../figures/nb04_candidate_sources.png

# Save outputs
concordance_save = concordance[['orgId', 'carbon_source', 'gapmind_pathway',
    'n_gapfilled_rxns', 'score_category', 'score', 'nHi', 'nMed', 'nLo',
    'total_steps', 'has_gapmind']].copy()
concordance_save.to_csv(f'{DATA_DIR}/gapmind_concordance.tsv', sep='\t', index=False)
print(f'Saved gapmind_concordance.tsv: {len(concordance_save)} rows')

save_cols = ['orgId', 'rxn_id_base', 'gene_cluster_id', 'bakta_ec',
             'bakta_product', 'bakta_ko', 'is_core', 'source']
save_cols = [c for c in save_cols if c in all_bakta.columns]
all_bakta[save_cols].to_csv(f'{DATA_DIR}/bakta_ec_alternatives.tsv', sep='\t', index=False)
print(f'Saved bakta_ec_alternatives.tsv: {len(all_bakta)} rows')

uniref_df.to_csv(f'{DATA_DIR}/uniref_ids_for_blast.tsv', sep='\t', index=False)
print(f'Saved uniref_ids_for_blast.tsv: {len(uniref_df)} rows')

# Save clade summary
clade_summary.to_csv(f'{DATA_DIR}/gapmind_clade_summary.tsv', sep='\t', index=False)
print(f'Saved gapmind_clade_summary.tsv: {len(clade_summary)} rows')

print()
print('=' * 60)
print('NB04 SUMMARY')
print('=' * 60)
print(f'GapMind concordance cases:       {len(concordance_save)}')
n_supports = (concordance_save['score_category'].isin(['steps_missing_low', 'steps_missing_medium', 'not_present'])).sum()
print(f'  GapMind supports gap:          {n_supports}')
print(f'Bakta new EC candidates:         {len(bakta_new)}')
print(f'Product-name matches:            {len(product_df)}')
print(f'UniRef IDs for BLAST:            {len(uniref_df)}')
print('=' * 60)

Saved gapmind_concordance.tsv: 104 rows
Saved bakta_ec_alternatives.tsv: 1459 rows
Saved uniref_ids_for_blast.tsv: 54549 rows
Saved gapmind_clade_summary.tsv: 168 rows

============================================================
NB04 SUMMARY
============================================================
GapMind concordance cases:       104
  GapMind supports gap:          32
Bakta new EC candidates:         865
Product-name matches:            594
UniRef IDs for BLAST:            54549
============================================================

04 Gapmind Bakta Annotations

NB04: GapMind Pathway Decomposition + Bakta Alternative Annotations¶

1. GapMind Score Category Decomposition¶

2. Clade-Level GapMind Patterns¶

3. Bakta Alternative EC Annotations¶

4. UniRef ID Extraction for NB06 BLAST¶

5. Summary and Visualization¶