import pandas as pd, numpy as np, json, os, math
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from dotenv import load_dotenv

# Relative locations of the input/output tables and of the rendered figures.
DATA = Path('../data')
FIG = Path('../figures')

# Console display limits for the wide tables printed throughout the notebook.
for _opt, _val in (('display.max_columns', 40), ('display.width', 220)):
    pd.set_option(_opt, _val)

# Connect to the on-cluster Spark Connect endpoint.
# The KBase auth token is loaded from the .env file into the environment and is
# only embedded in the connection URL; it is never printed.
load_dotenv('/home/aparkin/BERIL-research-observatory/.env')
_tok = os.environ['KBASE_AUTH_TOKEN']  # KeyError here means the variable is not set
from pyspark.sql import SparkSession
_url = f'sc://jupyter-aparkin.jupyterhub-prod:15002/;use_ssl=false;x-kbase-token={_tok}'
spark = SparkSession.builder.remote(_url).getOrCreate()
# Database name interpolated into every spark.sql(...) query in this notebook.
DB = 'enigma_genome_depot_enigma'
print('spark connected:', spark.version)

spark connected: 4.0.1

import re
# Inputs: the deepened compound-linkage table and the JSON cache that the
# KEGG helper functions below read from (keyed by request URL).
_link_path = DATA / 'compound_linkage_deepened.tsv'
_cache_path = DATA / 'deepen_cache.json'
link = pd.read_csv(_link_path, sep='\t')
with open(_cache_path) as _fh:
    cache = json.load(_fh)

def _body(url):
    """Return the cached payload stored under *url*, or None.

    A missing or falsy cache entry yields None.  A dict entry yields its
    'body' value (None if that key is absent); any other entry is returned
    unchanged.
    """
    entry = cache.get(url)
    if entry:
        if isinstance(entry, dict):
            return entry.get('body')
        return entry
    return None

def rxns_for(cnum):
    """List the reaction ids cached for the KEGG entry *cnum*.

    Reads the cached link/reaction/<cnum> response and keeps, in file order,
    the second tab-separated field of every two-field line that starts with
    'rn:' (prefix stripped).  Returns [] when nothing is cached.
    """
    text = _body(f'https://rest.kegg.jp/link/reaction/{cnum}')
    if not text:
        return []
    records = (ln.split('\t') for ln in text.strip().split('\n'))
    return [cols[1][3:] for cols in records
            if len(cols) == 2 and cols[1].startswith('rn:')]

# Reaction-level degradation-map membership: pick every KEGG pathway whose name
# matches /degrad|catabol/ (case-insensitive), then pool the reactions linked to
# each selected map (link/reaction/map{XXXXX}).
DEG_RE = re.compile(r'degrad|catabol', re.I)
deg_maps = []
for entry in (_body('https://rest.kegg.jp/list/pathway') or '').strip().split('\n'):
    cols = entry.split('\t')
    if len(cols) != 2 or not DEG_RE.search(cols[1]):
        continue
    # strip the 'path:' / 'map' prefixes so only the map number is stored
    deg_maps.append(cols[0].replace('path:', '').replace('map', ''))

# rxns_for() requests link/reaction/<id> from the cache and applies the same
# 'rn:' parsing, so handing it a map identifier yields that map's reactions.
degmap_rxns = set()
for map_no in deg_maps:
    degmap_rxns.update(rxns_for(f'map{map_no}'))

# Hand-curated catabolic allowlist: steps that unambiguously CONSUME the compound
# but sit in mixed maps rather than in a degradation map.
#   R02107  xanthine -> urate            (xanthine oxidase; purine catabolism)
#   R02612  phenethylamine -> PAA + NH3  (deaminating amine oxidase)
#   R07202  abscisate -> 8'-OH-abscisate (ABA 8'-hydroxylase; ABA inactivation)
CATABOLIC_ALLOWLIST = {'R02107', 'R02612', 'R07202'}

# Compounds that have a KEGG id, each annotated with its cached reaction list.
kid = link.loc[link['kegg_id'].notna()].copy()
kid['rxn_set'] = kid['kegg_id'].apply(rxns_for)
# True when the linkage table lists a non-trivial (>3 chars) degradation-map string.
kid['has_degmap'] = kid['kegg_deg_maps'].map(lambda v: isinstance(v, str) and len(v) > 3)
# Sorted union of every reaction attached to any KEGG-mapped compound.
all_rxn = sorted(set().union(*kid['rxn_set']))
n_with_rxn = sum(1 for rset in kid['rxn_set'] if rset)
print('KEGG-mapped compounds:', len(kid))
print('with >=1 reaction   :', n_with_rxn)
print('distinct reactions  :', len(all_rxn))
print('degradation maps    :', len(deg_maps), '| reactions in degradation maps:', len(degmap_rxns))

KEGG-mapped compounds: 54
with >=1 reaction   : 37
distinct reactions  : 207
degradation maps    : 28 | reactions in degradation maps: 1207

# KEGG reaction id <-> genome_depot row id, in both directions.
kr = spark.sql(f'SELECT id, kegg_id FROM {DB}.browser_kegg_reaction').toPandas()
r2fk = {kegg: pk for kegg, pk in zip(kr['kegg_id'], kr['id'])}
fk2r = {pk: kegg for kegg, pk in r2fk.items()}
# Row ids of the compound-linked reactions that genome_depot knows about.
fks = [r2fk[rx] for rx in all_rxn if rx in r2fk]
# Denominator for prevalence: genomes that have at least one gene row.
_genome_count = spark.sql(f'SELECT COUNT(DISTINCT genome_id) n FROM {DB}.browser_gene').toPandas()
NGEN = _genome_count['n'].iloc[0]
print(f'reactions in genome_depot lookup: {len(fks)}/{len(all_rxn)}   total genomes: {NGEN}')

# Comma-separated row ids for the SQL IN (...) list below.
# NOTE(review): an empty `fks` would produce `IN ()`; presumably never the case here.
fkstr = ','.join(str(fk) for fk in fks)
# One row per (reaction_fk, genome_id) pair in which a gene's protein carries
# one of the looked-up reactions.
_carriage_sql = f'''
  SELECT rr.kegg_reaction_id fk, g.genome_id
  FROM {DB}.browser_protein_kegg_reactions rr
  JOIN {DB}.browser_gene g ON g.protein_id = rr.protein_id
  WHERE rr.kegg_reaction_id IN ({fkstr})
  GROUP BY rr.kegg_reaction_id, g.genome_id
'''
carr = spark.sql(_carriage_sql).toPandas()
carr['rxn'] = carr['fk'].map(fk2r)
# Prevalence = fraction of all genomes that carry the reaction.
prev = carr.groupby('rxn')['genome_id'].nunique().reset_index(name='n_gen')
prev['prev_frac'] = prev['n_gen'] / NGEN
prev_map = {rx: frac for rx, frac in zip(prev['rxn'], prev['prev_frac'])}
print('carriage rows:', len(carr))
print(prev['prev_frac'].describe().to_string())

reactions in genome_depot lookup: 67/207   total genomes: 3109

carriage rows: 57631
count    67.000000
mean      0.276669
std       0.312125
min       0.000322
25%       0.007076
50%       0.137665
75%       0.566259
max       0.964940

# A reaction is a "signature" when fewer than SIG of all genomes carry it;
# everything else is treated as generic.
SIG = 0.10
_is_signature = prev['prev_frac'] < SIG
prev['kind'] = np.where(_is_signature, 'signature', 'generic')
sig_rxns = set(prev.loc[_is_signature, 'rxn'])

def cat_source(r):
    """Say why reaction *r* counts as catabolic.

    Returns 'degmap' if it belongs to a degradation map, otherwise
    'allowlist' if it is on the curated allowlist, otherwise ''.
    Degradation-map membership takes precedence.
    """
    for label, members in (('degmap', degmap_rxns),
                           ('allowlist', CATABOLIC_ALLOWLIST)):
        if r in members:
            return label
    return ''
# Catabolic evidence is only recorded for signature reactions; generic ones get ''.
prev['cat_source'] = [cat_source(rx) if rx in sig_rxns else '' for rx in prev['rxn']]
cat_rxns = {rx for rx in sig_rxns if cat_source(rx)}

# For the audit print: signature reaction -> name(s) of the compound(s) it is linked to.
rxn2names = {}
for cname, rset in zip(kid['name'], kid['rxn_set']):
    for rx in rset:
        if rx not in sig_rxns:
            continue
        rxn2names.setdefault(rx, set()).add(cname)

# Audit print: signature/generic split, how many signature reactions survive the
# catabolic filter, then one line per signature reaction (rarest first).
print(prev['kind'].value_counts().to_string())
# Attribute every kept reaction to the single source cat_source() assigns it
# (degradation map wins over allowlist).  Counting set membership independently
# would count a reaction that is in both sets twice, so the two numbers could
# exceed the kept total.
n_degmap = sum(1 for r in cat_rxns if cat_source(r) == 'degmap')
n_allowlist = sum(1 for r in cat_rxns if cat_source(r) == 'allowlist')
print(f'signature reactions: {len(sig_rxns)}  ->  catabolic (kept): {len(cat_rxns)}'
      f'  (degmap {n_degmap},'
      f' allowlist {n_allowlist})')
print()
print('SIGNATURE REACTIONS (catabolic status; * = dropped as biosynthetic/wrong-direction):')
sg = prev[prev.kind == 'signature'].sort_values('prev_frac')
for _, row in sg.iterrows():
    r = row['rxn']
    # an empty cat_source means the reaction failed the catabolic filter
    src = row['cat_source'] or '*DROP'
    # compound names are truncated to 34 characters to keep the listing aligned
    nm = ','.join(sorted(rxn2names.get(r, [])))[:34]
    print(f'  {r:8s} {row["prev_frac"]*100:6.2f}%  {src:9s}  {nm}')

kind
generic      37
signature    30
signature reactions: 30  ->  catabolic (kept): 13  (degmap 10, allowlist 3)

SIGNATURE REACTIONS (catabolic status; * = dropped as biosynthetic/wrong-direction):
  R01274     0.03%  *DROP      Palmitic Acid
  R10412     0.03%  *DROP      Farnesol
  R09849     0.03%  *DROP      Farnesol
  R06957     0.03%  *DROP      Abscisic acid
  R07202     0.06%  allowlist  Abscisic acid
  R01706     0.06%  *DROP      Palmitic Acid
  R10902     0.13%  *DROP      Tyramine
  R00697     0.16%  *DROP      Cinnamic acid
  R07703     0.26%  degmap     phthalic acid
  R11664     0.32%  *DROP      lauric acid
  R02255     0.39%  *DROP      Cinnamic acid
  R01943     0.39%  *DROP      caffeic acid
  R07965     0.58%  *DROP      xanthine
  R07966     0.58%  *DROP      xanthine
  R01769     0.61%  *DROP      xanthine
  R02107     0.61%  allowlist  xanthine
  R09227     0.64%  degmap     phthalic acid
  R01883     0.77%  *DROP      GUANIDINEACETIC ACID
  R02252     0.77%  *DROP      Cinnamic acid
  R01294     0.80%  degmap     4-hydroxybenzaldehyde
  R05643     1.83%  degmap     phthalic acid
  R05360     2.19%  degmap     salicylic acid
  R02612     2.61%  allowlist  Phenylethylamine
  R01628     2.96%  degmap     3-hydroxybenzoic acid
  R05148     3.12%  degmap     TEREPHTHALIC ACID
  R10597     3.67%  *DROP      3-hydroxybenzoic acid
  R02941     4.21%  degmap     salicylic acid
  R00565     4.60%  *DROP      GUANIDINEACETIC ACID
  R02675     5.34%  degmap     4-hydroxybenzaldehyde
  R05375     7.24%  degmap     3-hydroxybenzoic acid

# Per-compound funnel: KEGG reactions -> present in genome_depot -> signature ->
# catabolic signature, plus whether any catabolic one is degradation-map backed.
_cov_fields = ('compound_id', 'name', 'npc_pathway', 'kegg_id', 'rxn_set')
rows = []
for cid, cname, pathway, kegg, rset in zip(*(kid[c] for c in _cov_fields)):
    in_depot = [x for x in rset if x in prev_map]
    signature = [x for x in in_depot if x in sig_rxns]
    catabolic = [x for x in signature if x in cat_rxns]
    rows.append({
        'compound_id': cid,
        'name': cname,
        'npc_pathway': pathway,
        'kegg_id': kegg,
        'n_rxn': len(rset),
        'n_in_gd': len(in_depot),
        'n_sig': len(signature),
        'n_cat': len(catabolic),
        'has_cat_degmap': any(x in degmap_rxns for x in catabolic),
        'cat_rxns': ';'.join(catabolic),
    })
cov = pd.DataFrame(rows)
N = len(link)

_callable = cov.n_cat > 0
n_in_gd = int((cov.n_in_gd > 0).sum())
n_sig = int((cov.n_sig > 0).sum())
n_callable = int(_callable.sum())
n_t2 = int(cov.has_cat_degmap.sum())
n_t3 = int((_callable & ~cov.has_cat_degmap).sum())
print(f'compounds total                          : {N}')
print(f'  with KEGG id                           : {len(kid)}')
print(f'  with >=1 reaction in genome_depot      : {n_in_gd}')
print(f'  with >=1 signature reaction            : {n_sig}')
print(f'  with >=1 CATABOLIC signature (callable) : {n_callable}')
print(f'    of which degradation-map (T2)        : {n_t2}')
print(f'    allowlist-only (T3)                  : {n_t3}')
print()
_shown = ['name', 'kegg_id', 'n_rxn', 'n_in_gd', 'n_sig', 'n_cat', 'has_cat_degmap']
_callable_tbl = cov.loc[_callable, _shown].sort_values(
    ['has_cat_degmap', 'n_cat'], ascending=[False, False])
print(_callable_tbl.to_string(index=False))

compounds total                          : 83
  with KEGG id                           : 54
  with >=1 reaction in genome_depot      : 21
  with >=1 signature reaction            : 15
  with >=1 CATABOLIC signature (callable) : 8
    of which degradation-map (T2)        : 5
    allowlist-only (T3)                  : 3

                 name kegg_id  n_rxn  n_in_gd  n_sig  n_cat  has_cat_degmap
        phthalic acid  C01606      7        4      3      3            True
4-hydroxybenzaldehyde  C00633     12        4      2      2            True
       salicylic acid  C00805     17        6      2      2            True
3-hydroxybenzoic acid  C00587      9        5      3      2            True
    TEREPHTHALIC ACID  C06337      3        2      1      1            True
             xanthine  C00385     17       10      4      1           False
     Phenylethylamine  C05332      4        3      1      1           False
        Abscisic acid  C06082      6        2      2      1           False

# Genome -> strain -> taxon lookup tables, pulled whole and joined in pandas.
gen = spark.sql(f'SELECT id genome_id, name genome_name, strain_id, taxon_id FROM {DB}.browser_genome').toPandas()
strn = spark.sql(f'SELECT id strain_pk, strain_id strain_code, full_name strain_full FROM {DB}.browser_strain').toPandas()
tax = spark.sql(f'SELECT id taxon_pk, taxonomy_id ncbi_taxid, rank, name taxon_name FROM {DB}.browser_taxon').toPandas()

# Left joins keep every genome row even when its strain or taxon key has no match.
_with_strain = pd.merge(gen, strn, left_on='strain_id', right_on='strain_pk', how='left')
_with_taxon = pd.merge(_with_strain, tax, left_on='taxon_id', right_on='taxon_pk', how='left')
_gmap_cols = ['genome_id', 'genome_name', 'strain_full', 'ncbi_taxid', 'rank', 'taxon_name']
gmap = _with_taxon[_gmap_cols]
print('genome metadata rows:', len(gmap))
print(gmap.head(5).to_string(index=False))

genome metadata rows: 3110
 genome_id     genome_name                   strain_full ncbi_taxid    rank                    taxon_name
         1          2APBS1   Rhodanobacter denitrificans     666685 species   Rhodanobacter denitrificans
         2 acidovorax_3H11     Acidovorax sp. GW101-3H11    1813946 species     Acidovorax sp. GW101-3H11
         3            Agro Agrobacterium fabrum str. C58     176299  strain Agrobacterium fabrum str. C58
         4            ANA3          Shewanella sp. ANA-3      94122 species          Shewanella sp. ANA-3
         5          azobra Azospirillum brasilense Sp245    1064539 species      Azospirillum baldaniorum

# Catabolic signature reaction -> compound ids it is linked to (one reaction
# may be linked to several compounds).
rxn2comp = {}
for comp_id, rset in zip(kid['compound_id'], kid['rxn_set']):
    for rx in rset:
        if rx not in cat_rxns:
            continue
        rxn2comp.setdefault(rx, []).append(comp_id)

# Genome carriage rows restricted to the catabolic signature reactions.
_is_cat = carr['rxn'].isin(cat_rxns)
csig = carr.loc[_is_cat].copy()
comp_meta = kid.set_index('compound_id')
# Number of catabolic signature reactions each compound has in total
# (denominator of the completeness score).
cat_tot = {
    cid: sum(1 for rx in comp_meta.loc[cid, 'rxn_set'] if rx in cat_rxns)
    for cid in comp_meta.index
}
# Rarity weight: -log10(prevalence), so rarer reactions contribute more.
w = {rx: -math.log10(prev_map[rx]) for rx in cat_rxns}

from collections import defaultdict
# Accumulate, per (compound, genome), the catabolic signature reactions the
# genome carries and the running sum of their rarity weights.
acc = defaultdict(lambda: {'rxns': [], 'score': 0.0})
for rx, gid in zip(csig['rxn'], csig['genome_id']):
    for cid in rxn2comp.get(rx, []):
        hit = acc[(cid, gid)]
        hit['rxns'].append(rx)
        hit['score'] += w[rx]

# Turn each accumulated (compound, genome) entry into one prediction record.
pred_rows = []
for (cid, gid), hit in acc.items():
    meta = comp_meta.loc[cid]
    carried = sorted(set(hit['rxns']))
    n_carried = len(carried)
    # reaction-level tier: degradation-map context beats allowlist-only
    if degmap_rxns.isdisjoint(carried):
        tier = 'T3_signature'
    else:
        tier = 'T2_pathway'
    pred_rows.append({
        'compound_id': cid,
        'name': meta['name'],
        'npc_pathway': meta['npc_pathway'],
        'kegg_id': meta['kegg_id'],
        'genome_id': gid,
        'tier': tier,
        'n_sig_carried': n_carried,
        # share of the compound's catabolic signature reactions found in this genome
        'sig_completeness': round(n_carried / max(cat_tot[cid], 1), 3),
        'score': round(hit['score'], 3),
        'sig_rxns_carried': ';'.join(carried),
    })

# Attach genome/strain/taxon metadata, then rank genomes within each compound.
pred = pd.DataFrame(pred_rows).merge(gmap, on='genome_id', how='left')
pred = pred.sort_values(['compound_id', 'score'], ascending=[True, False])
pred = pred.reset_index(drop=True)
print('prediction rows (compound x genome):', len(pred))
print('distinct compounds with calls :', pred['compound_id'].nunique())
print('distinct genomes called       :', pred['genome_id'].nunique())
print(pred['tier'].value_counts().to_string())

prediction rows (compound x genome): 948
distinct compounds with calls : 8
distinct genomes called       : 800
tier
T2_pathway      846
T3_signature    102

# Summary per (compound, tier): number of genomes called, best score and best
# completeness.  Tier is assigned per genome, so a compound whose genomes fall
# into different tiers gets one row per tier.
_by_compound = pred.groupby(['compound_id', 'name', 'npc_pathway', 'tier'])
summ = _by_compound.agg(
    n_genomes=('genome_id', 'nunique'),
    max_score=('score', 'max'),
    max_completeness=('sig_completeness', 'max'),
)
summ = summ.reset_index().sort_values('n_genomes', ascending=False)
print(summ.to_string(index=False))

                compound_id                  name                     npc_pathway         tier  n_genomes  max_score  max_completeness
                     Cc1_10 3-hydroxybenzoic acid Shikimates and Phenylpropanoids   T2_pathway        296      2.669             1.000
                     Cc1_38        salicylic acid Shikimates and Phenylpropanoids   T2_pathway        197      3.035             1.000
                    Cc1_112 4-hydroxybenzaldehyde Shikimates and Phenylpropanoids   T2_pathway        175      3.367             1.000
KKEYFWRCBNTPAC-UHFFFAOYSA-N     TEREPHTHALIC ACID Shikimates and Phenylpropanoids   T2_pathway         97      1.506             1.000
XNGIFLGASWRNHJ-UHFFFAOYSA-N         phthalic acid Shikimates and Phenylpropanoids   T2_pathway         81      3.928             0.667
                    Cc1_208      Phenylethylamine                       Alkaloids T3_signature         81      1.584             1.000
                      Cc1_9              xanthine                       Alkaloids T3_signature         19      2.214             1.000
                    Cc1_117         Abscisic acid                      Terpenoids T3_signature          2      3.192             1.000

# Horizontal bar chart: genomes called per compound, coloured by evidence tier.
fig, ax = plt.subplots(figsize=(8, 5))
s = summ.sort_values('n_genomes')
colors = {'T2_pathway': '#2c7fb8', 'T3_signature': '#7fcdbb'}
ax.barh(s['name'], s['n_genomes'], color=[colors[t] for t in s['tier']])
ax.set_xlabel('ENIGMA isolate genomes carrying a signature reaction')
# The denominator is taken from the linkage table instead of a hard-coded 83,
# so the title stays correct if the compound list changes.
ax.set_title('Reaction-bridge organism calls (n=%d compounds of %d)'
             % (summ['compound_id'].nunique(), len(link)))
from matplotlib.patches import Patch
ax.legend(handles=[Patch(color=colors['T2_pathway'], label='T2 (degradation-context)'),
                   Patch(color=colors['T3_signature'], label='T3 (signature enzyme)')],
          loc='lower right')
fig.tight_layout()
fig.savefig(FIG / '03_organism_calls.png', dpi=150)
print('saved 03_organism_calls.png')
# close the figure so it is not kept open after it has been written to disk
plt.close(fig)

saved 03_organism_calls.png

# Column order of the exported prediction table: compound fields, genome
# fields, then the evidence metrics.
_compound_cols = ['compound_id', 'name', 'npc_pathway', 'kegg_id', 'tier']
_genome_cols = ['genome_id', 'genome_name', 'strain_full', 'ncbi_taxid', 'rank', 'taxon_name']
_metric_cols = ['n_sig_carried', 'sig_completeness', 'score', 'sig_rxns_carried']
pred_cols = _compound_cols + _genome_cols + _metric_cols

_pred_path = DATA / 'compound_organism_predictions.tsv'
pred.loc[:, pred_cols].to_csv(_pred_path, sep='\t', index=False)
print('wrote data/compound_organism_predictions.tsv', pred.shape)

# Compounds with NO enzyme-level organism call: every linkage-table compound
# that never appears in the prediction table.
called = set(pred['compound_id'])
_dark_cols = ['compound_id', 'name', 'npc_pathway', 'kegg_id', 'best_tier']
_has_call = link['compound_id'].isin(called)
dark = link.loc[~_has_call, _dark_cols].copy()
def why(r):
    """Classify why the compound in row *r* received no organism call.

    Returns 'no_kegg' when the row has no KEGG id or no coverage record;
    otherwise the first stage of the coverage funnel that is empty:
    'kegg_no_rxn_in_genomes' (no reaction found in genome_depot, which also
    covers compounds with no KEGG reaction at all), 'only_generic_rxns'
    (no signature reaction), or 'only_biosynthetic_signatures' (signature
    reactions exist but none passed the catabolic filter).
    """
    if pd.isna(r['kegg_id']):
        return 'no_kegg'
    hits = cov[cov.compound_id == r['compound_id']]
    if len(hits) == 0:
        return 'no_kegg'
    first = hits.iloc[0]
    # Walk the funnel in order and report the first stage with a zero count.
    for column, reason in (('n_in_gd', 'kegg_no_rxn_in_genomes'),
                           ('n_sig', 'only_generic_rxns')):
        if first[column] == 0:
            return reason
    return 'only_biosynthetic_signatures'  # had signature(s), none catabolic
# Label every uncalled compound with its reason, persist the gap list, and
# print the breakdown followed by the full listing.
dark['organism_dark_reason'] = dark.apply(why, axis=1)
_dark_path = DATA / 'compound_organism_dark.tsv'
dark.to_csv(_dark_path, sep='\t', index=False)
_reason_counts = dark['organism_dark_reason'].value_counts()
print('organism-dark compounds:', len(dark), '/', len(link))
print(_reason_counts.to_string())
print()
_listing = dark.loc[:, ['name', 'npc_pathway', 'organism_dark_reason']]
print(_listing.to_string(index=False))

wrote data/compound_organism_predictions.tsv (948, 15)
organism-dark compounds: 75 / 83
organism_dark_reason
kegg_no_rxn_in_genomes          33
no_kegg                         29
only_biosynthetic_signatures     7
only_generic_rxns                6

                                         name                                npc_pathway         organism_dark_reason
                                       harman                                  Alkaloids       kegg_no_rxn_in_genomes
1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid                                  Alkaloids                      no_kegg
           2-hydroxy-4,7,8-trimethylquinoline                                  Alkaloids                      no_kegg
                         3,6-dimethylchromone                                Polyketides                      no_kegg
          7-hydroxy-4,8-dimethylchromen-2-one            Shikimates and Phenylpropanoids                      no_kegg
                                     Fraxetin            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
                      2',5'-dihydroxychalcone            Shikimates and Phenylpropanoids                      no_kegg
        Phthalic acid mono-2-ethylhexyl ester            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
          2-Isopropyl-5-methylcyclohexanamine                                 Terpenoids                      no_kegg
                                       Manool                                 Terpenoids                      no_kegg
                           Mellein / Ochracin            Shikimates and Phenylpropanoids                      no_kegg
                         (-)-Perillyl alcohol                                 Terpenoids       kegg_no_rxn_in_genomes
                                     Tyramine                                  Alkaloids only_biosynthetic_signatures
    choline sulfate / choline o-sulfuric acid                   Amino acids and Peptides       kegg_no_rxn_in_genomes
                    5,6-Dimethylbenzimidazole                                  Alkaloids            only_generic_rxns
                                     acridone                                  Alkaloids       kegg_no_rxn_in_genomes
                                       BIOTIN                                  Alkaloids            only_generic_rxns
                               Triacetonamine                                  Alkaloids                      no_kegg
                                   Cadaverine                                  Alkaloids            only_generic_rxns
           4,6,8-trimethylhydroquinolin-2-one                                  Alkaloids                      no_kegg
                        Indole-3-butyric acid                                  Alkaloids       kegg_no_rxn_in_genomes
                        indole-3-acetonitrile                                  Alkaloids            only_generic_rxns
               4-hydroxy-1-methyl-2-quinolone                                  Alkaloids                      no_kegg
               1,4-dimethylquinolin-2(1h)-one                                  Alkaloids                      no_kegg
                           2-hydroxyquinoline                                  Alkaloids       kegg_no_rxn_in_genomes
                         3-Indoleacrylic acid Alkaloids, Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
                         GUANIDINEACETIC ACID                   Amino acids and Peptides only_biosynthetic_signatures
                           DODECANEDIOIC ACID                                Fatty acids       kegg_no_rxn_in_genomes
                        TETRADECANEDIOIC ACID                                Fatty acids                      no_kegg
                Decanedioic acid/Sebacic acid                                Fatty acids       kegg_no_rxn_in_genomes
                               Brassylic acid                                Fatty acids                      no_kegg
                         Hexadecanedioic acid                                Fatty acids       kegg_no_rxn_in_genomes
                 alpha-hydroxyisobutyric acid                                Fatty acids       kegg_no_rxn_in_genomes
                    12-Hydroxydodecanoic acid                                Fatty acids       kegg_no_rxn_in_genomes
                  sn-glycero-3-phosphocholine                                Fatty acids       kegg_no_rxn_in_genomes
                             6-methylchromone                                Polyketides       kegg_no_rxn_in_genomes
                      3-hydroxybenzyl alcohol            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
               4-(4-Hydroxyphenyl)butan-2-one            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
                                    zingerone            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
            3-(2-Hydroxyphenyl)propionic acid            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
                         6-aminochromen-2-one            Shikimates and Phenylpropanoids                      no_kegg
                           2-Hydroxychalcone             Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
                           7-hydroxyflavanone            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
                   2-Hydroxyphenylacetic acid            Shikimates and Phenylpropanoids            only_generic_rxns
                                     guaiacol            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
                                Cinnamic acid            Shikimates and Phenylpropanoids only_biosynthetic_signatures
                                 caffeic acid            Shikimates and Phenylpropanoids only_biosynthetic_signatures
                                     ANETHOLE            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
                           Benzylideneacetone            Shikimates and Phenylpropanoids       kegg_no_rxn_in_genomes
                     (±)-Dihydroactinidiolide                                 Terpenoids                      no_kegg
                        (S)-(-)-Perillic acid                                 Terpenoids       kegg_no_rxn_in_genomes
                                      CARVEOL                                 Terpenoids       kegg_no_rxn_in_genomes
                               (1R)-(-)-Nopol                                 Terpenoids                      no_kegg
                               ketopinic acid                                 Terpenoids                      no_kegg
                               (+)-Nootkatone                                 Terpenoids       kegg_no_rxn_in_genomes
                                   lumichrome                                  Alkaloids                      no_kegg
      2,3-dihydro-1H-indole-7-carboxylic acid                                  Alkaloids                      no_kegg
                      INDOLE-3-CARBOXALDEHYDE                                  Alkaloids       kegg_no_rxn_in_genomes
                           6-Hydroxyquinoline                                  Alkaloids                      no_kegg
               (2S)indoline-2-carboxylic acid                                  Alkaloids                      no_kegg
                                Palmitic Acid                                Fatty acids only_biosynthetic_signatures
                                  lauric acid                                Fatty acids only_biosynthetic_signatures
                   3-Hydroxyphenylacetic acid            Shikimates and Phenylpropanoids            only_generic_rxns
                                     Farnesol                                 Terpenoids only_biosynthetic_signatures
                                    D-camphor                                 Terpenoids       kegg_no_rxn_in_genomes
                                    NEROLIDOL                                 Terpenoids                      no_kegg
                                    Bisabolol                                 Terpenoids                      no_kegg
                            (-)-Isolongifolol                                 Terpenoids                      no_kegg
                        b-Caryophyllene oxide                                 Terpenoids                      no_kegg
                                     Sclareol                                 Terpenoids       kegg_no_rxn_in_genomes
                                   SALSOLINOL                                  Alkaloids       kegg_no_rxn_in_genomes
  1H-pyrrolo[3,2-b]pyridine-5-carboxylic acid                                  Alkaloids                      no_kegg
                2,6-Pyridinedicarboxylic acid                                  Alkaloids                      no_kegg
                    N-methylanthranilic acid                                   Alkaloids       kegg_no_rxn_in_genomes
                       4-Methoxycinnamic acid            Shikimates and Phenylpropanoids                      no_kegg

03 Organism Mapping

NB03 — Organism Mapping (ENIGMA isolates, reaction bridge)

Load deepened linkage + KEGG reaction sets (from NB02b cache)

Reaction lookup + genome prevalence

Classify reactions: signature (<10% of genomes), then catabolic filter

Per-compound reaction coverage

Genome → strain → taxon resolution

Build per-(compound, genome) predictions

Figure: candidate-utilizer breadth per compound

Save predictions + organism-dark gap list