import pandas as pd, numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path

# DATA holds the TSV tables this notebook reads and writes; FIG receives saved plots.
DATA = Path('../data')
FIG = Path('../figures')

# Widen pandas' console rendering limits used when tables are printed.
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 240)

def _read_table(filename):
    """Load one tab-separated table from the DATA directory into a DataFrame."""
    return pd.read_csv(DATA / filename, sep='\t')


# Compound-level inputs.
comp = _read_table('compounds_selected.tsv')
link = _read_table('compound_linkage_deepened.tsv')
dark = _read_table('compound_organism_dark.tsv')
# Tables reported below as deliverables (a), (b) and (c).
deliv_a = _read_table('enigma_utilizer_predictions.tsv')
phylo = _read_table('phylo_utilizer_map.tsv')
env = _read_table('environmental_atlas.tsv')

# Row counts as a quick check on what was loaded.
print(f'compounds: {len(comp)} | linkage rows: {len(link)} | organism-dark: {len(dark)}')
print(f'deliverable (a) rows: {len(deliv_a)} | (b) strain placements: {len(phylo)} '
      f'| (c) genus field-calls: {len(env)}')

compounds: 83 | linkage rows: 83 | organism-dark: 75
deliverable (a) rows: 569 | (b) strain placements: 494 | (c) genus field-calls: 156

# --- Discovery funnel: count the census compounds that reach each evidence step ---
n_total = len(comp)
n_struct = link['inchikey'].notna().sum()   # linkage rows with a non-null InChIKey
n_kegg = link['kegg_id'].notna().sum()      # linkage rows with a non-null KEGG id

# Only ids present in the census may be counted: a call recorded for a compound
# outside `comp` would otherwise inflate "callable" and deflate "organism-dark".
census_ids = set(comp['compound_id'].dropna())

# review I1: callable = ENIGMA-isolate call OR Tier-1 measured carbon-source fitness
enigma_ids = census_ids & set(
    deliv_a.loc[deliv_a['tier'] != 'T0_organism_dark', 'compound_id'])
measured_ids = census_ids & set(
    link.loc[link['best_tier'] == 'T1_measured', 'compound_id'])
callable_ids = enigma_ids | measured_ids
# basis label: enigma call dominates; measured_fitness only where there is no enigma call
callable_basis = {cid: ('enigma_isolate_call' if cid in enigma_ids else 'measured_fitness')
                  for cid in callable_ids}
n_callable = len(callable_ids)
n_measured_only = len(measured_ids - enigma_ids)
# Count dark compounds by membership rather than by subtraction, so the number
# stays correct even when the call tables mention ids that are not in the census.
n_dark = len(census_ids - callable_ids)
print(f'callable bases: enigma_isolate_call={len(enigma_ids)}, '
      f'measured_fitness (no enigma call)={n_measured_only}, total callable={n_callable}')

# Deliverable (b)/(c) headline counts.
n_placed = phylo['ncbi_taxid'].nunique()            # distinct taxids placed
n_high = (phylo['certainty'] == 'high').sum()       # placement rows labelled 'high'
n_field_genera = env[env['field_detected']]['genus'].nunique()

funnel = pd.DataFrame([
    ('compounds in census', n_total),
    ('structure resolved (InChIKey)', int(n_struct)),
    ('KEGG-linked', int(n_kegg)),
    ('callable (organism call or measured)', n_callable),
    ('organism-dark (no call)', n_dark),
], columns=['step', 'n'])
print(funnel.to_string(index=False))
print(f'\nutilizer strains placed (b): {n_placed} | high-certainty strain-calls: {n_high}')
print(f'utilizer genera present in SSO field (c): {n_field_genera}')

callable bases: enigma_isolate_call=8, measured_fitness (no enigma call)=1, total callable=9
                                step  n
                 compounds in census 83
       structure resolved (InChIKey) 83
                         KEGG-linked 54
callable (organism call or measured)  9
             organism-dark (no call) 74

utilizer strains placed (b): 359 | high-certainty strain-calls: 64
utilizer genera present in SSO field (c): 62

# Horizontal bar chart of the funnel. The 'organism-dark (no call)' row is
# filtered out, so only the remaining funnel steps are drawn.
fig, ax = plt.subplots(figsize=(8, 4))
vals = funnel[funnel['step'] != 'organism-dark (no call)']
positions = range(len(vals))
ax.barh(positions, vals['n'], color='#2b8cbe')
ax.set_yticks(positions)
ax.set_yticklabels(vals['step'])
ax.invert_yaxis()  # first funnel step at the top
# Annotate each bar with its count, just past the bar end.
for i, v in enumerate(vals['n']):
    ax.text(v + 0.5, i, str(int(v)), va='center', fontsize=9)
ax.set_xlabel('compounds')
# Census size comes from the data rather than a hard-coded number, so the
# title cannot drift out of sync with the table that was loaded.
ax.set_title(f'ENIGMA Carbon Census discovery funnel ({n_total} compounds)')
fig.tight_layout()
# savefig does not create missing directories; make sure the target exists.
FIG.mkdir(parents=True, exist_ok=True)
fig.savefig(FIG / '08_funnel.png', dpi=150)
print('saved 08_funnel.png')
plt.close(fig)

saved 08_funnel.png

def _n_high_calls(certainty):
    """Count the entries in one group whose certainty label is exactly 'high'."""
    return (certainty == 'high').sum()


# (b) breadth + certainty per compound
phylo_by_name = phylo.groupby('name').agg(
    n_strains=('ncbi_taxid', 'nunique'),
    n_high=('certainty', _n_high_calls),
    n_genera=('genus', 'nunique'),
)
b = phylo_by_name.reset_index()

# (c) field occurrence per compound
env_by_name = env.groupby('name').agg(
    n_genera_c=('genus', 'nunique'),
    n_field=('field_detected', 'sum'),
    top_field_prev=('prevalence', 'max'),
)
c = env_by_name.reset_index()

# One row per census compound: identity columns, linkage tiers, callable flag/basis,
# then the per-name aggregates from (b) and (c).
master = comp[['compound_id', 'name', 'source_short', 'npc_pathway']].copy()
tier_cols = link[['compound_id', 'best_tier', 'deep_tier', 'rescued']]
master = master.merge(tier_cols, on='compound_id', how='left')
master['callable'] = master['compound_id'].isin(callable_ids)
master['callable_basis'] = master['compound_id'].map(callable_basis).fillna('')
master = master.merge(b, on='name', how='left')
master = master.merge(c, on='name', how='left')

# Compounds without a match in (b)/(c) get NaN from the left merges; treat those
# counts as zero and store them as integers. top_field_prev is left as-is.
count_cols = ['n_strains', 'n_high', 'n_genera', 'n_genera_c', 'n_field']
master = master.fillna({col: 0 for col in count_cols})
master = master.astype({col: int for col in count_cols})

# Callable compounds first, then by descending number of strains.
master = master.sort_values(['callable', 'n_strains'], ascending=[False, False])

report_cols = ['name', 'source_short', 'npc_pathway', 'best_tier',
               'callable_basis', 'n_strains', 'n_high', 'n_genera', 'n_field',
               'top_field_prev']
print('=== CALLABLE COMPOUNDS ===')
print(master.loc[master['callable'], report_cols].to_string(index=False))

=== CALLABLE COMPOUNDS ===
                 name source_short                     npc_pathway     best_tier      callable_basis  n_strains  n_high  n_genera  n_field  top_field_prev
       salicylic acid  groundwater Shikimates and Phenylpropanoids T2_3_reaction enigma_isolate_call        129       2        24       20        0.862745
3-hydroxybenzoic acid  groundwater Shikimates and Phenylpropanoids T2_3_reaction enigma_isolate_call        127      18        42       33        0.901961
4-hydroxybenzaldehyde  groundwater Shikimates and Phenylpropanoids T2_3_reaction enigma_isolate_call        125      10        34       25        0.862745
        phthalic acid    necromass Shikimates and Phenylpropanoids T2_3_reaction enigma_isolate_call         36       0        15       12        0.745098
    TEREPHTHALIC ACID    necromass Shikimates and Phenylpropanoids T2_3_reaction enigma_isolate_call         34      34        17       10        0.666667
     Phenylethylamine  groundwater                       Alkaloids T2_3_reaction enigma_isolate_call         28       0        11        7        0.607843
             xanthine  groundwater                       Alkaloids       T3_kegg enigma_isolate_call         13       0         6        4        0.745098
        Abscisic acid  groundwater                      Terpenoids T2_3_reaction enigma_isolate_call          2       0         1        1        0.019608
          lauric acid    necromass                     Fatty acids   T1_measured    measured_fitness          0       0         0        0             NaN

# Organism-dark = every row of the master table that is not flagged callable;
# attach the recorded reason for each from the organism-dark table.
dk = master.loc[~master['callable']].copy()
reason_lookup = dark[['compound_id', 'organism_dark_reason']]
dk = dk.merge(reason_lookup, on='compound_id', how='left')

n_dk_rows, n_master_rows = len(dk), len(master)
print(f'organism-dark compounds: {n_dk_rows} / {n_master_rows} '
      f'({100*n_dk_rows/n_master_rows:.0f}%)')

# Distinct dark compounds per source.
print('\nby compound source:')
dark_per_source = dk.groupby('source_short')['compound_id'].nunique()
print(dark_per_source.to_string())

# Distinct dark compounds per NPC pathway, largest class first.
print('\nby NPC pathway (top dark classes):')
dark_per_pathway = dk.groupby('npc_pathway')['compound_id'].nunique()
print(dark_per_pathway.sort_values(ascending=False).to_string())

# Frequency of each recorded dark reason.
print('\nby dark reason:')
print(dk['organism_dark_reason'].value_counts().to_string())

organism-dark compounds: 74 / 83 (89%)

by compound source:
source_short
groundwater    53
necromass      21

by NPC pathway (top dark classes):
npc_pathway
Alkaloids                                     24
Shikimates and Phenylpropanoids               20
Terpenoids                                    16
Fatty acids                                    9
Amino acids and Peptides                       2
Polyketides                                    2
Alkaloids, Shikimates and Phenylpropanoids     1

by dark reason:
organism_dark_reason
kegg_no_rxn_in_genomes          33
no_kegg                         29
only_biosynthetic_signatures     6
only_generic_rxns                6

# Columns exported to the master census table, in output order.
summary_cols = ['compound_id', 'name', 'source_short', 'npc_pathway',
                'best_tier', 'deep_tier', 'rescued', 'callable', 'callable_basis',
                'n_strains', 'n_high', 'n_genera', 'n_field', 'top_field_prev']
out = master[summary_cols].copy()

# Add the organism-dark reason where the dark table has one for the compound.
dark_reasons = dark[['compound_id', 'organism_dark_reason']]
out = out.merge(dark_reasons, on='compound_id', how='left')

# Callable compounds first, then by descending strain count; renumber rows from 0.
out = out.sort_values(['callable', 'n_strains'], ascending=[False, False])
out = out.reset_index(drop=True)

out.to_csv(DATA / 'census_master_summary.tsv', sep='\t', index=False)
print(f'wrote data/census_master_summary.tsv {out.shape}')

# Headline report: collect every line first, then emit them in a single print.
rule = '=' * 60
headline_lines = [
    '',  # leading blank line before the banner
    rule,
    'ENIGMA CARBON CENSUS — HEADLINE',
    rule,
    f'compounds in census              : {n_total}',
    f'structure-resolved               : {int(n_struct)}',
    f'KEGG-linked                      : {int(n_kegg)}',
    f'CALLABLE (organism call OR measured): {n_callable}',
    f'  of which enigma_isolate_call     : {len(enigma_ids)}',
    f'  of which measured_fitness only   : {n_measured_only}  (lauric acid; FB ref org, not ENIGMA)',
    f'ORGANISM-DARK (discovery targets): {n_dark}  <- the actionable gap',
    f'utilizer strains placed (b)      : {n_placed}  ({n_high} high-certainty)',
    f'utilizer genera in SSO field (c) : {n_field_genera}',
    # Hypothesis verdicts are fixed text, not computed in this cell.
    'H2 (modularity): co-occurrence not supported beyond the shared aromatic funnel; '
    'phylogenetic concentration in Burkholderiales IS supported.',
    'H3 (source-tracking): UNTESTABLE/CONFOUNDED (2/8 necromass, both phthalate-class).',
]
print('\n'.join(headline_lines))

wrote data/census_master_summary.tsv (83, 15)

============================================================
ENIGMA CARBON CENSUS — HEADLINE
============================================================
compounds in census              : 83
structure-resolved               : 83
KEGG-linked                      : 54
CALLABLE (organism call OR measured): 9
  of which enigma_isolate_call     : 8
  of which measured_fitness only   : 1  (lauric acid; FB ref org, not ENIGMA)
ORGANISM-DARK (discovery targets): 74  <- the actionable gap
utilizer strains placed (b)      : 359  (64 high-certainty)
utilizer genera in SSO field (c) : 62
H2 (modularity): co-occurrence not supported beyond the shared aromatic funnel; phylogenetic concentration in Burkholderiales IS supported.
H3 (source-tracking): UNTESTABLE/CONFOUNDED (2/8 necromass, both phthalate-class).

08 Synthesis

NB08 — Synthesis: Three Deliverables + the Honest Gap Map

The discovery funnel (83 → callable)

Per-compound master summary

The organism-dark gap list (the headline result)

Write the master census table