import pandas as pd, numpy as np
import requests, json, time, re
from pathlib import Path
import matplotlib; matplotlib.use('Agg'); import matplotlib.pyplot as plt

# Input/output directories, given relative to the current working directory.
DATA = Path('../data')
FIG = Path('../figures')

# Let pandas print wide frames: up to 40 columns and 200 characters per line.
for _opt, _val in (('display.max_columns', 40), ('display.width', 200)):
    pd.set_option(_opt, _val)

# Read both tab-separated inputs from the data directory.
res, lk = (pd.read_csv(DATA / fname, sep='\t')
           for fname in ('resolved_compounds.tsv', 'compound_linkage.tsv'))

# Attach each compound's SMILES and IUPAC name to its linkage row.
structure_cols = res[['compound_id','smiles','iupac_name']]
lk = lk.merge(structure_cols, how='left', on='compound_id')

# "Dark" compounds are the rows whose best evidence tier is T0_dark.
is_dark = lk['best_tier'] == 'T0_dark'
dark = lk[is_dark].copy()

n_kegg = int(lk['has_kegg'].sum())
print('compounds:', len(lk), '| dark:', len(dark), '| KEGG-bearing:', n_kegg)
print('dark all have SMILES:', dark['smiles'].notna().all())

compounds: 83 | dark: 21 | KEGG-bearing: 54
dark all have SMILES: True

# JSON file that persists HTTP responses between runs.
CACHE_PATH = DATA / 'deepen_cache.json'

# Start from the saved cache when the file exists, otherwise from nothing.
if CACHE_PATH.exists():
    _cache = json.loads(CACHE_PATH.read_text())
else:
    _cache = {}

# One HTTP session shared by every request made below.
_sess = requests.Session()

def _get(url, params=None, kind='text'):
    """Fetch ``url`` through the module-level response cache.

    Args:
        url: Endpoint to request.
        params: Optional dict of query parameters; they are folded into the
            cache key (sorted by name) and passed to the HTTP call.
        kind: ``'json'`` decodes the body of a 200 response as JSON; any
            other value keeps the raw response text.

    Returns:
        ``{'status': <int>, 'body': <text or decoded JSON>}``. When the
        request or the JSON decoding fails the result is
        ``{'status': -1, 'body': None, 'error': <message>}``.

    Transient failures (status -1, 429 and 5xx) are returned to the caller
    but never stored, and a stored transient failure is retried instead of
    being served. Otherwise one network hiccup would be written to disk by
    ``_save`` and replayed as "no data" on every later run.
    """
    def _transient(status):
        return status == -1 or status == 429 or status >= 500

    key = url + ('?' + '&'.join(f'{k}={v}' for k, v in sorted(params.items())) if params else '')
    cached = _cache.get(key)
    if cached is not None and not _transient(cached.get('status', -1)):
        return cached
    time.sleep(0.34)  # polite: KEGG <3 req/s, NCBI <3 req/s anonymous
    try:
        r = _sess.get(url, params=params, timeout=30)
        body = r.json() if (kind == 'json' and r.status_code == 200) else r.text
        out = {'status': r.status_code, 'body': body}
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a 200 response whose body is not valid JSON.
        out = {'status': -1, 'body': None, 'error': str(e)}
    if _transient(out['status']):
        # Drop any stale failure so it is not persisted again.
        _cache.pop(key, None)
    else:
        _cache[key] = out
    return out

def _save():
    """Write the in-memory response cache to ``CACHE_PATH`` as JSON.

    The JSON goes to a sibling ``*.tmp`` file first and is then moved over
    the real cache file, so an interrupted run cannot leave a truncated file
    that would fail to parse when the cache is loaded next time.
    """
    tmp_path = CACHE_PATH.with_name(CACHE_PATH.name + '.tmp')
    tmp_path.write_text(json.dumps(_cache))
    tmp_path.replace(CACHE_PATH)

# Base URLs of the two REST services queried below.
KEGG, EUTILS = ('https://rest.kegg.jp',
                'https://eutils.ncbi.nlm.nih.gov/entrez/eutils')
print('cache entries:', len(_cache))

cache entries: 211

# Build {map number -> pathway name} from KEGG's pathway listing, then keep
# the maps whose name mentions degradation or catabolism.
r = _get(f'{KEGG}/list/pathway')
if r['status'] != 200 or not r['body']:
    # Fail loudly: parsing an error page would leave DEGRAD empty and every
    # degradation count computed from it would silently be zero.
    raise RuntimeError(f"KEGG pathway list request failed (status {r['status']})")
MAP_NAME = {}
for line in r['body'].strip().splitlines():
    pid, tab, nm = line.partition('\t')
    if not tab:
        continue  # not an "<id>\t<name>" record
    # Normalise 'path:map00010' / 'map00010' to the bare number '00010'.
    MAP_NAME[pid.replace('path:','').replace('map','')] = nm
DEGRAD = {mid: nm for mid, nm in MAP_NAME.items()
          if re.search(r'degrad|catabol', nm, re.I)}
_save()
print('total KEGG maps:', len(MAP_NAME), '| degradation maps:', len(DEGRAD))
print('e.g.:', {k: DEGRAD[k] for k in list(DEGRAD)[:6]})

total KEGG maps: 586 | degradation maps: 28
e.g.: {'01220': 'Degradation of aromatic compounds', '00071': 'Fatty acid degradation', '00280': 'Valine, leucine and isoleucine degradation', '00310': 'Lysine degradation', '00531': 'Glycosaminoglycan degradation', '00511': 'Other glycan degradation'}

def kegg_reactions(cnum):
    """Return the sorted, de-duplicated KEGG reaction ids linked to ``cnum``.

    A non-200 response or a blank body gives an empty list. Lines without a
    tab are ignored; the ``rn:`` prefix is removed from each id.
    """
    resp = _get(f'{KEGG}/link/reaction/{cnum}')
    if resp['status'] != 200:
        return []
    text = resp['body'].strip()
    if not text:
        return []
    rxn_ids = set()
    for record in text.splitlines():
        if '\t' not in record:
            continue
        rxn_ids.add(record.split('\t')[1].replace('rn:',''))
    return sorted(rxn_ids)

def kegg_pathways(cnum):
    """Return the sorted, de-duplicated KEGG map numbers linked to ``cnum``.

    A non-200 response or a blank body gives an empty list. The ``path:``
    and ``map`` markers are removed so ids match the keys of ``MAP_NAME``.
    """
    resp = _get(f'{KEGG}/link/pathway/{cnum}')
    text = resp['body'].strip() if resp['status'] == 200 else ''
    if not text:
        return []
    map_ids = {
        record.split('\t')[1].replace('path:','').replace('map','')
        for record in text.splitlines()
        if '\t' in record
    }
    return sorted(map_ids)

# Channel A: for every compound flagged has_kegg, count its KEGG reactions,
# pathways, and the subset of pathways that are degradation maps.
rowsA = []
kegg_rows = lk.loc[lk['has_kegg'], ['compound_id', 'kegg_id']]
for compound_id, raw_kegg in zip(kegg_rows['compound_id'], kegg_rows['kegg_id']):
    kegg_id = str(raw_kegg)
    reactions = kegg_reactions(kegg_id)
    pathways = kegg_pathways(kegg_id)
    degradation = [pw for pw in pathways if pw in DEGRAD]
    rowsA.append({
        'compound_id': compound_id,
        'kegg_id': kegg_id,
        'kegg_n_rxn': len(reactions),
        'kegg_n_pw': len(pathways),
        'kegg_n_degpw': len(degradation),
        'kegg_deg_maps': ';'.join(f'{pw}:{DEGRAD[pw]}' for pw in degradation),
    })
_save()
A = pd.DataFrame(rowsA)

has_rxn = A['kegg_n_rxn'] > 0
in_degpw = A['kegg_n_degpw'] > 0
print('KEGG-bearing with >=1 real reaction:', int(has_rxn.sum()), '/', len(A))
print('KEGG-bearing in >=1 DEGRADATION pathway:', int(in_degpw.sum()))
print()
# List the degradation-pathway compounds with their names and classes.
show = A[in_degpw].merge(lk[['compound_id','name','npc_pathway']], on='compound_id')
print(show[['name','npc_pathway','kegg_n_rxn','kegg_deg_maps']].to_string(index=False))

KEGG-bearing with >=1 real reaction: 37 / 54
KEGG-bearing in >=1 DEGRADATION pathway: 16

                             name                     npc_pathway  kegg_n_rxn                                                                                                                                    kegg_deg_maps
             (-)-Perillyl alcohol                      Terpenoids           3                                                                                                                       00903:Limonene degradation
                       Cadaverine                       Alkaloids          11                                                                                                                         00310:Lysine degradation
          3-hydroxybenzyl alcohol Shikimates and Phenylpropanoids           2                                                                                00623:Toluene degradation;01220:Degradation of aromatic compounds
3-(2-Hydroxyphenyl)propionic acid Shikimates and Phenylpropanoids           5                                                                                                00624:Polycyclic aromatic hydrocarbon degradation
            4-hydroxybenzaldehyde Shikimates and Phenylpropanoids          12                                                            00363:Bisphenol degradation;00623:Toluene degradation;00627:Aminobenzoate degradation
                   salicylic acid Shikimates and Phenylpropanoids          17 00621:Dioxin degradation;00624:Polycyclic aromatic hydrocarbon degradation;00626:Naphthalene degradation;01220:Degradation of aromatic compounds
            3-hydroxybenzoic acid Shikimates and Phenylpropanoids           9   00362:Benzoate degradation;00623:Toluene degradation;00624:Polycyclic aromatic hydrocarbon degradation;01220:Degradation of aromatic compounds
       2-Hydroxyphenylacetic acid Shikimates and Phenylpropanoids           4                                                                                                                        00643:Styrene degradation
                         guaiacol Shikimates and Phenylpropanoids           4                                                                                                                  00627:Aminobenzoate degradation
                    Cinnamic acid Shikimates and Phenylpropanoids          15                                                                                                          01220:Degradation of aromatic compounds
                     caffeic acid Shikimates and Phenylpropanoids           9                                                                                                          01220:Degradation of aromatic compounds
                    Palmitic Acid                     Fatty acids          14                                                                                                                     00071:Fatty acid degradation
       3-Hydroxyphenylacetic acid Shikimates and Phenylpropanoids           4                                                                                                                        00643:Styrene degradation
                TEREPHTHALIC ACID Shikimates and Phenylpropanoids           3                        00624:Polycyclic aromatic hydrocarbon degradation;00627:Aminobenzoate degradation;01220:Degradation of aromatic compounds
                    phthalic acid Shikimates and Phenylpropanoids           7                                                        00624:Polycyclic aromatic hydrocarbon degradation;01220:Degradation of aromatic compounds
                        D-camphor                      Terpenoids           4                                                                                                   00907:Pinene, camphor and geraniol degradation

def chebi_to_kegg(chebi):
    """Translate a ChEBI identifier into a KEGG compound number.

    Args:
        chebi: ChEBI id in any of the forms ``'CHEBI:15377'``, ``'15377'``,
            ``15377`` or ``15377.0``. The float form appears when pandas
            reads an id column that contains missing values.

    Returns:
        The KEGG compound number from the first tab-separated line of the
        KEGG ``conv`` response (``cpd:`` prefix removed), or ``None`` when
        the id is not numeric, the request fails, or nothing is mapped.
    """
    cid = str(chebi).split(':')[-1].strip()
    # '15377.0' -> '15377'; otherwise the query 'chebi:15377.0' never matches.
    whole, dot, frac = cid.partition('.')
    if dot and frac.strip('0') == '':
        cid = whole
    if not cid.isdigit():
        # Not a usable id (e.g. 'nan'); skip the request entirely.
        return None
    r = _get(f'{KEGG}/conv/compound/chebi:{cid}')
    if r['status'] == 200 and r['body'].strip():
        for ln in r['body'].strip().splitlines():
            if '\t' in ln:
                return ln.split('\t')[1].replace('cpd:', '')
    return None

def kegg_find(name):
    """Look up a compound name with KEGG's ``find/compound`` search.

    Args:
        name: Compound name; it is converted with ``str`` and stripped.
            Names shorter than four characters are not searched.

    Returns:
        A KEGG compound number (``cpd:`` prefix removed) or ``None``. A hit
        whose ';'-separated name list contains the query exactly (ignoring
        case) is preferred; if there is none, the first hit is returned.
    """
    from urllib.parse import quote
    nm = str(name).strip()
    if len(nm) < 4:
        return None
    # safe='' also escapes '/', which would otherwise be read as a path
    # separator in the request URL.
    encoded = quote(nm, safe='')
    r = _get(f'{KEGG}/find/compound/{encoded}')
    if r['status'] != 200 or not r['body'].strip():
        return None
    wanted = nm.casefold()
    first_hit = None
    for ln in r['body'].strip().splitlines():
        entry, _, names = ln.partition('\t')
        cnum = entry.replace('cpd:', '')
        if first_hit is None:
            first_hit = cnum
        if any(syn.strip().casefold() == wanted for syn in names.split(';')):
            return cnum
    # NOTE(review): the fallback is a keyword hit, not a confirmed identity
    # match — consider requiring an exact name match before treating the
    # result as evidence.
    return first_hit

# Channel A2: try to give each dark compound a KEGG compound number, first
# through its ChEBI id, then by searching its name and its IUPAC name.
rowsA2 = []
for _, row in dark.iterrows():
    knum, via = None, None

    chebi = row.get('chebi_id')
    if pd.notna(chebi):
        knum = chebi_to_kegg(chebi)
        if knum:
            via = 'chebi_bridge'

    if not knum:
        for candidate in (row['name'], row.get('iupac_name')):
            if pd.isna(candidate):
                continue
            knum = kegg_find(candidate)
            if knum:
                via = 'name_search'
                break

    if knum:
        rxns = kegg_reactions(knum)
        pws = kegg_pathways(knum)
        deg = [p for p in pws if p in DEGRAD]
        nrxn, ndeg = len(rxns), len(deg)
        degmaps = ';'.join(f'{d}:{DEGRAD[d]}' for d in deg)
    else:
        nrxn, ndeg, degmaps = 0, 0, ''

    rowsA2.append({
        'compound_id': row['compound_id'],
        'name': row['name'],
        'new_kegg': knum,
        'kegg_via': via,
        'kegg_n_rxn': nrxn,
        'kegg_n_degpw': ndeg,
        'kegg_deg_maps': degmaps,
    })
_save()
A2 = pd.DataFrame(rowsA2)

newly_mapped = A2['new_kegg'].notna()
print('dark compounds newly mapped to a KEGG compound:', int(newly_mapped.sum()), '/', len(A2))
print(A2[newly_mapped][['name','new_kegg','kegg_via','kegg_n_rxn','kegg_n_degpw']].to_string(index=False))

dark compounds newly mapped to a KEGG compound: 0 / 21
Empty DataFrame
Columns: [name, new_kegg, kegg_via, kegg_n_rxn, kegg_n_degpw]
Index: []

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import rdMolDescriptors, DataStructs
from rdkit import RDLogger; RDLogger.DisableLog('rdApp.*')

def mol(s):
    """Parse SMILES ``s`` with RDKit; anything but a non-empty str gives None."""
    if not isinstance(s, str) or not s:
        return None
    return Chem.MolFromSmiles(s)
def real_scaffold(m):
    '''SMILES of the Bemis-Murcko scaffold of ``m`` (heteroatoms + aromaticity preserved).

    Gives None when ``m`` is None, when the scaffold is missing or has no
    atoms, or when RDKit raises while computing it.
    '''
    if m is None:
        return None
    try:
        core = MurckoScaffold.GetScaffoldForMol(m)
        has_atoms = core is not None and core.GetNumAtoms() != 0
        return Chem.MolToSmiles(core) if has_atoms else None
    except Exception:
        return None
def fp(m):
    """Morgan bit-vector fingerprint of ``m`` (arguments 2 and 2048); None stays None."""
    if m is None:
        return None
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 2, 2048)

# Derive, in order, the parsed molecule, its scaffold SMILES and its
# fingerprint; the last two are computed from the 'mol' column.
for _dst, _src, _fn in (('mol', 'smiles', mol),
                        ('rscaf', 'mol', real_scaffold),
                        ('fp', 'mol', fp)):
    lk[_dst] = lk[_src].map(_fn)

# Minimum Tanimoto similarity for an analog to be accepted.
TANI_MIN = 0.40
# Candidate analogs: every compound whose tier is not T0_dark.
linked = lk[lk['best_tier'] != 'T0_dark'].copy()

# Channel B: for each dark compound, look among the linked compounds with the
# identical scaffold SMILES for the most similar one at or above TANI_MIN.
rowsB = []
for _, d in lk[lk['best_tier']=='T0_dark'].iterrows():
    dscaf, dfp = d['rscaf'], d['fp']
    best = {'analog': None, 'kind': None, 'tier': '', 'pw': '', 'tani': 0.0}
    if dscaf is not None and dfp is not None:
        candidates = linked[linked['rscaf'] == dscaf]
        for _, a in candidates.iterrows():
            afp = a['fp']
            if afp is None:
                continue
            t = DataStructs.TanimotoSimilarity(dfp, afp)
            # Skip anything below the cut-off or not better than the stored
            # (rounded) best score.
            if t < TANI_MIN or t <= best['tani']:
                continue
            best = {'analog': a['name'], 'kind': 'same_real_scaffold',
                    'tier': a['best_tier'], 'pw': a['npc_pathway'],
                    'tani': round(t,3)}
    rowsB.append({
        'compound_id': d['compound_id'],
        'name': d['name'],
        'npc_pathway': d['npc_pathway'],
        'analog': best['analog'],
        'analog_kind': best['kind'],
        'analog_tier': best['tier'],
        'analog_pathway': best['pw'],
        'analog_tani': best['tani'],
    })
B = pd.DataFrame(rowsB)

has_analog = B['analog'].notna()
print(f'dark with chemically-meaningful analog (same real scaffold & Tanimoto>={TANI_MIN}):',
      int(has_analog.sum()), '/', len(B))
print(B[has_analog][['name','analog','analog_tani','analog_tier','analog_pathway']].to_string(index=False))

dark with chemically-meaningful analog (same real scaffold & Tanimoto>=0.4): 2 / 21
                  name           analog  analog_tani   analog_tier                  analog_pathway
  3,6-dimethylchromone 6-methylchromone        0.545       T3_kegg                     Polyketides
4-Methoxycinnamic acid    Cinnamic acid        0.571 T2_3_reaction Shikimates and Phenylpropanoids

# Case-insensitive screens applied to article titles: one for
# degradation/metabolism wording, one for microbial wording.
DEG_WORDS = re.compile(r'degrad|catabol|biotransform|metaboli|biodegrad',
                       flags=re.IGNORECASE)
MICROBE = re.compile(r'bacter|microb|pseudomonas|rhodococcus|soil|strain|isolat',
                     flags=re.IGNORECASE)

def esearch(term, retmax=5):
    """Run a PubMed ESearch and return ``(hit_count, pmid_list)``.

    Args:
        term: PubMed query string.
        retmax: Maximum number of PMIDs to request.

    Returns:
        ``(count, idlist)`` from the ``esearchresult`` object, or
        ``(0, [])`` when the request fails, the body is not a JSON object,
        or PubMed reports that a phrase in the query could not be found.

    The not-found check matters because PubMed still answers such a query
    from the remaining terms, so the count no longer reflects the phrase
    that was asked for (an obscure compound name can then appear to have
    millions of hits).
    """
    r = _get(f'{EUTILS}/esearch.fcgi',
             params={'db':'pubmed','term':term,'retmode':'json','retmax':retmax}, kind='json')
    if r['status'] != 200 or not isinstance(r['body'], dict):
        return 0, []
    es = r['body'].get('esearchresult', {})
    for section in ('errorlist', 'warninglist'):
        report = es.get(section) or {}
        if not isinstance(report, dict):
            continue
        if report.get('phrasesnotfound') or report.get('quotedphrasesnotfound'):
            return 0, []
    return int(es.get('count', 0)), es.get('idlist', [])

def esummary_titles(pmids):
    """Return the PubMed ESummary title for each id in ``pmids``.

    Titles follow the order of the response's ``uids`` list; a record
    without a title contributes ''. An empty ``pmids``, a non-200 status or
    a body that is not a JSON object gives an empty list.
    """
    if not pmids:
        return []
    query = {'db':'pubmed','id':','.join(pmids),'retmode':'json'}
    r = _get(f'{EUTILS}/esummary.fcgi', params=query, kind='json')
    payload = r['body']
    if r['status'] != 200 or not isinstance(payload, dict):
        return []
    docs = payload.get('result', {})
    titles = []
    for uid in docs.get('uids', []):
        titles.append(docs[uid].get('title',''))
    return titles

def query_terms(name, iupac):
    """Build the de-duplicated list of search phrases for one compound.

    ``name`` and ``iupac`` are considered in that order. A value is skipped
    when it is not a string, is blank, or starts with ``'Cc1_'``. Runs of
    underscores become a single space and the result is stripped.
    """
    terms = []
    for raw in (name, iupac):
        if not isinstance(raw, str):
            continue
        if not raw.strip() or raw.startswith('Cc1_'):
            continue
        cleaned = re.sub(r'[_]+',' ', raw).strip()
        if cleaned in terms:
            continue
        terms.append(cleaned)
    return terms

# Channel C: for each dark compound, query PubMed with every candidate phrase
# and keep the phrase with the highest hit count; then count how many of its
# fetched titles pass both the degradation and the microbial screen.
rowsC = []
for _, d in dark.iterrows():
    top_n, top_ids, top_titles, top_term = 0, [], [], ''
    for core in query_terms(d['name'], d.get('iupac_name')):
        term = f'("{core}") AND (degradation OR catabolism OR biodegradation OR metabolism)'
        n, ids = esearch(term)
        if n <= top_n:
            continue
        top_n, top_ids, top_term = n, ids, core
        top_titles = esummary_titles(ids)
    n_screened = sum(1 for title in top_titles
                     if DEG_WORDS.search(title) and MICROBE.search(title))
    rowsC.append({
        'compound_id': d['compound_id'],
        'name': d['name'],
        'lit_n': top_n,
        'lit_screened': n_screened,
        'lit_term': top_term,
        'lit_top_pmids': ';'.join(top_ids[:3]),
        'lit_top_title': top_titles[0] if top_titles else '',
    })
_save()
C = pd.DataFrame(rowsC)

any_hit = C['lit_n'] > 0
print('dark with >=1 PubMed hit:', int(any_hit.sum()), '/', len(C))
print('dark with screened (degradation+microbial) title:', int((C['lit_screened']>0).sum()))
print()
print(C[any_hit][['name','lit_n','lit_screened','lit_top_title']].to_string(index=False))

dark with >=1 PubMed hit: 17 / 21
dark with screened (degradation+microbial) title: 0

                                         name   lit_n  lit_screened                                                                                                                                                                                        lit_top_title
1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid 2307435             0                                                                                                        Transformation characteristics of typical antidepressants in UV-advanced oxidation processes.
           2-hydroxy-4,7,8-trimethylquinoline    8802             0                                               Resistance training and diabetes mellitus type 2: An umbrella review of systematic reviews and meta-analyses on glycemic and cardiometabolic outcomes.
                         3,6-dimethylchromone 3545799             0                                                                Nitrate boosts arsenic accumulation in hyperaccumulator Pteris vittata by coupling nitrogen assimilation with arsenic detoxification.
          7-hydroxy-4,8-dimethylchromen-2-one     185             0                                                                                                                                      Intelligent Biopolymer-Based Films for Food Quality Monitoring.
                      2',5'-dihydroxychalcone       8             0                                                                                                                           Perturbing Endothelial Biomechanics via Connexin 43 Structural Disruption.
                               Triacetonamine       2             0                                                             Chemosensitizing Activity of Histone Deacetylases Inhibitory Cyclic Hydroxamic Acids for Combination Chemotherapy of Lymphatic Leukemia.
           4,6,8-trimethylhydroquinolin-2-one   23502             0                                                                           Probiotic-Mediated Vitamin D Supplementation Improves the Gut Microbiota and Vitamin D Receptor Composition in Obese Rats.
               1,4-dimethylquinolin-2(1h)-one 4772986             0                                                                Nitrate boosts arsenic accumulation in hyperaccumulator Pteris vittata by coupling nitrogen assimilation with arsenic detoxification.
                               Brassylic acid      12             0                                                                      Comparative metabolomics elucidates the early defense response mechanisms to Plutella xylostella infestation in Brassica napus.
                     (±)-Dihydroactinidiolide      29             0                                                                                Seasonal and spatial shifts in the volatile chemical profile of Cymodocea nodosa across marine and lagoon ecosystems.
                               (1R)-(-)-Nopol       1             0                                                                                                  Terpenes and Phenylpropanoids as Acetyl- and Butyrylcholinesterase Inhibitors: A Comparative Study.
                               ketopinic acid       3             0 Novel O-acylated amidoximes and substituted 1,2,4-oxadiazoles synthesised from (+)-ketopinic acid possessing potent virus-inhibiting activity against phylogenetically distinct influenza A viruses.
      2,3-dihydro-1H-indole-7-carboxylic acid 1018620             0                                                                                                        Transformation characteristics of typical antidepressants in UV-advanced oxidation processes.
               (2S)indoline-2-carboxylic acid       1             0                                                            Design, synthesis and structure-activity relationship of new arginine vasopressin analogues containing proline derivatives in position 2.
                            (-)-Isolongifolol       5             0                                                      Aromatherapy with Chrysanthemum morifolium cv. Chuju essential oil alleviates allergic rhinitis by modulating the mTOR-PPARγ signaling cascade.
                        b-Caryophyllene oxide       2             0                                                                                              β-caryophyllene reduces inflammation to protect against ischemic stroke by suppressing HMGB1 signaling.
                       4-Methoxycinnamic acid      58             0                                                                             A key SNP in the flavonoid biosynthesis pathway gene CsaV3_6G009030 is highly correlated with cucumber fruit bitterness.

# Assemble the augmented linkage table: start from lk without the RDKit
# helper columns, then left-join the results of channels A, A2, B and C on
# compound_id. The steps below overwrite columns in place, so their order
# matters.
aug = lk.drop(columns=['mol','rscaf','fp']).copy()
# Remember which compounds were dark before this notebook's channels ran.
aug['was_dark'] = aug['best_tier'] == 'T0_dark'

# fold KEGG reaction/degradation info (Channel A for KEGG-bearing)
aug = aug.merge(A[['compound_id','kegg_n_rxn','kegg_n_degpw','kegg_deg_maps']],
                on='compound_id', how='left')
# fold Channel A2 (dark newly resolved) — combine counts
# A2 shares column names with A, so its copies get a '2' suffix first.
a2 = A2.rename(columns={'kegg_n_rxn':'kegg_n_rxn2','kegg_n_degpw':'kegg_n_degpw2',
                        'kegg_deg_maps':'kegg_deg_maps2'})
aug = aug.merge(a2[['compound_id','new_kegg','kegg_via','kegg_n_rxn2','kegg_n_degpw2','kegg_deg_maps2']],
                on='compound_id', how='left')
# Rows without a match in A / A2 have NaN counts after the left joins;
# turn them into integer zeros.
for col in ['kegg_n_rxn','kegg_n_degpw','kegg_n_rxn2','kegg_n_degpw2']:
    aug[col] = aug[col].fillna(0).astype(int)
# Per compound, keep the larger of the Channel A and Channel A2 counts.
aug['kegg_n_rxn']   = aug[['kegg_n_rxn','kegg_n_rxn2']].max(axis=1)
aug['kegg_n_degpw'] = aug[['kegg_n_degpw','kegg_n_degpw2']].max(axis=1)
# Keep Channel A's map list when it is non-empty, otherwise use A2's
# (missing values count as empty strings).
aug['kegg_deg_maps'] = aug['kegg_deg_maps'].fillna('').where(
    aug['kegg_deg_maps'].fillna('')!='' , aug['kegg_deg_maps2'].fillna(''))
# fold Channel B (analogy overlay) and C (literature)
aug = aug.merge(B[['compound_id','analog','analog_kind','analog_tier','analog_pathway','analog_tani']],
                on='compound_id', how='left')
aug = aug.merge(C[['compound_id','lit_n','lit_screened','lit_top_pmids','lit_top_title']],
                on='compound_id', how='left')
# Compounds absent from C get literature counts of zero.
aug['lit_n'] = aug['lit_n'].fillna(0).astype(int)
aug['lit_screened'] = aug['lit_screened'].fillna(0).astype(int)

def hard_tier(r):
    """Assign the deepened evidence tier for one compound row.

    Args:
        r: Mapping-like row (dict or pandas Series) providing ``best_tier``,
            ``kegg_n_degpw``, ``kegg_n_rxn`` and, optionally,
            ``lit_screened``.

    Returns:
        The first tier that applies, checked from strongest to weakest:
        T1_measured, T2_kegg_degrad, T2_3_reaction, T2_3_kegg_rxn, T3_kegg,
        an existing T5_lit* tier other than T5_lit_lowprec, T5_lit_dark,
        T5_lit_lowprec, and finally T0_dark.
    """
    tier = r['best_tier']
    if tier == 'T1_measured':
        return 'T1_measured'
    if r['kegg_n_degpw'] > 0:
        return 'T2_kegg_degrad'
    if tier == 'T2_3_reaction':
        return 'T2_3_reaction'
    if r['kegg_n_rxn'] > 0:
        return 'T2_3_kegg_rxn'
    if tier == 'T3_kegg':
        return 'T3_kegg'
    # T5_lit_lowprec also starts with 'T5_lit'. It has to be excluded here,
    # or the dedicated low-precision check below can never run and lowprec
    # would outrank a screened-literature hit.
    if str(tier).startswith('T5_lit') and tier != 'T5_lit_lowprec':
        return tier
    if r.get('lit_screened', 0) > 0:
        return 'T5_lit_dark'
    if tier == 'T5_lit_lowprec':
        return 'T5_lit_lowprec'
    return 'T0_dark'

# Tier every row, then derive the boolean flags used in the summaries.
aug['deep_tier'] = aug.apply(hard_tier, axis=1)
aug['hard_linked'] = aug['deep_tier'] != 'T0_dark'
aug['has_analogy'] = aug['analog'].notna()

# Compounds that started dark and are still not hard-linked.
still_unlinked = aug['was_dark'] & ~aug['hard_linked']
aug['rescued'] = aug['was_dark'] & aug['hard_linked']
aug['analogy_only'] = still_unlinked & aug['has_analogy']
aug['deep_dark'] = still_unlinked & ~aug['has_analogy']

print('NB02 dark:', int(aug['was_dark'].sum()))
for label, flag in (
        ('  hard-rescued (gained measured/KEGG-rxn/lit):', 'rescued'),
        ('  analogy-only (structural hypothesis, still not hard-linked):', 'analogy_only'),
        ('  deep dark (no evidence at all):', 'deep_dark')):
    print(label, int(aug[flag].sum()))

NB02 dark: 21
  hard-rescued (gained measured/KEGG-rxn/lit): 0
  analogy-only (structural hypothesis, still not hard-linked): 2
  deep dark (no evidence at all): 19

# Hard tier distribution before (lk.best_tier) vs after (aug.deep_tier).
# `order` fixes the row order of the table and is reused by the figure.
order = ['T1_measured','T2_kegg_degrad','T2_3_reaction','T2_3_kegg_rxn','T3_kegg',
         'T5_lit_strong','T5_lit','T5_lit_dark','T5_lit_lowprec','T0_dark']
before = lk['best_tier'].value_counts()
after  = aug['deep_tier'].value_counts()
comp = pd.DataFrame({'NB02': before, 'NB02b': after}).reindex(order).fillna(0).astype(int)
print(comp.to_string())
print()
hl_before = int((lk['best_tier']!='T0_dark').sum())
hl_after  = int(aug['hard_linked'].sum())
# Denominators come from the tables themselves rather than a fixed 83, so the
# percentages stay correct when the compound list changes.
n_before, n_after = len(lk), len(aug)
print(f'HARD LINKAGE: {hl_before}/{n_before} ({hl_before/n_before:.0%}) -> '
      f'{hl_after}/{n_after} ({hl_after/n_after:.0%})')
print(f'  qualitative upgrade: {int((aug["deep_tier"]=="T2_kegg_degrad").sum())} compounds now in a confirmed KEGG DEGRADATION pathway')
print(f'ANALOGY OVERLAY: {int(aug["has_analogy"].sum())} dark compounds carry a structural hypothesis (NOT counted as linked)')

                NB02  NB02b
T1_measured        1      1
T2_kegg_degrad     0     16
T2_3_reaction     54     38
T2_3_kegg_rxn      0      1
T3_kegg            6      5
T5_lit_strong      0      0
T5_lit             1      1
T5_lit_dark        0      0
T5_lit_lowprec     0      0
T0_dark           21     21

HARD LINKAGE: 62/83 (75%) -> 62/83 (75%)
  qualitative upgrade: 16 compounds now in a confirmed KEGG DEGRADATION pathway
ANALOGY OVERLAY: 2 dark compounds carry a structural hypothesis (NOT counted as linked)

# Detail listings: degradation-pathway compounds, the analogy overlay, and
# the compounds that remain deep dark.
in_degrad = aug['deep_tier'] == 'T2_kegg_degrad'
print('=== CONFIRMED KEGG DEGRADATION PATHWAY (T2_kegg_degrad) ===')
print(aug.loc[in_degrad, ['name','npc_pathway','kegg_n_rxn','kegg_deg_maps']].to_string(index=False))
print()

print('=== ANALOGY OVERLAY (still dark, structural hypothesis only) ===')
ao = aug.loc[aug['analogy_only'],
             ['name','npc_pathway','analog','analog_tani','analog_tier','analog_pathway']]
print(ao.to_string(index=False))
print()

print('=== DEEP DARK (no evidence, no analog) ===')
print(aug.loc[aug['deep_dark'], ['compound_id','name','npc_pathway']].to_string(index=False))

=== CONFIRMED KEGG DEGRADATION PATHWAY (T2_kegg_degrad) ===
                             name                     npc_pathway  kegg_n_rxn                                                                                                                                    kegg_deg_maps
             (-)-Perillyl alcohol                      Terpenoids           3                                                                                                                       00903:Limonene degradation
                       Cadaverine                       Alkaloids          11                                                                                                                         00310:Lysine degradation
          3-hydroxybenzyl alcohol Shikimates and Phenylpropanoids           2                                                                                00623:Toluene degradation;01220:Degradation of aromatic compounds
3-(2-Hydroxyphenyl)propionic acid Shikimates and Phenylpropanoids           5                                                                                                00624:Polycyclic aromatic hydrocarbon degradation
            4-hydroxybenzaldehyde Shikimates and Phenylpropanoids          12                                                            00363:Bisphenol degradation;00623:Toluene degradation;00627:Aminobenzoate degradation
                   salicylic acid Shikimates and Phenylpropanoids          17 00621:Dioxin degradation;00624:Polycyclic aromatic hydrocarbon degradation;00626:Naphthalene degradation;01220:Degradation of aromatic compounds
            3-hydroxybenzoic acid Shikimates and Phenylpropanoids           9   00362:Benzoate degradation;00623:Toluene degradation;00624:Polycyclic aromatic hydrocarbon degradation;01220:Degradation of aromatic compounds
       2-Hydroxyphenylacetic acid Shikimates and Phenylpropanoids           4                                                                                                                        00643:Styrene degradation
                         guaiacol Shikimates and Phenylpropanoids           4                                                                                                                  00627:Aminobenzoate degradation
                    Cinnamic acid Shikimates and Phenylpropanoids          15                                                                                                          01220:Degradation of aromatic compounds
                     caffeic acid Shikimates and Phenylpropanoids           9                                                                                                          01220:Degradation of aromatic compounds
                    Palmitic Acid                     Fatty acids          14                                                                                                                     00071:Fatty acid degradation
       3-Hydroxyphenylacetic acid Shikimates and Phenylpropanoids           4                                                                                                                        00643:Styrene degradation
                TEREPHTHALIC ACID Shikimates and Phenylpropanoids           3                        00624:Polycyclic aromatic hydrocarbon degradation;00627:Aminobenzoate degradation;01220:Degradation of aromatic compounds
                    phthalic acid Shikimates and Phenylpropanoids           7                                                        00624:Polycyclic aromatic hydrocarbon degradation;01220:Degradation of aromatic compounds
                        D-camphor                      Terpenoids           4                                                                                                   00907:Pinene, camphor and geraniol degradation

=== ANALOGY OVERLAY (still dark, structural hypothesis only) ===
                  name                     npc_pathway           analog  analog_tani   analog_tier                  analog_pathway
  3,6-dimethylchromone                     Polyketides 6-methylchromone        0.545       T3_kegg                     Polyketides
4-Methoxycinnamic acid Shikimates and Phenylpropanoids    Cinnamic acid        0.571 T2_3_reaction Shikimates and Phenylpropanoids

=== DEEP DARK (no evidence, no analog) ===
                compound_id                                          name                     npc_pathway
                    Cc1_102 1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid                       Alkaloids
                     Cc1_22            2-hydroxy-4,7,8-trimethylquinoline                       Alkaloids
                     Cc1_39           7-hydroxy-4,8-dimethylchromen-2-one Shikimates and Phenylpropanoids
                     Cc1_34                       2',5'-dihydroxychalcone Shikimates and Phenylpropanoids
                    Cc1_106           2-Isopropyl-5-methylcyclohexanamine                      Terpenoids
                    Cc1_266                                Triacetonamine                       Alkaloids
                     Cc1_96            4,6,8-trimethylhydroquinolin-2-one                       Alkaloids
                     Cc1_42                4-hydroxy-1-methyl-2-quinolone                       Alkaloids
                     Cc1_24                1,4-dimethylquinolin-2(1h)-one                       Alkaloids
                     Cc1_66                                Brassylic acid                     Fatty acids
                    Cc1_113                          6-aminochromen-2-one Shikimates and Phenylpropanoids
                    Cc1_104                      (±)-Dihydroactinidiolide                      Terpenoids
                     Cc1_99                                (1R)-(-)-Nopol                      Terpenoids
                     Cc1_36                                ketopinic acid                      Terpenoids
GDNYELSKMFYCBB-UHFFFAOYSA-N       2,3-dihydro-1H-indole-7-carboxylic acid                       Alkaloids
QNRXNRGSOJZINA-UHFFFAOYSA-N                (2S)indoline-2-carboxylic acid                       Alkaloids
VZJHQHUOVIDRCF-DGMCESFYSA-N                             (-)-Isolongifolol                      Terpenoids
NVEQFIOZRFFVFW-UHFFFAOYSA-N                         b-Caryophyllene oxide                      Terpenoids
QWJIWJWEGVAMEB-UHFFFAOYSA-N   1H-pyrrolo[3,2-b]pyridine-5-carboxylic acid                       Alkaloids

def _count_not_dark(was_dark):
    """Number of False entries in a boolean was_dark column."""
    return int((~was_dark).sum())

# Per chemical class: size, hard-linked before/after, analogy-only count,
# largest class first.
byc = (
    aug.groupby('npc_pathway')
       .agg(n=('compound_id','size'),
            hard_before=('was_dark', _count_not_dark),
            hard_after=('hard_linked','sum'),
            analogy=('analogy_only','sum'))
       .sort_values('n', ascending=False)
)
# Whatever is neither hard-linked nor analogy-only is deep dark.
byc['deep_dark'] = byc['n'] - byc['hard_after'] - byc['analogy']
print(byc.to_string())

                                             n  hard_before  hard_after  analogy  deep_dark
npc_pathway                                                                                
Alkaloids                                   26           17          17        0          9
Shikimates and Phenylpropanoids             25           21          21        1          3
Terpenoids                                  17           11          11        0          6
Fatty acids                                 10            9           9        0          1
Amino acids and Peptides                     2            2           2        0          0
Polyketides                                  2            1           1        1          0
Alkaloids, Shikimates and Phenylpropanoids   1            1           1        0          0

# Two-panel summary figure, saved as 02b_linkage_deepened.png under FIG.
fig, axes = plt.subplots(1, 2, figsize=(13, 4.6))

# left: per-class stacked hard / analogy / deep-dark
x = np.arange(len(byc))
axes[0].barh(x, byc['hard_after'], color='#2c7fb8', label='hard-linked')
axes[0].barh(x, byc['analogy'], left=byc['hard_after'], color='#9e9ac8', label='analogy overlay')
axes[0].barh(x, byc['deep_dark'], left=byc['hard_after']+byc['analogy'], color='#d7301f', label='deep dark')
axes[0].set_yticks(x)
axes[0].set_yticklabels(byc.index, fontsize=8)
axes[0].invert_yaxis()  # first row of byc at the top
axes[0].set_xlabel('compounds')
axes[0].legend(fontsize=8)
axes[0].set_title('Per-class: hard linkage + analogy overlay + deep dark')

# right: hard tier distribution (tiers with no compounds are dropped)
tc = aug['deep_tier'].value_counts().reindex(order).dropna()
colors = {'T1_measured':'#08519c','T2_kegg_degrad':'#238b45','T2_3_reaction':'#41ab5d',
          'T2_3_kegg_rxn':'#74c476','T3_kegg':'#a1d99b','T5_lit_strong':'#fe9929',
          'T5_lit':'#fdae6b','T5_lit_dark':'#fec44f','T5_lit_lowprec':'#fee391','T0_dark':'#d7301f'}
axes[1].barh(range(len(tc)), tc.values, color=[colors.get(t,'#888') for t in tc.index])
axes[1].set_yticks(range(len(tc)))
axes[1].set_yticklabels(tc.index, fontsize=8)
axes[1].invert_yaxis()
axes[1].set_xlabel('compounds')
# The compound count is taken from the table instead of a hard-coded 83.
axes[1].set_title(f'Hard confidence tiers (n={len(aug)})')
for i, v in enumerate(tc.values):
    axes[1].text(v+0.3, i, str(int(v)), va='center', fontsize=8)

fig.tight_layout()
# savefig does not create missing directories, so make sure FIG exists.
FIG.mkdir(parents=True, exist_ok=True)
fig.savefig(FIG / '02b_linkage_deepened.png', dpi=150)
print('saved 02b_linkage_deepened.png')
plt.close(fig)

saved 02b_linkage_deepened.png

# Columns to export, in output order: identifiers, tier fields, KEGG fields,
# analogy fields, literature fields, and the remaining flags.
out_cols = [
    'compound_id', 'name', 'npc_pathway', 'inchikey', 'kegg_id', 'chebi_id',
    'best_tier', 'was_dark', 'deep_tier', 'rescued',
    'kegg_n_rxn', 'kegg_n_degpw', 'kegg_deg_maps', 'new_kegg', 'kegg_via',
    'analog', 'analog_kind', 'analog_tier', 'analog_pathway',
    'lit_n', 'lit_screened', 'lit_top_pmids', 'lit_top_title',
    'fb_carbon', 'msd_match', 'msd_n_rxn', 'has_kegg', 'lit_strong',
]
# Keep only the columns that `aug` actually has; missing ones are skipped
# rather than raising a KeyError.
available = aug.columns
out_cols = [col for col in out_cols if col in available]

deepened = aug[out_cols]
deepened.to_csv(DATA / 'compound_linkage_deepened.tsv', sep='\t', index=False)
print('wrote data/compound_linkage_deepened.tsv', deepened.shape)
deepened.head(8)

wrote data/compound_linkage_deepened.tsv (83, 28)

02B Linkage Deepening

NB02b — Deepening Phase-1 Pathway Linkage

Cached HTTP helper (KEGG + NCBI E-utilities)

Build KEGG degradation-map dictionary (one bulk call)

Channel A — KEGG reaction + degradation-pathway membership (54 KEGG-bearing)

Channel A2 — fresh KEGG resolution of the 21 dark (ChEBI bridge + name search)

Channel B — RDKit scaffold analogy (Tier-6 overlay, not hard linkage)

Channel C — compound-specific PubMed degradation literature (21 dark)

Integrate — hard linkage vs analogy overlay; dark set preserved

Coverage by class (hard linkage) + figure

Save deepened linkage table

	compound_id	name	npc_pathway	inchikey	kegg_id	chebi_id	best_tier	was_dark	deep_tier	rescued	kegg_n_rxn	new_kegg	kegg_via	analog	analog_kind	analog_tier	analog_pathway	lit_n	lit_top_pmids	lit_top_title	fb_carbon	msd_match	msd_n_rxn	has_kegg	lit_strong
0	Cc1_29	harman	Alkaloids	PSFDQSOCUJVVGF-UHFFFAOYSA-N	C09209	CHEBI:5623	T2_3_reaction	False	T2_3_reaction	False	1	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN	False	inchikey	1	True	True
1	Cc1_102	1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid	Alkaloids	NBSNSQJYXCXGLK-UHFFFAOYSA-N	NaN	NaN	T0_dark	True	T0_dark	False	0	None	None	NaN	NaN			2307435	42258980;42258977;42258945	Transformation characteristics of typical anti...	False	NaN	0	False	False
2	Cc1_22	2-hydroxy-4,7,8-trimethylquinoline	Alkaloids	QRSYDKNRNVBMFQ-UHFFFAOYSA-N	NaN	NaN	T0_dark	True	T0_dark	False	0	None	None	NaN	NaN			8802	42248329;42206798;42202165	Resistance training and diabetes mellitus type...	False	NaN	0	False	False
3	Cc1_46	3,6-dimethylchromone	Polyketides	CNWMARIMEBTYMR-UHFFFAOYSA-N	NaN	NaN	T0_dark	True	T0_dark	False	0	None	None	6-methylchromone	same_real_scaffold	T3_kegg	Polyketides	3545799	42258977;42258941;42258939	Nitrate boosts arsenic accumulation in hyperac...	False	NaN	0	False	False
4	Cc1_39	7-hydroxy-4,8-dimethylchromen-2-one	Shikimates and Phenylpropanoids	MVMMGVPSTRNMSV-UHFFFAOYSA-N	NaN	NaN	T0_dark	True	T0_dark	False	0	None	None	NaN	NaN			185	41901825;41508554;41482132	Intelligent Biopolymer-Based Films for Food Qu...	False	NaN	0	False	False
5	Cc1_71	Fraxetin	Shikimates and Phenylpropanoids	HAVWRBANWNTOJX-UHFFFAOYSA-N	C09265	CHEBI:5169	T2_3_reaction	False	T2_3_reaction	False	3	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN	False	inchikey	3	True	True
6	Cc1_34	2',5'-dihydroxychalcone	Shikimates and Phenylpropanoids	PZVRZRARFZZBCA-UHFFFAOYSA-N	NaN	NaN	T0_dark	True	T0_dark	False	0	None	None	NaN	NaN			8	31633688;22239485;21712085	Perturbing Endothelial Biomechanics via Connex...	False	NaN	0	False	False
7	Cc1_60	Phthalic acid mono-2-ethylhexyl ester	Shikimates and Phenylpropanoids	DJDSLBVSSOQSLW-UHFFFAOYSA-N	C03343	CHEBI:17243	T2_3_reaction	False	T2_3_reaction	False	1	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN	NaN	False	connectivity	1	True	False