import os, re, json
import pandas as pd
from pathlib import Path
from pyspark.sql import SparkSession, functions as F

DATA = Path('../data'); FIG = Path('../figures')
pd.set_option('display.max_columns', 40); pd.set_option('display.width', 200)

res = pd.read_csv(DATA / 'resolved_compounds.tsv', sep='\t')
print('resolved compounds:', len(res))
res['ik14'] = res['inchikey'].str.slice(0, 14)  # connectivity block (stereo-insensitive)
res[['compound_id', 'name', 'npc_pathway', 'inchikey', 'kegg_id']].head()

resolved compounds: 83

# On-cluster Spark Connect (token from env; never printed)
_tok = os.environ['KBASE_AUTH_TOKEN']
spark = SparkSession.builder.remote(
    f'sc://jupyter-aparkin.jupyterhub-prod:15002/;use_ssl=false;x-kbase-token={_tok}'
).getOrCreate()
print('spark connected')

spark connected

SALT_WORDS = ['hydrochloride', 'monohydrate', 'dihydrate', 'sodium salt', 'lithium salt',
              'potassium salt', 'sodium', 'potassium', 'lithium', 'hydrate', 'hcl',
              'acid', 'salt']
STEREO = re.compile(r'\([^)]*\)|\b[dlrs]\b|±|\+/-|rac-', re.IGNORECASE)
def norm(s):
    s = str(s).lower()
    s = STEREO.sub(' ', s)
    for w in SALT_WORDS:
        s = s.replace(w, ' ')
    return re.sub(r'[^a-z0-9]', '', s)

# pull distinct carbon-source conditions with the organisms that used them
fb = (spark.table('kescience_fitnessbrowser.experiment')
      .filter(F.lower('expGroup') == 'carbon source')
      .filter(F.col('condition_1').isNotNull())
      .groupBy('condition_1').agg(
          F.countDistinct('orgId').alias('n_orgs'),
          F.count('*').alias('n_exps'),
          F.concat_ws(',', F.collect_set('orgId')).alias('orgs'))
      .toPandas())
fb['norm'] = fb['condition_1'].map(norm)
print('distinct FB carbon-source conditions:', len(fb))
fb_by_norm = fb.groupby('norm').agg(
    fb_conditions=('condition_1', lambda x: ' | '.join(sorted(set(x)))),
    fb_n_orgs=('n_orgs', 'sum'), fb_n_exps=('n_exps', 'sum'),
    fb_orgs=('orgs', lambda x: ','.join(sorted(set(','.join(x).split(',')))))).reset_index()

distinct FB carbon-source conditions: 154

# build normalized match keys for our compounds: name + slash alternates
def name_norms(name):
    parts = [p.strip() for p in str(name).split('/')] if '/' in str(name) else [str(name)]
    return sorted({norm(p) for p in parts if norm(p)})
res['name_norms'] = res['name'].map(name_norms)
fbmap = dict(zip(fb_by_norm['norm'], fb_by_norm.index))
def fb_match(norms):
    for nm in norms:
        if nm in fbmap:
            return fb_by_norm.loc[fbmap[nm]]
    return None
t1 = []
for _, r in res.iterrows():
    m = fb_match(r['name_norms'])
    t1.append(dict(compound_id=r['compound_id'],
                   fb_carbon=m is not None,
                   fb_conditions=m['fb_conditions'] if m is not None else None,
                   fb_n_orgs=int(m['fb_n_orgs']) if m is not None else 0,
                   fb_n_exps=int(m['fb_n_exps']) if m is not None else 0,
                   fb_orgs=m['fb_orgs'] if m is not None else None))
t1 = pd.DataFrame(t1)
print('compounds matched to FB carbon source (T1 measured):', int(t1['fb_carbon'].sum()))
print(t1[t1['fb_carbon']][['compound_id', 'fb_conditions', 'fb_n_orgs', 'fb_n_exps']].to_string(index=False))

compounds matched to FB carbon source (T1 measured): 1
                compound_id fb_conditions  fb_n_orgs  fb_n_exps
POULHZVOKOAJMA-UHFFFAOYSA-N   Lauric acid          1          2

mol = (spark.table('kbase_msd_biochemistry.molecule')
       .select('id', 'inchikey', F.col('name').alias('msd_name'))
       .filter(F.col('inchikey').isNotNull()))
mol = mol.withColumn('ik14', F.substring('inchikey', 1, 14)).toPandas()
# reaction participation counts per molecule id
reag = (spark.table('kbase_msd_biochemistry.reagent')
        .groupBy('molecule_id').agg(F.countDistinct('reaction_id').alias('n_rxn'))
        .toPandas())
rxn_by_mol = dict(zip(reag['molecule_id'], reag['n_rxn']))
ik_exact = {ik: i for i, ik in zip(mol['id'], mol['inchikey'])}
ik14_map = mol.groupby('ik14')['id'].apply(list).to_dict()
def msd_lookup(ik, ik14):
    mid = ik_exact.get(ik)
    how = 'inchikey' if mid else None
    if not mid and ik14 in ik14_map:
        mid = ik14_map[ik14][0]; how = 'connectivity'
    if not mid:
        return None, 0, None
    return mid, int(rxn_by_mol.get(mid, 0)), how
t2 = []
for _, r in res.iterrows():
    mid, nrxn, how = msd_lookup(r['inchikey'], r['ik14'])
    t2.append(dict(compound_id=r['compound_id'], msd_cpd=mid,
                   msd_match=how, msd_n_rxn=nrxn))
t2 = pd.DataFrame(t2)
print('compounds matched to a ModelSEED metabolite:', int(t2['msd_cpd'].notna().sum()))
print('  of those, with >=1 reaction:', int((t2['msd_n_rxn'] > 0).sum()))
print(t2['msd_match'].value_counts(dropna=False).to_string())

compounds matched to a ModelSEED metabolite: 62
  of those, with >=1 reaction: 55
msd_match
connectivity    33
inchikey        29
NaN             21

GENERIC = {'indole', 'choline', 'biotin', 'xanthine', 'tyramine', 'cadaverine',
           'guaiacol', 'anethole', 'acridone', 'manool', 'carveol'}
# search terms: full name (and slash alternates), len>=5
terms = []
for _, r in res.iterrows():
    seen = set()
    for p in (str(r['name']).split('/') if '/' in str(r['name']) else [str(r['name'])]):
        t = p.strip().lower()
        if len(t) >= 5 and t not in seen:
            seen.add(t)
            terms.append((r['compound_id'], t, t in GENERIC))
terms_df = spark.createDataFrame(pd.DataFrame(terms, columns=['compound_id', 'term', 'generic']))
cur = spark.table('kescience_paperblast.curatedgene').select(
    F.lower('desc').alias('descl'), 'desc')
print('curatedgene rows:', cur.count())
hits = (terms_df.join(cur, F.col('descl').contains(F.col('term')))
        .groupBy('compound_id', 'generic')
        .agg(F.count('*').alias('lit_hits'),
             F.first('desc').alias('lit_example'))
        .toPandas())
t5 = hits.groupby('compound_id').agg(
    lit_hits=('lit_hits', 'sum'),
    lit_generic=('generic', 'max'),
    lit_example=('lit_example', 'first')).reset_index()
print('compounds with >=1 curated literature mention:', len(t5))
print('  (of those, name is generic/low-precision):', int(t5['lit_generic'].sum()))

curatedgene rows: 255096

compounds with >=1 curated literature mention: 33
  (of those, name is generic/low-precision): 8

link = res[['compound_id', 'name', 'npc_pathway', 'inchikey', 'kegg_id', 'chebi_id']].copy()
link['has_kegg'] = link['kegg_id'].notna()  # T3 entry point
link = link.merge(t1, on='compound_id', how='left')
link = link.merge(t2, on='compound_id', how='left')
link = link.merge(t5, on='compound_id', how='left')
link['fb_carbon'] = link['fb_carbon'].fillna(False)
link['msd_rxn'] = link['msd_n_rxn'].fillna(0) > 0
link['lit'] = link['lit_hits'].fillna(0) > 0
link['lit_strong'] = link['lit'] & ~link['lit_generic'].fillna(False)
# any linkage = any channel fired (literature counts even if low-precision)
link['any_linkage'] = link[['fb_carbon', 'msd_rxn', 'has_kegg', 'lit']].any(axis=1)
# best tier (most specific evidence available)
def best_tier(r):
    if r['fb_carbon']: return 'T1_measured'
    if r['msd_rxn']:   return 'T2_3_reaction'
    if r['has_kegg']:  return 'T3_kegg'
    if r['lit_strong']:return 'T5_lit'
    if r['lit']:       return 'T5_lit_lowprec'
    return 'T0_dark'
link['best_tier'] = link.apply(best_tier, axis=1)
print(link['best_tier'].value_counts().to_string())
print('\nany linkage:', int(link['any_linkage'].sum()), '/', len(link))
print('DARK (no linkage):', int((~link['any_linkage']).sum()))

best_tier
T2_3_reaction    54
T0_dark          21
T3_kegg           6
T5_lit            1
T1_measured       1

any linkage: 62 / 83
DARK (no linkage): 21

g = link.groupby('npc_pathway')
cov = pd.DataFrame({
    'n': g.size(),
    'T1_fb': g['fb_carbon'].sum(),
    'T2_3_msd_rxn': g['msd_rxn'].sum(),
    'T3_kegg': g['has_kegg'].sum(),
    'T5_lit_any': g['lit'].sum(),
    'any': g['any_linkage'].sum(),
}).astype(int)
cov['dark'] = cov['n'] - cov['any']
cov['pct_any'] = (100 * cov['any'] / cov['n']).round(0).astype(int)
cov = cov.sort_values('n', ascending=False)
print(cov.to_string())
print('\nTOTAL any-linkage: %d/%d (%d%%)' % (
    link['any_linkage'].sum(), len(link),
    round(100*link['any_linkage'].sum()/len(link))))

                                             n  T1_fb  T2_3_msd_rxn  T3_kegg  T5_lit_any  any  dark  pct_any
npc_pathway                                                                                                 
Alkaloids                                   26      0            15       14          11   17     9       65
Shikimates and Phenylpropanoids             25      0            18       20          12   21     4       84
Terpenoids                                  17      0            10        8           6   11     6       65
Fatty acids                                 10      1             9        8           4    9     1       90
Amino acids and Peptides                     2      0             2        2           0    2     0      100
Polyketides                                  2      0             0        1           0    1     1       50
Alkaloids, Shikimates and Phenylpropanoids   1      0             1        1           0    1     0      100

TOTAL any-linkage: 62/83 (75%)

# the dark compounds — explicit knowledge-gap list
dark = link[~link['any_linkage']][['compound_id', 'name', 'npc_pathway', 'inchikey']]
print('DARK compounds (T0 — no database linkage on any channel):', len(dark))
print(dark.to_string(index=False))

DARK compounds (T0 — no database linkage on any channel): 21
                compound_id                                          name                     npc_pathway                    inchikey
                    Cc1_102 1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid                       Alkaloids NBSNSQJYXCXGLK-UHFFFAOYSA-N
                     Cc1_22            2-hydroxy-4,7,8-trimethylquinoline                       Alkaloids QRSYDKNRNVBMFQ-UHFFFAOYSA-N
                     Cc1_46                          3,6-dimethylchromone                     Polyketides CNWMARIMEBTYMR-UHFFFAOYSA-N
                     Cc1_39           7-hydroxy-4,8-dimethylchromen-2-one Shikimates and Phenylpropanoids MVMMGVPSTRNMSV-UHFFFAOYSA-N
                     Cc1_34                       2',5'-dihydroxychalcone Shikimates and Phenylpropanoids PZVRZRARFZZBCA-UHFFFAOYSA-N
                    Cc1_106           2-Isopropyl-5-methylcyclohexanamine                      Terpenoids RBMUAGDCCJDQLE-UHFFFAOYSA-N
                    Cc1_266                                Triacetonamine                       Alkaloids JWUXJYZVKZKLTJ-UHFFFAOYSA-N
                     Cc1_96            4,6,8-trimethylhydroquinolin-2-one                       Alkaloids UHXASMSNKAVYHU-UHFFFAOYSA-N
                     Cc1_42                4-hydroxy-1-methyl-2-quinolone                       Alkaloids RTNPPPQVXREFKX-UHFFFAOYSA-N
                     Cc1_24                1,4-dimethylquinolin-2(1h)-one                       Alkaloids CEONKCOBRZOYJS-UHFFFAOYSA-N
                     Cc1_66                                Brassylic acid                     Fatty acids DXNCZXXFRKPEPY-UHFFFAOYSA-N
                    Cc1_113                          6-aminochromen-2-one Shikimates and Phenylpropanoids ZOJAINJCZSVZGW-UHFFFAOYSA-N
                    Cc1_104                      (±)-Dihydroactinidiolide                      Terpenoids IMKHDCBNRDRUEB-LLVKDONJSA-N
                     Cc1_99                                (1R)-(-)-Nopol                      Terpenoids ROKSAUSPJGWCSM-UWVGGRQHSA-N
                     Cc1_36                                ketopinic acid                      Terpenoids WDODWBQJVMBHCO-UHFFFAOYSA-N
GDNYELSKMFYCBB-UHFFFAOYSA-N       2,3-dihydro-1H-indole-7-carboxylic acid                       Alkaloids GDNYELSKMFYCBB-UHFFFAOYSA-N
QNRXNRGSOJZINA-UHFFFAOYSA-N                (2S)indoline-2-carboxylic acid                       Alkaloids QNRXNRGSOJZINA-UHFFFAOYSA-N
VZJHQHUOVIDRCF-DGMCESFYSA-N                             (-)-Isolongifolol                      Terpenoids VZJHQHUOVIDRCF-DGMCESFYSA-N
NVEQFIOZRFFVFW-UHFFFAOYSA-N                         b-Caryophyllene oxide                      Terpenoids NVEQFIOZRFFVFW-UHFFFAOYSA-N
QWJIWJWEGVAMEB-UHFFFAOYSA-N   1H-pyrrolo[3,2-b]pyridine-5-carboxylic acid                       Alkaloids QWJIWJWEGVAMEB-UHFFFAOYSA-N
AFDXODALSZRGIH-QPJJXVBHSA-N                        4-Methoxycinnamic acid Shikimates and Phenylpropanoids AFDXODALSZRGIH-QPJJXVBHSA-N

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
chan = ['T1_fb', 'T2_3_msd_rxn', 'T3_kegg', 'T5_lit_any']
labels = ['T1 FB carbon', 'T2/3 ModelSEED rxn', 'T3 KEGG', 'T5 literature']
classes = cov.index.tolist()
M = cov[chan].div(cov['n'], axis=0).values * 100
fig, ax = plt.subplots(figsize=(8, 4.8))
im = ax.imshow(M, cmap='viridis', aspect='auto', vmin=0, vmax=100)
ax.set_xticks(range(len(labels))); ax.set_xticklabels(labels, rotation=20, ha='right')
ax.set_yticks(range(len(classes)))
ax.set_yticklabels([f'{c} (n={cov.loc[c,"n"]})' for c in classes])
for i in range(len(classes)):
    for j in range(len(chan)):
        ax.text(j, i, f'{int(cov[chan].values[i,j])}', ha='center', va='center',
                color='w' if M[i, j] < 60 else 'k', fontsize=9)
ax.set_title('Carbon Census: pathway-linkage coverage by class\n(cell = # compounds; color = % of class)')
fig.colorbar(im, label='% of class'); fig.tight_layout()
fig.savefig(FIG / '02_linkage_coverage.png', dpi=150); plt.close(fig)
print('saved 02_linkage_coverage.png')

saved 02_linkage_coverage.png

out_cols = ['compound_id', 'name', 'npc_pathway', 'inchikey', 'kegg_id', 'chebi_id',
            'fb_carbon', 'fb_conditions', 'fb_n_orgs', 'fb_orgs',
            'msd_cpd', 'msd_match', 'msd_n_rxn', 'msd_rxn',
            'has_kegg', 'lit', 'lit_hits', 'lit_generic', 'lit_strong', 'lit_example',
            'any_linkage', 'best_tier']
link[out_cols].to_csv(DATA / 'compound_linkage.tsv', sep='\t', index=False)
print('wrote data/compound_linkage.tsv', link.shape)
link[['compound_id', 'name', 'npc_pathway', 'best_tier', 'any_linkage']].head(15)

wrote data/compound_linkage.tsv (83, 23)

02 Pathway Linkage

NB02 — Multi-Channel Pathway Linkage (Phase-1 stop-gate)¶

Channel T1 — Fitness Browser carbon-source experiments (measured)¶

Channel T2/3 — ModelSEED reactions (InChIKey join)¶

Channel T5 — PaperBLAST curated-gene literature mentions (low-precision)¶

Assemble the linkage matrix¶

STOP-GATE: linkage coverage by chemical class¶

Figure + save linkage matrix¶

	compound_id	name	npc_pathway	inchikey	kegg_id
0	Cc1_29	harman	Alkaloids	PSFDQSOCUJVVGF-UHFFFAOYSA-N	C09209
1	Cc1_102	1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid	Alkaloids	NBSNSQJYXCXGLK-UHFFFAOYSA-N	NaN
2	Cc1_22	2-hydroxy-4,7,8-trimethylquinoline	Alkaloids	QRSYDKNRNVBMFQ-UHFFFAOYSA-N	NaN
3	Cc1_46	3,6-dimethylchromone	Polyketides	CNWMARIMEBTYMR-UHFFFAOYSA-N	NaN
4	Cc1_39	7-hydroxy-4,8-dimethylchromen-2-one	Shikimates and Phenylpropanoids	MVMMGVPSTRNMSV-UHFFFAOYSA-N	NaN