import os, re, json, time
import pandas as pd, requests
from pathlib import Path
# Directory holding the project's tabular inputs and outputs, relative to the
# working directory the notebook is run from.
DATA = Path('../data')

# Widen pandas' console rendering so wide frames print without being folded.
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 200)

# Load the resolved compound table and index compound names by the first 14
# characters of their InChIKey. Rows that share a 14-character prefix collapse
# to the last name seen, because later dict entries overwrite earlier ones.
res = pd.read_csv(DATA / 'resolved_compounds.tsv', sep='\t')
res['ik14'] = res['inchikey'].str[:14]
ours14 = {prefix: compound for prefix, compound in zip(res['ik14'], res['name'])}
print('resolved compounds:', len(res))

resolved compounds: 83

from pyspark.sql import SparkSession, functions as F

# Spark Connect session. The auth token is read from the environment (a
# KeyError is raised if KBASE_AUTH_TOKEN is unset) and passed as a parameter
# of the connection string, so never print or log that string.
_tok = os.environ['KBASE_AUTH_TOKEN']
spark = SparkSession.builder.remote(
    f'sc://jupyter-aparkin.jupyterhub-prod:15002/;use_ssl=false;x-kbase-token={_tok}'
).getOrCreate()

# One row per distinct, non-null condition_1 among experiments whose expGroup
# is 'carbon source' (case-insensitive), with the number of distinct organisms
# and a comma-joined list of their ids.
fb = (spark.table('kescience_fitnessbrowser.experiment')
      .filter(F.lower('expGroup') == 'carbon source')
      .filter(F.col('condition_1').isNotNull())
      .groupBy('condition_1').agg(
          F.countDistinct('orgId').alias('n_orgs'),
          # collect_set gives no ordering guarantee; sort so the joined string
          # is identical from run to run.
          F.concat_ws(',', F.sort_array(F.collect_set('orgId'))).alias('orgs'))
      # Grouped output has no defined row order either; sort so the TSV written
      # below is reproducible.
      .orderBy('condition_1')
      .toPandas())
fb.to_csv(DATA / 'fb_carbon_conditions.tsv', sep='\t', index=False)
print('distinct FB carbon-source conditions:', len(fb))

distinct FB carbon-source conditions: 154

# JSON file that memoises name -> InChIKey lookups between runs. Start from an
# empty mapping when the file does not exist yet.
cache_path = DATA / 'fb_pubchem_cache.json'
if cache_path.exists():
    cache = json.loads(cache_path.read_text())
else:
    cache = {}
# Substrings removed from a lower-cased compound name by clean(). Order
# matters: they are applied top to bottom, so multi-word forms must precede
# the shorter tokens they contain (e.g. 'sodium salt' before 'sodium').
SALT = [
    'disodium salt',
    'monopotassium salt',
    'dihydrochloride',
    'hydrochloride',
    'monohydrate',
    'dihydrate',
    'hexahydrate',
    'pentahydrate',
    'sodium salt',
    'lithium salt',
    'potassium salt',
    'sodium',
    'potassium',
    'lithium',
    ' salt',
    ' hydrate',
    ' dibasic',
    ' basic',
]


def clean(s):
    """Return a simplified, lower-cased form of a compound name.

    ``s`` is converted with ``str()``, every ``(...)`` group is deleted, the
    text is lower-cased, each ``SALT`` substring is removed in list order, and
    leading/trailing spaces, commas, semicolons and hyphens are stripped.
    Whitespace left in the middle of the name by a removal is not collapsed.
    """
    text = re.sub(r'\([^)]*\)', '', str(s)).lower()
    for token in SALT:
        if token in text:
            text = text.replace(token, '')
    return text.strip(' ,;-')
def pubchem_ik(name):
    """Look up the InChIKey that PubChem reports for a compound name.

    The raw ``name`` is queried first, then ``clean(name)``; the first query
    answered with HTTP 200 supplies the key. Results are memoised in the
    module-level ``cache`` dict under the original ``name``.

    A miss (``None``) is memoised only when every query was answered with
    HTTP 404. If any attempt failed for another reason (network error,
    timeout, other status code, unexpected payload) the miss is returned but
    not stored, so a later run retries it instead of inheriting a failure
    that may have been transient.

    Args:
        name: Compound name as it appears in the source table.

    Returns:
        The InChIKey of the first PubChem hit, or ``None`` if no query
        produced one.
    """
    if name in cache:
        return cache[name]

    # Raw name first, cleaned name second; drop empty strings and avoid
    # sending the same query twice when cleaning changes nothing.
    queries = []
    for q in (name, clean(name)):
        if q and q not in queries:
            queries.append(q)

    ik = None
    definitive = True  # stays True only while every miss is a plain 404
    for q in queries:
        try:
            u = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'
                 f'{requests.utils.quote(q)}/property/InChIKey/JSON')
            r = requests.get(u, timeout=20)
            if r.status_code == 200:
                ik = r.json()['PropertyTable']['Properties'][0]['InChIKey']
            elif r.status_code != 404:
                definitive = False
        except (requests.RequestException, ValueError, KeyError,
                IndexError, TypeError):
            # Best effort: a failed attempt must not abort the whole batch,
            # but it must not be recorded as "PubChem has no such compound".
            definitive = False
        # Pause after every attempt, successful or not, so consecutive
        # lookups stay spaced out.
        time.sleep(0.2)
        if ik is not None:
            break

    if ik is not None or definitive:
        cache[name] = ik
    return ik
# Resolve every FB condition name to an InChIKey (None when unresolved), then
# write the lookup cache back to disk so later runs can reuse it.
fb['ik'] = list(map(pubchem_ik, fb['condition_1']))
cache_path.write_text(json.dumps(cache))

# First 14 characters of the key, used for matching against our compounds.
fb['ik14'] = fb['ik'].str.slice(0, 14)

n_resolved = fb['ik'].notna().sum()
print('FB names resolved to InChIKey: %d/%d' % (n_resolved, len(fb)))

# List at most the first 25 unresolved names, alphabetically.
print('unresolved (polymers/mixtures):')
unresolved = sorted(fb[fb['ik'].isna()]['condition_1'])
print('  ' + ', '.join(unresolved[:25]))

FB names resolved to InChIKey: 134/154
unresolved (polymers/mixtures):
  1,4-B-D-Galactobiose, 2-Deoxyadenosine 5-monophosphate, Agro_defined_trehalose, Chondroitin sulfate A sodium salt from bovine trachea, Deoxyribonucleic acid from herring sperm, Gelatin, Hyaluronic acid sodium salt from Streptococcus equi, Mannan from Saccharomyces cerevisiae, Red Arabinan from sugar-beet, Rhamnogalacturonan - from potato, Sodium D,L-Lactate, Stachyose - 70%, Starch, Supernatant; Agrobacterium rhizogenes K599 grown in Agro_defined_trehalose (~7.5 mM 3-keto-trehalose), Tween 20, amylopectin from maize, casamino acids, dextran, Mw ~200,000, m-Inositol, polygalacturonic acid

# Map each FB 14-character InChIKey prefix to the name of our compound that
# shares it; rows without a prefix, or without a counterpart, get None.
fb['match'] = fb['ik14'].map(lambda k: None if pd.isna(k) else ours14.get(k))

is_hit = fb['match'].notna()
hits = fb[is_hit].copy()
hits.to_csv(DATA / 'fb_inchikey_matches.tsv', sep='\t', index=False)
print('Tier-1 compounds by structural (InChIKey) match:', hits['match'].nunique())
print(hits[['condition_1', 'n_orgs', 'match']].to_string(index=False))

# Compare against the name-based Tier-1 set from NB02: the rows of
# compound_linkage.tsv whose fb_carbon flag is set.
link = pd.read_csv(DATA / 'compound_linkage.tsv', sep='\t')
flagged = link[link['fb_carbon']]
name_t1 = set(flagged['name'])
ik_t1 = set(hits['match'])
print('\nname-based Tier-1 set :', sorted(name_t1))
print('InChIKey Tier-1 set    :', sorted(ik_t1))
print('recovered ONLY by InChIKey (name-match missed):', sorted(ik_t1 - name_t1))
print('in name set but not InChIKey:', sorted(name_t1 - ik_t1))

Tier-1 compounds by structural (InChIKey) match: 1
condition_1  n_orgs       match
Lauric acid       1 lauric acid

name-based Tier-1 set : ['lauric acid']
InChIKey Tier-1 set    : ['lauric acid']
recovered ONLY by InChIKey (name-match missed): []
in name set but not InChIKey: []

02c FB InChIKey Rematch

NB02c — Tier-1 robustness: FB carbon-source match by InChIKey (review I1)

Pull distinct FB carbon-source conditions (Spark)

Resolve FB names → InChIKey (PubChem, cached)

Structural match: FB InChIKey-14 ↔ our 83 compounds

Conclusion (I1)