import pandas as pd
import requests
import re, json, time
from pathlib import Path

# Input/output locations, relative to the notebook's working directory
DATA = Path('../data')
FIG = Path('../figures')

# Widen pandas' console rendering so wide tables print without truncation
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 200)

# Load the selected compounds and flag which ids already have the
# 14-10-1 uppercase-letter layout matched by INCHIKEY_RE.
comp = pd.read_csv(DATA / 'compounds_selected.tsv', sep='\t')
print('compounds:', len(comp))
INCHIKEY_RE = re.compile(r'^[A-Z]{14}-[A-Z]{10}-[A-Z]$')
# na=False keeps the flag strictly boolean: without it a missing compound_id
# yields NaN, which breaks the `~` below and is truthy in a plain `if`.
comp['id_is_inchikey'] = comp['compound_id'].str.match(INCHIKEY_RE, na=False)
print('ids that are already InChIKeys:', int(comp['id_is_inchikey'].sum()))
print('ids that are Cc1_xx (name-only):', int((~comp['id_is_inchikey']).sum()))

compounds: 83
ids that are already InChIKeys: 24
ids that are Cc1_xx (name-only): 59

# Base URL of the PubChem PUG-REST service
B = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# Responses are memoised on disk (keyed by request URL) between runs
CACHE_PATH = DATA / 'pubchem_cache.json'
if CACHE_PATH.exists():
    _cache = json.loads(CACHE_PATH.read_text())
else:
    _cache = {}

# One shared HTTP session for all requests
_sess = requests.Session()

def _is_transient_status(status):
    '''True for outcomes worth retrying: request error (-1), 408, 429 or any 5xx.'''
    return status == -1 or status in (408, 429) or status >= 500


def _get(url):
    '''Fetch *url* through the shared session, memoising the outcome in ``_cache``.

    Returns a dict with:
      * 'status' -- the HTTP status code, or -1 if the request (or JSON
        decoding of a 200 response) raised;
      * 'json'   -- the decoded body for HTTP 200, otherwise None;
      * 'error'  -- the exception text, present only when status is -1.

    Only definitive outcomes are cached. Transient failures (network errors,
    timeouts, throttling, server errors) are returned to the caller but not
    stored, and such entries already in the cache are ignored, so a later
    run retries them instead of treating the compound as permanently
    unresolvable.
    '''
    cached = _cache.get(url)
    if cached is not None and not _is_transient_status(cached.get('status', -1)):
        return cached
    time.sleep(0.22)  # stay under 5 req/s
    try:
        r = _sess.get(url, timeout=30)
        out = {'status': r.status_code,
               'json': r.json() if r.status_code == 200 else None}
    except Exception as e:
        # Deliberately best-effort: one failed lookup must not abort the batch.
        out = {'status': -1, 'json': None, 'error': str(e)}
    if _is_transient_status(out['status']):
        # Drop any stale transient entry so it is not written back to disk.
        _cache.pop(url, None)
    else:
        _cache[url] = out
    return out

def _save_cache():
    '''Serialise the in-memory response cache to CACHE_PATH as JSON.'''
    payload = json.dumps(_cache)
    CACHE_PATH.write_text(payload)

# Comma-separated property list requested from the PUG-REST property endpoint
PROPS = 'IUPACName,MolecularFormula,InChIKey,SMILES'

def props_by_inchikey(ik):
    '''Return the first property record PubChem reports for InChIKey *ik*.

    Returns None when the request does not come back with status 200.
    '''
    resp = _get(f'{B}/compound/inchikey/{ik}/property/{PROPS}/JSON')
    if resp['status'] != 200:
        return None
    records = resp['json']['PropertyTable']['Properties']
    return records[0]

def cid_by_name(name):
    '''Resolve a compound *name* to a PubChem CID.

    Returns the first CID in the response's identifier list, or None when
    the request does not return status 200 or the list is empty.
    '''
    from urllib.parse import quote
    url = f'{B}/compound/name/{quote(name)}/cids/JSON'
    resp = _get(url)
    if resp['status'] != 200:
        return None
    cids = resp['json'].get('IdentifierList', {}).get('CID', [])
    if not cids:
        return None
    return cids[0]

def props_by_cid(cid):
    '''Return the first property record PubChem reports for *cid*.

    Returns None when the request does not come back with status 200.
    '''
    resp = _get(f'{B}/compound/cid/{cid}/property/{PROPS}/JSON')
    if resp['status'] != 200:
        return None
    records = resp['json']['PropertyTable']['Properties']
    return records[0]

# A KEGG compound/drug accession: 'C' or 'D' followed by exactly five digits
KEGG_RE = re.compile(r'^[CD]\d{5}$')
def xrefs_by_cid(cid):
    '''Pull KEGG and ChEBI cross-references out of a CID's synonym list.

    Returns a ``(kegg, chebi)`` tuple: the first synonym matching KEGG_RE and
    the first synonym starting with 'CHEBI:' (case-insensitive), each None
    when absent. Returns ``(None, None)`` if the request is not status 200.
    '''
    resp = _get(f'{B}/compound/cid/{cid}/synonyms/JSON')
    if resp['status'] != 200:
        return None, None
    synonyms = resp['json']['InformationList']['Information'][0].get('Synonym', [])

    kegg = None
    for s in synonyms:
        if KEGG_RE.match(s):
            kegg = s
            break

    chebi = None
    for s in synonyms:
        if s.upper().startswith('CHEBI:'):
            chebi = s
            break

    return kegg, chebi
print('helpers ready; cache entries:', len(_cache))

helpers ready; cache entries: 223

# Leading stereo/racemic descriptors: any run of '(...)' groups (optionally
# followed by '-'), 'D-'/'L-' or 'rac-' at the start of the name.
STEREO_RE = re.compile(r'^(\([^)]*\)-?|[DL]-|rac-)+', re.IGNORECASE)
def name_candidates(name):
    '''Return the ordered, de-duplicated list of names to try for *name*.

    The name is split on '/' (alias separator) and each part is stripped of
    surrounding whitespace. After those, each part with its leading
    stereo/racemic descriptors removed is appended as a fallback,
    e.g. (±)-, (S)-(-)-, D-.

    A missing name (None or float NaN) or one that is empty after stripping
    yields an empty list, so no placeholder such as 'nan' or '' is ever
    offered as a lookup candidate.
    '''
    # str() would turn a missing value into the literal text 'None' / 'nan'.
    if name is None or (isinstance(name, float) and name != name):
        return []
    cands = []
    for part in str(name).split('/'):
        part = part.strip()
        if part and part not in cands:
            cands.append(part)
    # fallback: strip leading stereo/racemic descriptors, e.g. (±)-, (S)-(-)-, D-
    for part in list(cands):
        bare = STEREO_RE.sub('', part).strip()
        if bare and bare not in cands:
            cands.append(bare)
    return cands

# Show how a handful of awkward names expand into lookup candidates
messy_names = (
    'Mellein / Ochracin',
    'Decanedioic acid/Sebacic acid',
    '1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid',
    '2-Hydroxychalcone ',
)
for nm in messy_names:
    print(f'{nm!r} -> {name_candidates(nm)}')

'Mellein / Ochracin' -> ['Mellein', 'Ochracin']
'Decanedioic acid/Sebacic acid' -> ['Decanedioic acid', 'Sebacic acid']
'1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid' -> ['1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid']
'2-Hydroxychalcone ' -> ['2-Hydroxychalcone']

# Resolve every compound to a PubChem record.
# `resolved_via` records how each row was resolved:
#   'inchikey'      - the id is an InChIKey and PubChem returned properties
#   'inchikey_only' - the id is an InChIKey but the PubChem lookup failed
#   'name'          - resolved through one of the name candidates
#   'failed'        - no candidate produced a record
rows = []
try:
    for _, r in comp.iterrows():
        cid = ik = formula = smiles = iupac = kegg = chebi = None
        via = 'failed'
        if r['id_is_inchikey']:
            ik0 = r['compound_id']
            p = props_by_inchikey(ik0)
            if p:
                via = 'inchikey'
                cid = p.get('CID'); ik = p.get('InChIKey', ik0)
                formula = p.get('MolecularFormula'); smiles = p.get('SMILES')
                iupac = p.get('IUPACName')
            else:
                ik = ik0  # keep the structural id even if PubChem lookup failed
                via = 'inchikey_only'
        else:
            # Try candidates in order; stop at the first one that yields both
            # a CID and a property record.
            for cand in name_candidates(r['name']):
                c = cid_by_name(cand)
                if c:
                    p = props_by_cid(c)
                    if p:
                        via = 'name'; cid = c
                        ik = p.get('InChIKey'); formula = p.get('MolecularFormula')
                        smiles = p.get('SMILES'); iupac = p.get('IUPACName')
                        break
        if cid is not None:
            kegg, chebi = xrefs_by_cid(cid)
        rows.append(dict(compound_id=r['compound_id'], name=r['name'],
                         npc_pathway=r['npc_pathway'], resolved_via=via,
                         cid=cid, inchikey=ik, molecular_formula=formula,
                         smiles=smiles, iupac_name=iupac, kegg_id=kegg, chebi_id=chebi))
finally:
    # Persist whatever was fetched even if a lookup raised part-way through,
    # so a re-run does not have to repeat the throttled requests.
    _save_cache()
res = pd.DataFrame(rows)
print('cache entries after run:', len(_cache))
print(res['resolved_via'].value_counts().to_string())

cache entries after run: 226
resolved_via
name        59
inchikey    24

# Flag which identifiers each compound ended up with
for flag_col, id_col in (('has_structure', 'inchikey'),
                         ('has_kegg', 'kegg_id'),
                         ('has_chebi', 'chebi_id')):
    res[flag_col] = res[id_col].notna()
n = len(res)

# Overall coverage, one line per identifier type (labels padded to 20 columns)
totals = (
    ('structure (InChIKey)', res['has_structure']),
    ('SMILES', res['smiles'].notna()),
    ('KEGG id', res['has_kegg']),
    ('ChEBI id', res['has_chebi']),
)
for label, present in totals:
    print(f'{label:<20}: {present.sum():2d}/{n}')
print()

# The same coverage broken down by npc_pathway, largest groups first
cov = res.groupby('npc_pathway').agg(**{
    'n': ('compound_id', 'size'),
    'structure': ('has_structure', 'sum'),
    'kegg': ('has_kegg', 'sum'),
    'chebi': ('has_chebi', 'sum'),
})
cov = cov.sort_values('n', ascending=False)
print(cov.to_string())

structure (InChIKey): 83/83
SMILES              : 83/83
KEGG id             : 54/83
ChEBI id            : 64/83

                                             n  structure  kegg  chebi
npc_pathway                                                           
Alkaloids                                   26         26    14     19
Shikimates and Phenylpropanoids             25         25    20     21
Terpenoids                                  17         17     8     11
Fatty acids                                 10         10     8     10
Amino acids and Peptides                     2          2     2      2
Polyketides                                  2          2     1      0
Alkaloids, Shikimates and Phenylpropanoids   1          1     1      1

# List compounds that ended up without an InChIKey; these need manual curation
unresolved_mask = ~res['has_structure']
report_cols = ['compound_id', 'name', 'npc_pathway', 'resolved_via']
missing = res[unresolved_mask][report_cols]
print('UNRESOLVED (no structure):', len(missing))
print(missing.to_string(index=False))

UNRESOLVED (no structure): 0
Empty DataFrame
Columns: [compound_id, name, npc_pathway, resolved_via]
Index: []

# Columns handed on to the next notebook; the has_* helper flags are
# intentionally left out of the written table.
out_cols = ['compound_id', 'name', 'npc_pathway', 'resolved_via', 'cid',
            'inchikey', 'molecular_formula', 'smiles', 'iupac_name',
            'kegg_id', 'chebi_id']
resolved = res[out_cols]
resolved.to_csv(DATA / 'resolved_compounds.tsv', sep='\t', index=False)
# Report the shape of what was actually written, not of `res`
# (which also carries the has_* flag columns).
print('wrote data/resolved_compounds.tsv', resolved.shape)
resolved.head(12)

wrote data/resolved_compounds.tsv (83, 14)

	compound_id	name	npc_pathway	resolved_via	cid	inchikey	molecular_formula	smiles	iupac_name	kegg_id	chebi_id
0	Cc1_29	harman	Alkaloids	name	5281404	PSFDQSOCUJVVGF-UHFFFAOYSA-N	C12H10N2	CC1=NC=CC2=C1NC3=CC=CC=C23	1-methyl-9H-pyrido[3,4-b]indole	C09209	CHEBI:5623
1	Cc1_102	1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid	Alkaloids	name	28599882	NBSNSQJYXCXGLK-UHFFFAOYSA-N	C12H11NO2	C=CCN1C=C(C2=CC=CC=C21)C(=O)O	1-prop-2-enylindole-3-carboxylic acid	NaN	NaN
2	Cc1_22	2-hydroxy-4,7,8-trimethylquinoline	Alkaloids	name	864589	QRSYDKNRNVBMFQ-UHFFFAOYSA-N	C12H13NO	CC1=C(C2=C(C=C1)C(=CC(=O)N2)C)C	4,7,8-trimethyl-1H-quinolin-2-one	NaN	NaN
3	Cc1_46	3,6-dimethylchromone	Polyketides	name	688894	CNWMARIMEBTYMR-UHFFFAOYSA-N	C11H10O2	CC1=CC2=C(C=C1)OC=C(C2=O)C	3,6-dimethylchromen-4-one	NaN	NaN
4	Cc1_39	7-hydroxy-4,8-dimethylchromen-2-one	Shikimates and Phenylpropanoids	name	5356597	MVMMGVPSTRNMSV-UHFFFAOYSA-N	C11H10O3	CC1=CC(=O)OC2=C1C=CC(=C2C)O	7-hydroxy-4,8-dimethylchromen-2-one	NaN	NaN
5	Cc1_71	Fraxetin	Shikimates and Phenylpropanoids	name	5273569	HAVWRBANWNTOJX-UHFFFAOYSA-N	C10H8O5	COC1=C(C(=C2C(=C1)C=CC(=O)O2)O)O	7,8-dihydroxy-6-methoxychromen-2-one	C09265	CHEBI:5169
6	Cc1_34	2',5'-dihydroxychalcone	Shikimates and Phenylpropanoids	name	3526360	PZVRZRARFZZBCA-UHFFFAOYSA-N	C15H12O3	C1=CC=C(C=C1)C=CC(=O)C2=C(C=CC(=C2)O)O	1-(2,5-dihydroxyphenyl)-3-phenylprop-2-en-1-one	NaN	NaN
7	Cc1_60	Phthalic acid mono-2-ethylhexyl ester	Shikimates and Phenylpropanoids	name	20393	DJDSLBVSSOQSLW-UHFFFAOYSA-N	C16H22O4	CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)O	2-(2-ethylhexoxycarbonyl)benzoic acid	C03343	CHEBI:17243
8	Cc1_106	2-Isopropyl-5-methylcyclohexanamine	Terpenoids	name	89497	RBMUAGDCCJDQLE-UHFFFAOYSA-N	C10H21N	CC1CCC(C(C1)N)C(C)C	5-methyl-2-propan-2-ylcyclohexan-1-amine	NaN	NaN
9	CECREIRZLPLYDM-UHFFFAOYSA-N	Manool	Terpenoids	inchikey	238792	CECREIRZLPLYDM-UHFFFAOYSA-N	C20H34O	CC1(CCCC2(C1CCC(=C)C2CCC(C)(C=C)O)C)C	5-(5,5,8a-trimethyl-2-methylidene-3,4,4a,6,7,8...	NaN	NaN
10	KWILGNNWGSNMPA-UHFFFAOYSA-N	Mellein / Ochracin	Shikimates and Phenylpropanoids	inchikey	28516	KWILGNNWGSNMPA-UHFFFAOYSA-N	C10H10O3	CC1CC2=C(C(=CC=C2)O)C(=O)O1	8-hydroxy-3-methyl-3,4-dihydroisochromen-1-one	NaN	CHEBI:38760
11	Cc1_216	(-)-Perillyl alcohol	Terpenoids	name	369312	NDTYTMIUWGWIMO-SNVBAGLBSA-N	C10H16O	CC(=C)[C@H]1CCC(=CC1)CO	[(4S)-4-prop-1-en-2-ylcyclohexen-1-yl]methanol	C02452	CHEBI:10782

01 Identity Resolution

NB01 — Compound Identity Resolution (PubChem)¶

PubChem PUG-REST helpers (cached)¶

Build name candidates for the 59 name-only compounds¶

Resolve all 83 compounds¶

Coverage report¶

Save resolved table for NB02¶