01 Identity Resolution
Jupyter notebook from the ENIGMA Carbon Census 1 project.
NB01 — Compound Identity Resolution (PubChem)¶
Resolve each of the 83 compounds to a structural identifier (CID, InChIKey, SMILES, formula) plus KEGG/ChEBI cross-references — the bridge from the name-only source sheet to the structure- and pathway-keyed databases used downstream (NB02).
Two resolution channels:
- 24 compounds already carry an InChIKey as their compound_id (e.g. CECREIRZLPLYDM-... = Manool) → resolve by structure (most reliable).
- 59 carry Cc1_xx ids and only a name → resolve name → CID, trying whitespace-stripped and slash-split alternates.
All PubChem responses are disk-cached (data/pubchem_cache.json) so reruns are free and the notebook is reproducible offline once populated.
import pandas as pd
import requests
import re, json, time
from pathlib import Path
# Project-relative locations: tabular inputs/outputs and generated figures.
DATA = Path('../data')
FIG = Path('../figures')

# Widen pandas' console rendering so wide compound tables print unwrapped.
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 200)

# Load the selected-compound sheet (tab-separated).
comp = pd.read_csv(DATA / 'compounds_selected.tsv', sep='\t')
print(f'compounds: {len(comp)}')

# A compound_id laid out as 14-10-1 uppercase letters is treated as an
# InChIKey (structural id); every other id is handled as name-only.
INCHIKEY_RE = re.compile(r'^[A-Z]{14}-[A-Z]{10}-[A-Z]$')
comp['id_is_inchikey'] = comp['compound_id'].str.match(INCHIKEY_RE)

n_inchikey_ids = int(comp['id_is_inchikey'].sum())
n_name_only_ids = int((~comp['id_is_inchikey']).sum())
print('ids that are already InChIKeys:', n_inchikey_ids)
print('ids that are Cc1_xx (name-only):', n_name_only_ids)
compounds: 83 ids that are already InChIKeys: 24 ids that are Cc1_xx (name-only): 59
PubChem PUG-REST helpers (cached)¶
Rate-limited to <5 req/s (PubChem policy). Every URL response is memoized to data/pubchem_cache.json.
# Base URL of the PubChem PUG-REST service; every request URL is built on it.
B = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# On-disk memo of PubChem responses, keyed by request URL.
CACHE_PATH = DATA / 'pubchem_cache.json'
if CACHE_PATH.exists():
    _cache = json.loads(CACHE_PATH.read_text())
else:
    _cache = {}

# One shared HTTP session reused for all PubChem requests.
_sess = requests.Session()
# HTTP statuses whose answer will not change on retry and is therefore safe
# to memoize: success, malformed request, and "no such record".
_CACHEABLE_STATUSES = (200, 400, 404)


def _get(url):
    '''Fetch *url* from PubChem, memoizing definitive answers in ``_cache``.

    Returns a dict with:
      - ``status``: the HTTP status code, or -1 when the request or the JSON
        decoding raised;
      - ``json``: the decoded body for a 200 response, otherwise None;
      - ``error``: the exception text (only present when ``status`` is -1).

    Only definitive outcomes (``_CACHEABLE_STATUSES``) are stored in, or
    served from, the cache. Transient failures (network errors, timeouts,
    server-side 5xx responses) are handed back to the caller but never
    memoized, so a temporary outage cannot be frozen into the on-disk cache
    and replayed as a permanent failure on every rerun. Transient entries
    left in an older cache file are ignored and re-fetched for the same
    reason.
    '''
    cached = _cache.get(url)
    if cached is not None and cached.get('status') in _CACHEABLE_STATUSES:
        return cached
    time.sleep(0.22)  # stay under 5 req/s
    try:
        r = _sess.get(url, timeout=30)
        out = {'status': r.status_code,
               'json': r.json() if r.status_code == 200 else None}
    except Exception as e:
        # Deliberately broad: a lookup is best-effort, so any request or
        # decoding failure is reported through the status field instead of
        # aborting the whole resolution run.
        out = {'status': -1, 'json': None, 'error': str(e)}
    if out['status'] in _CACHEABLE_STATUSES:
        _cache[url] = out
    return out
def _save_cache():
    '''Write the in-memory response cache to ``CACHE_PATH`` as JSON text.'''
    payload = json.dumps(_cache)
    CACHE_PATH.write_text(payload)
# Comma-separated PubChem property names requested by the property lookups.
PROPS = 'IUPACName,MolecularFormula,InChIKey,SMILES'


def props_by_inchikey(ik):
    '''Return the first PubChem property record for InChIKey *ik*.

    The record is the first entry of ``PropertyTable.Properties`` in the
    response. Returns None when the request did not come back with HTTP 200.
    '''
    resp = _get(f'{B}/compound/inchikey/{ik}/property/{PROPS}/JSON')
    if resp['status'] != 200:
        return None
    return resp['json']['PropertyTable']['Properties'][0]
def cid_by_name(name):
    '''Return the first PubChem CID listed for compound *name*.

    The name is percent-encoded before being placed in the request URL.
    Returns None when the request did not come back with HTTP 200 or when
    the response lists no CIDs.
    '''
    from urllib.parse import quote

    resp = _get(f'{B}/compound/name/{quote(name)}/cids/JSON')
    if resp['status'] != 200:
        return None
    hits = resp['json'].get('IdentifierList', {}).get('CID', [])
    if not hits:
        return None
    return hits[0]
def props_by_cid(cid):
    '''Return the first PubChem property record for compound id *cid*.

    Returns None when the request did not come back with HTTP 200.
    '''
    resp = _get(f'{B}/compound/cid/{cid}/property/{PROPS}/JSON')
    ok = resp['status'] == 200
    return resp['json']['PropertyTable']['Properties'][0] if ok else None
# A synonym counts as a KEGG id when it is exactly 'C' or 'D' plus five digits.
KEGG_RE = re.compile(r'^[CD]\d{5}$')


def xrefs_by_cid(cid):
    '''Scrape KEGG (C/D-number) and ChEBI ids from the synonym list.

    Returns a ``(kegg, chebi)`` pair holding the first synonym that matches
    each pattern, with None for a pattern that has no match. Returns
    ``(None, None)`` when the request did not come back with HTTP 200.
    '''
    resp = _get(f'{B}/compound/cid/{cid}/synonyms/JSON')
    if resp['status'] != 200:
        return None, None
    synonyms = resp['json']['InformationList']['Information'][0].get('Synonym', [])

    kegg = None
    for s in synonyms:
        if KEGG_RE.match(s):
            kegg = s
            break

    chebi = None
    for s in synonyms:
        if s.upper().startswith('CHEBI:'):
            chebi = s
            break

    return kegg, chebi
# Report how many responses are currently held in the in-memory cache.
print(f'helpers ready; cache entries: {len(_cache)}')
helpers ready; cache entries: 223
Build name candidates for the 59 name-only compounds¶
Strip whitespace; split slash-delimited synonyms (Mellein / Ochracin, Decanedioic acid/Sebacic acid) into alternates tried in order; then, as a last-resort fallback, retry each alternate with leading stereo/racemic descriptors such as (±)-, (S)-(-)- or D- removed. Underscore-mangled names (1_prop_2_en_1_yl__...) are tried as-is and flagged if they fail.
# Leading stereo/racemic descriptors that may be dropped as a last-resort
# fallback. A parenthesised group is only accepted when its content is a
# recognised descriptor, so a leading substituent such as "(4-Hydroxyphenyl)"
# is never stripped (that would turn "(4-Hydroxyphenyl)acetic acid" into
# "acetic acid" and resolve the wrong compound).
STEREO_RE = re.compile(r'''
    ^(?:
        \(\s*(?:
            \+/-                                          # (+/-)
          | [+\-±]                                        # (+) (-) (±)
          | rac | dl | cis | trans                        # (rac) (dl) (cis) (trans)
          | \d*[a-z]?[RSEZ](?:\s*,\s*\d*[a-z]?[RSEZ])*    # (R) (2E,4Z) (3aR,7aS)
        )\s*\)-?
      | [DL]-                                             # D- / L-
      | rac-                                              # rac-
    )+
''', re.IGNORECASE | re.VERBOSE)

# Synonym separator: a slash that is NOT inside a parenthesised group, so the
# "(+/-)" descriptor is kept intact instead of being cut into "(+" and "-)".
_SLASH_SPLIT_RE = re.compile(r'/(?![^()]*\))')


def name_candidates(name):
    '''Return the ordered, de-duplicated lookup names to try for *name*.

    The name is whitespace-stripped and split on synonym-separating slashes;
    the non-empty parts come first, in their original order. After them, as a
    fallback, comes each part with its leading stereo/racemic descriptors
    removed (e.g. "(S)-(-)-", "(±)-", "D-"), when that differs from what is
    already listed.

    A missing name (None or NaN) or a blank name yields an empty list, so no
    meaningless query such as "nan" is issued.
    '''
    if name is None or (isinstance(name, float) and name != name):
        return []
    name = str(name).strip()

    cands = []
    for part in _SLASH_SPLIT_RE.split(name):
        part = part.strip()
        if part and part not in cands:
            cands.append(part)

    # fallback: strip leading stereo/racemic descriptors, e.g. (±)-, (S)-(-)-, D-
    for p in tuple(cands):
        bare = STEREO_RE.sub('', p).strip()
        if bare and bare not in cands:
            cands.append(bare)
    return cands
# Preview how the messy names (slash synonyms, underscore-mangled, trailing
# whitespace) are expanded into lookup candidates.
messy_examples = (
    'Mellein / Ochracin',
    'Decanedioic acid/Sebacic acid',
    '1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid',
    '2-Hydroxychalcone ',
)
for example in messy_examples:
    expanded = name_candidates(example)
    print(repr(example), '->', expanded)
'Mellein / Ochracin' -> ['Mellein', 'Ochracin'] 'Decanedioic acid/Sebacic acid' -> ['Decanedioic acid', 'Sebacic acid'] '1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid' -> ['1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid'] '2-Hydroxychalcone ' -> ['2-Hydroxychalcone']
Resolve all 83 compounds¶
# Resolve every compound to PubChem identifiers, one output row per input row.
# resolved_via records the channel that succeeded:
#   'inchikey'      - compound_id is an InChIKey and PubChem returned a record
#   'inchikey_only' - compound_id is an InChIKey but the PubChem lookup failed
#   'name'          - a name candidate resolved to a CID with properties
#   'failed'        - no name candidate resolved
rows = []
try:
    for _, r in comp.iterrows():
        cid = ik = formula = smiles = iupac = kegg = chebi = None
        via = 'failed'
        if r['id_is_inchikey']:
            # Channel 1: the id is itself a structure key -> look up by InChIKey.
            ik0 = r['compound_id']
            p = props_by_inchikey(ik0)
            if p:
                via = 'inchikey'
                cid = p.get('CID')
                ik = p.get('InChIKey', ik0)
                formula = p.get('MolecularFormula')
                smiles = p.get('SMILES')
                iupac = p.get('IUPACName')
            else:
                ik = ik0  # keep the structural id even if PubChem lookup failed
                via = 'inchikey_only'
        else:
            # Channel 2: name -> CID -> properties; the first candidate that
            # yields both a CID and a property record wins.
            for cand in name_candidates(r['name']):
                c = cid_by_name(cand)
                if not c:
                    continue
                p = props_by_cid(c)
                if not p:
                    continue
                via = 'name'
                cid = c
                ik = p.get('InChIKey')
                formula = p.get('MolecularFormula')
                smiles = p.get('SMILES')
                iupac = p.get('IUPACName')
                break
        # Cross-references are keyed by CID, so they need a resolved CID.
        if cid is not None:
            kegg, chebi = xrefs_by_cid(cid)
        rows.append(dict(compound_id=r['compound_id'], name=r['name'],
                         npc_pathway=r['npc_pathway'], resolved_via=via,
                         cid=cid, inchikey=ik, molecular_formula=formula,
                         smiles=smiles, iupac_name=iupac, kegg_id=kegg, chebi_id=chebi))
finally:
    # Persist whatever was fetched even if the loop aborts part-way, so a
    # rerun does not have to repeat the requests that already succeeded.
    _save_cache()
res = pd.DataFrame(rows)
print('cache entries after run:', len(_cache))
print(res['resolved_via'].value_counts().to_string())
cache entries after run: 226 resolved_via name 59 inchikey 24
Coverage report¶
# Flag, per compound, which identifiers were obtained.
res['has_structure'] = res['inchikey'].notna()
res['has_kegg'] = res['kegg_id'].notna()
res['has_chebi'] = res['chebi_id'].notna()

# Overall coverage: one "found/total" line per identifier type.
n = len(res)
overall = [
    ('structure (InChIKey): ', res['has_structure'].sum()),
    ('SMILES : ', res['smiles'].notna().sum()),
    ('KEGG id : ', res['has_kegg'].sum()),
    ('ChEBI id : ', res['has_chebi'].sum()),
]
for prefix, found in overall:
    print(f'{prefix}{found:2d}/{n}')
print()

# Per-pathway coverage, largest pathway first.
by_pathway = res.groupby('npc_pathway')
cov = by_pathway.agg(
    n=('compound_id', 'size'),
    structure=('has_structure', 'sum'),
    kegg=('has_kegg', 'sum'),
    chebi=('has_chebi', 'sum'),
)
cov = cov.sort_values('n', ascending=False)
print(cov.to_string())
structure (InChIKey): 83/83
SMILES : 83/83
KEGG id : 54/83
ChEBI id : 64/83
n structure kegg chebi
npc_pathway
Alkaloids 26 26 14 19
Shikimates and Phenylpropanoids 25 25 20 21
Terpenoids 17 17 8 11
Fatty acids 10 10 8 10
Amino acids and Peptides 2 2 2 2
Polyketides 2 2 1 0
Alkaloids, Shikimates and Phenylpropanoids 1 1 1 1
# Compounds left without a structural identifier need manual curation;
# list them together with the channel that was attempted.
report_cols = ['compound_id', 'name', 'npc_pathway', 'resolved_via']
missing = res.loc[~res['has_structure'], report_cols]
print('UNRESOLVED (no structure):', len(missing))
print(missing.to_string(index=False))
UNRESOLVED (no structure): 0 Empty DataFrame Columns: [compound_id, name, npc_pathway, resolved_via] Index: []
Save resolved table for NB02¶
# Columns exported for the downstream notebook, in file order.
out_cols = ['compound_id', 'name', 'npc_pathway', 'resolved_via', 'cid',
            'inchikey', 'molecular_formula', 'smiles', 'iupac_name',
            'kegg_id', 'chebi_id']

# Select the export columns once so the file, the log line and the preview
# all describe the same table.
resolved = res[out_cols]
resolved.to_csv(DATA / 'resolved_compounds.tsv', sep='\t', index=False)

# Report the shape of the table actually written; res itself may carry extra
# working columns that are not exported, so res.shape would overstate the
# column count of the file.
print('wrote data/resolved_compounds.tsv', resolved.shape)
resolved.head(12)
wrote data/resolved_compounds.tsv (83, 14)
| compound_id | name | npc_pathway | resolved_via | cid | inchikey | molecular_formula | smiles | iupac_name | kegg_id | chebi_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Cc1_29 | harman | Alkaloids | name | 5281404 | PSFDQSOCUJVVGF-UHFFFAOYSA-N | C12H10N2 | CC1=NC=CC2=C1NC3=CC=CC=C23 | 1-methyl-9H-pyrido[3,4-b]indole | C09209 | CHEBI:5623 |
| 1 | Cc1_102 | 1_prop_2_en_1_yl__1H_indole_3_carboxylic_acid | Alkaloids | name | 28599882 | NBSNSQJYXCXGLK-UHFFFAOYSA-N | C12H11NO2 | C=CCN1C=C(C2=CC=CC=C21)C(=O)O | 1-prop-2-enylindole-3-carboxylic acid | NaN | NaN |
| 2 | Cc1_22 | 2-hydroxy-4,7,8-trimethylquinoline | Alkaloids | name | 864589 | QRSYDKNRNVBMFQ-UHFFFAOYSA-N | C12H13NO | CC1=C(C2=C(C=C1)C(=CC(=O)N2)C)C | 4,7,8-trimethyl-1H-quinolin-2-one | NaN | NaN |
| 3 | Cc1_46 | 3,6-dimethylchromone | Polyketides | name | 688894 | CNWMARIMEBTYMR-UHFFFAOYSA-N | C11H10O2 | CC1=CC2=C(C=C1)OC=C(C2=O)C | 3,6-dimethylchromen-4-one | NaN | NaN |
| 4 | Cc1_39 | 7-hydroxy-4,8-dimethylchromen-2-one | Shikimates and Phenylpropanoids | name | 5356597 | MVMMGVPSTRNMSV-UHFFFAOYSA-N | C11H10O3 | CC1=CC(=O)OC2=C1C=CC(=C2C)O | 7-hydroxy-4,8-dimethylchromen-2-one | NaN | NaN |
| 5 | Cc1_71 | Fraxetin | Shikimates and Phenylpropanoids | name | 5273569 | HAVWRBANWNTOJX-UHFFFAOYSA-N | C10H8O5 | COC1=C(C(=C2C(=C1)C=CC(=O)O2)O)O | 7,8-dihydroxy-6-methoxychromen-2-one | C09265 | CHEBI:5169 |
| 6 | Cc1_34 | 2',5'-dihydroxychalcone | Shikimates and Phenylpropanoids | name | 3526360 | PZVRZRARFZZBCA-UHFFFAOYSA-N | C15H12O3 | C1=CC=C(C=C1)C=CC(=O)C2=C(C=CC(=C2)O)O | 1-(2,5-dihydroxyphenyl)-3-phenylprop-2-en-1-one | NaN | NaN |
| 7 | Cc1_60 | Phthalic acid mono-2-ethylhexyl ester | Shikimates and Phenylpropanoids | name | 20393 | DJDSLBVSSOQSLW-UHFFFAOYSA-N | C16H22O4 | CCCCC(CC)COC(=O)C1=CC=CC=C1C(=O)O | 2-(2-ethylhexoxycarbonyl)benzoic acid | C03343 | CHEBI:17243 |
| 8 | Cc1_106 | 2-Isopropyl-5-methylcyclohexanamine | Terpenoids | name | 89497 | RBMUAGDCCJDQLE-UHFFFAOYSA-N | C10H21N | CC1CCC(C(C1)N)C(C)C | 5-methyl-2-propan-2-ylcyclohexan-1-amine | NaN | NaN |
| 9 | CECREIRZLPLYDM-UHFFFAOYSA-N | Manool | Terpenoids | inchikey | 238792 | CECREIRZLPLYDM-UHFFFAOYSA-N | C20H34O | CC1(CCCC2(C1CCC(=C)C2CCC(C)(C=C)O)C)C | 5-(5,5,8a-trimethyl-2-methylidene-3,4,4a,6,7,8... | NaN | NaN |
| 10 | KWILGNNWGSNMPA-UHFFFAOYSA-N | Mellein / Ochracin | Shikimates and Phenylpropanoids | inchikey | 28516 | KWILGNNWGSNMPA-UHFFFAOYSA-N | C10H10O3 | CC1CC2=C(C(=CC=C2)O)C(=O)O1 | 8-hydroxy-3-methyl-3,4-dihydroisochromen-1-one | NaN | CHEBI:38760 |
| 11 | Cc1_216 | (-)-Perillyl alcohol | Terpenoids | name | 369312 | NDTYTMIUWGWIMO-SNVBAGLBSA-N | C10H16O | CC(=C)[C@H]1CCC(=CC1)CO | [(4S)-4-prop-1-en-2-ylcyclohexen-1-yl]methanol | C02452 | CHEBI:10782 |