import pandas as pd, numpy as np, os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from itertools import combinations
from scipy.stats import fisher_exact
from pathlib import Path
from dotenv import load_dotenv

DATA = Path('../data'); FIG = Path('../figures')
pd.set_option('display.max_columns', 40); pd.set_option('display.width', 220)

pred = pd.read_csv(DATA / 'compound_organism_predictions.tsv', sep='\t')
print('prediction rows:', len(pred),
      '| genomes with >=1 call:', pred['genome_id'].nunique(),
      '| callable compounds:', pred['compound_id'].nunique())

# aromatic-funnel block (shared protocatechuate/catechol lower pathway)
AROMATIC = {'salicylic acid', '3-hydroxybenzoic acid', '4-hydroxybenzaldehyde',
            'phthalic acid', 'TEREPHTHALIC ACID'}
pred['block'] = pred['name'].map(lambda n: 'aromatic-funnel' if n in AROMATIC else 'other')
print('\ncompounds by block:')
print(pred.groupby('block')['name'].apply(lambda s: sorted(set(s))).to_string())

prediction rows: 948 | genomes with >=1 call: 800 | callable compounds: 8

compounds by block:
block
aromatic-funnel    [3-hydroxybenzoic acid, 4-hydroxybenzaldehyde,...
other                    [Abscisic acid, Phenylethylamine, xanthine]

load_dotenv('/home/aparkin/BERIL-research-observatory/.env')
_tok = os.environ['KBASE_AUTH_TOKEN']
from pyspark.sql import SparkSession
_url = f'sc://jupyter-aparkin.jupyterhub-prod:15002/;use_ssl=false;x-kbase-token={_tok}'
spark = SparkSession.builder.remote(_url).getOrCreate()
DB = 'enigma_genome_depot_enigma'
NGEN = spark.sql(f'SELECT COUNT(DISTINCT genome_id) n FROM {DB}.browser_gene').toPandas()['n'][0]
print('total genomes in DB (null denominator):', NGEN)

total genomes in DB (null denominator): 3109

# binary genome x compound matrix over the callable set
order = (pred.groupby('name')['genome_id'].nunique()
         .sort_values(ascending=False).index.tolist())
mat = (pred.assign(v=1).pivot_table(index='genome_id', columns='name', values='v',
                                    aggfunc='max', fill_value=0)[order].astype(int))
print('matrix:', mat.shape, '(genomes x compounds)')
print('\nprevalence (fraction of all', NGEN, 'genomes):')
for c in order:
    print(f'  {c:24s} {mat[c].sum():4d}  {mat[c].sum()/NGEN*100:5.2f}%')

matrix: (800, 8) (genomes x compounds)

prevalence (fraction of all 3109 genomes):
  3-hydroxybenzoic acid     296   9.52%
  salicylic acid            197   6.34%
  4-hydroxybenzaldehyde     175   5.63%
  TEREPHTHALIC ACID          97   3.12%
  phthalic acid              81   2.61%
  Phenylethylamine           81   2.61%
  xanthine                   19   0.61%
  Abscisic acid               2   0.06%

vers = mat.sum(axis=1)
dist = vers.value_counts().sort_index()
print('versatility (capacities per genome, among the 800 with >=1):')
for k, v in dist.items():
    print(f'  {k} capacit{"y" if k==1 else "ies"}: {v:4d} genomes')
print(f'\nmean {vers.mean():.2f} | median {int(vers.median())} | max {int(vers.max())}')
# how many genomes carry BOTH an aromatic and a non-aromatic capacity
arom_cols = [c for c in order if c in AROMATIC]
other_cols = [c for c in order if c not in AROMATIC]
has_arom = mat[arom_cols].sum(axis=1) > 0
has_other = mat[other_cols].sum(axis=1) > 0
print(f'\ngenomes with an aromatic capacity      : {int(has_arom.sum())}')
print(f'genomes with a non-aromatic capacity   : {int(has_other.sum())}')
print(f'genomes with BOTH (cross-block overlap): {int((has_arom & has_other).sum())}')

versatility (capacities per genome, among the 800 with >=1):
  1 capacity:  675 genomes
  2 capacities:  107 genomes
  3 capacities:   13 genomes
  4 capacities:    5 genomes

mean 1.19 | median 1 | max 4

genomes with an aromatic capacity      : 724
genomes with a non-aromatic capacity   : 101
genomes with BOTH (cross-block overlap): 25

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(dist.index, dist.values, color='#41b6c4', edgecolor='k', linewidth=0.4)
ax.set_xlabel('catabolic capacities carried (of 8)')
ax.set_ylabel('genomes')
ax.set_title('Within-genome catabolic versatility (n=%d genomes with >=1 call)' % len(mat))
fig.tight_layout(); fig.savefig(FIG / '05_versatility.png', dpi=150)
print('saved 05_versatility.png'); plt.close(fig)

saved 05_versatility.png

rows = []
g_all = NGEN
present = {c: set(mat.index[mat[c] == 1]) for c in order}
for a, b in combinations(order, 2):
    A, B = present[a], present[b]
    n11 = len(A & B)
    n10 = len(A - B)
    n01 = len(B - A)
    n00 = g_all - n11 - n10 - n01
    # phi coefficient
    num = n11 * n00 - n10 * n01
    den = np.sqrt((n11+n10)*(n01+n00)*(n11+n01)*(n10+n00))
    phi = num / den if den > 0 else np.nan
    _, p = fisher_exact([[n11, n10], [n01, n00]], alternative='greater')
    blk = 'within-aromatic' if (a in AROMATIC and b in AROMATIC) else (
          'within-other' if (a not in AROMATIC and b not in AROMATIC) else 'cross-block')
    rows.append(dict(a=a, b=b, block=blk, n11=n11, n10=n10, n01=n01,
                     phi=phi, p=p))
co = pd.DataFrame(rows)
# Benjamini-Hochberg
m = len(co); co = co.sort_values('p').reset_index(drop=True)
co['q'] = (co['p'] * m / (co.index + 1)).iloc[::-1].cummin().iloc[::-1].clip(upper=1)
co = co.sort_values('phi', ascending=False).reset_index(drop=True)
print(co[['a','b','block','n11','phi','p','q']].to_string(index=False))

                    a                     b           block  n11       phi            p        q
             xanthine         Abscisic acid    within-other    1  0.160782 1.218719e-02 0.056874
3-hydroxybenzoic acid 4-hydroxybenzaldehyde within-aromatic   39  0.106218 1.564614e-07 0.000004
3-hydroxybenzoic acid        salicylic acid within-aromatic   40  0.095565 1.477470e-06 0.000021
3-hydroxybenzoic acid      Phenylethylamine     cross-block   16  0.057020 3.294961e-03 0.030753
4-hydroxybenzaldehyde      Phenylethylamine     cross-block   11  0.056426 5.030669e-03 0.035215
        phthalic acid      Phenylethylamine     cross-block    6  0.049305 1.778041e-02 0.071122
3-hydroxybenzoic acid     TEREPHTHALIC ACID within-aromatic   17  0.048945 8.837001e-03 0.049487
4-hydroxybenzaldehyde         Abscisic acid     cross-block    1  0.048845 1.094251e-01 0.340434
    TEREPHTHALIC ACID      Phenylethylamine     cross-block    5  0.028720 1.065967e-01 0.340434
4-hydroxybenzaldehyde     TEREPHTHALIC ACID within-aromatic    8  0.020389 1.769190e-01 0.495373
       salicylic acid 4-hydroxybenzaldehyde within-aromatic   12  0.005220 4.317021e-01 1.000000
3-hydroxybenzoic acid         phthalic acid within-aromatic    8  0.001983 5.125138e-01 1.000000
       salicylic acid     TEREPHTHALIC ACID within-aromatic    6 -0.001111 5.862088e-01 1.000000
4-hydroxybenzaldehyde              xanthine     cross-block    1 -0.001244 6.684739e-01 1.000000
        phthalic acid         Abscisic acid     cross-block    0 -0.004150 1.000000e+00 1.000000
     Phenylethylamine         Abscisic acid    within-other    0 -0.004150 1.000000e+00 1.000000
    TEREPHTHALIC ACID         Abscisic acid     cross-block    0 -0.004553 1.000000e+00 1.000000
    TEREPHTHALIC ACID         phthalic acid within-aromatic    2 -0.006123 7.271307e-01 1.000000
       salicylic acid         Abscisic acid     cross-block    0 -0.006599 1.000000e+00 1.000000
3-hydroxybenzoic acid         Abscisic acid     cross-block    0 -0.008230 1.000000e+00 1.000000
        phthalic acid              xanthine     cross-block    0 -0.012825 1.000000e+00 1.000000
     Phenylethylamine              xanthine    within-other    0 -0.012825 1.000000e+00 1.000000
4-hydroxybenzaldehyde         phthalic acid within-aromatic    3 -0.013661 8.445550e-01 1.000000
    TEREPHTHALIC ACID              xanthine     cross-block    0 -0.014072 1.000000e+00 1.000000
       salicylic acid              xanthine     cross-block    0 -0.020396 1.000000e+00 1.000000
3-hydroxybenzoic acid              xanthine     cross-block    0 -0.025437 1.000000e+00 1.000000
       salicylic acid      Phenylethylamine     cross-block    0 -0.042540 1.000000e+00 1.000000
       salicylic acid         phthalic acid within-aromatic    0 -0.042540 1.000000e+00 1.000000

print('co-occurrence summary by pair class (mean phi, n significant at q<0.05):')
for blk, grp in co.groupby('block'):
    print(f'  {blk:16s} pairs={len(grp):2d}  mean_phi={grp["phi"].mean():+.3f}  '
          f'q<0.05: {int((grp["q"]<0.05).sum())}/{len(grp)}')
cross = co[co['block'] == 'cross-block']
print('\ncross-block pairs (the real H2 test — distinct lower pathways):')
print(cross[['a','b','n11','phi','p','q']].to_string(index=False))

co-occurrence summary by pair class (mean phi, n significant at q<0.05):
  cross-block      pairs=15  mean_phi=+0.007  q<0.05: 2/15
  within-aromatic  pairs=10  mean_phi=+0.021  q<0.05: 3/10
  within-other     pairs= 3  mean_phi=+0.048  q<0.05: 0/3

cross-block pairs (the real H2 test — distinct lower pathways):
                    a                b  n11       phi        p        q
3-hydroxybenzoic acid Phenylethylamine   16  0.057020 0.003295 0.030753
4-hydroxybenzaldehyde Phenylethylamine   11  0.056426 0.005031 0.035215
        phthalic acid Phenylethylamine    6  0.049305 0.017780 0.071122
4-hydroxybenzaldehyde    Abscisic acid    1  0.048845 0.109425 0.340434
    TEREPHTHALIC ACID Phenylethylamine    5  0.028720 0.106597 0.340434
4-hydroxybenzaldehyde         xanthine    1 -0.001244 0.668474 1.000000
        phthalic acid    Abscisic acid    0 -0.004150 1.000000 1.000000
    TEREPHTHALIC ACID    Abscisic acid    0 -0.004553 1.000000 1.000000
       salicylic acid    Abscisic acid    0 -0.006599 1.000000 1.000000
3-hydroxybenzoic acid    Abscisic acid    0 -0.008230 1.000000 1.000000
        phthalic acid         xanthine    0 -0.012825 1.000000 1.000000
    TEREPHTHALIC ACID         xanthine    0 -0.014072 1.000000 1.000000
       salicylic acid         xanthine    0 -0.020396 1.000000 1.000000
3-hydroxybenzoic acid         xanthine    0 -0.025437 1.000000 1.000000
       salicylic acid Phenylethylamine    0 -0.042540 1.000000 1.000000

# heatmap of phi
M = pd.DataFrame(np.eye(len(order)), index=order, columns=order)
for _, r in co.iterrows():
    M.loc[r['a'], r['b']] = r['phi']; M.loc[r['b'], r['a']] = r['phi']
fig, ax = plt.subplots(figsize=(7.5, 6.5))
im = ax.imshow(M.values, cmap='RdBu_r', vmin=-1, vmax=1)
ax.set_xticks(range(len(order))); ax.set_xticklabels(order, rotation=45, ha='right', fontsize=8)
ax.set_yticks(range(len(order))); ax.set_yticklabels(order, fontsize=8)
for i in range(len(order)):
    for j in range(len(order)):
        if i != j:
            ax.text(j, i, f'{M.values[i,j]:.2f}', ha='center', va='center',
                    fontsize=6, color='k')
ax.set_title('Catabolic co-occurrence (phi) across %d genomes' % g_all)
fig.colorbar(im, ax=ax, shrink=0.7, label='phi coefficient')
fig.tight_layout(); fig.savefig(FIG / '05_cooccurrence.png', dpi=150)
print('saved 05_cooccurrence.png'); plt.close(fig)

saved 05_cooccurrence.png

tax = pred[['genome_id', 'taxon_name']].drop_duplicates('genome_id').set_index('genome_id')
gv = mat.join(tax)
gv['genus'] = gv['taxon_name'].astype(str).str.split().str[0]
gv['versatility'] = vers
by_genus = (gv.groupby('genus')
            .agg(n_genomes=('versatility', 'size'),
                 mean_vers=('versatility', 'mean'),
                 max_vers=('versatility', 'max'))
            .sort_values('mean_vers', ascending=False))
print('top genera by mean catabolic versatility (>=3 genomes):')
print(by_genus[by_genus['n_genomes'] >= 3].head(15).to_string())

multi = gv[gv['versatility'] >= 3]
print(f'\nhigh-versatility genomes (>=3 capacities): {len(multi)}')
print('their genus concentration:')
gc = multi['genus'].value_counts()
print(gc.head(10).to_string())
print(f'top-3 genera hold {gc.head(3).sum()}/{len(multi)} '
      f'({gc.head(3).sum()/len(multi)*100:.0f}%) of high-versatility genomes')

top genera by mean catabolic versatility (>=3 genomes):
                  n_genomes  mean_vers  max_vers
genus                                           
Polaromonas               3   2.333333         4
Alcaligenes               5   2.200000         3
Burkholderia             10   2.100000         4
Achromobacter             5   2.000000         2
Rugosibacter              3   2.000000         2
Paraburkholderia         23   1.869565         4
Comamonas                13   1.846154         2
Hydrogenophaga            5   1.800000         3
Herbaspirillum            4   1.750000         2
Acidovorax               16   1.562500         2
Burkholderiaceae          5   1.400000         3
Bosea                     8   1.375000         2
Pseudomonas              96   1.302083         2
Collimonas               10   1.200000         2
Streptomyces              6   1.166667         2

high-versatility genomes (>=3 capacities): 18
their genus concentration:
genus
Paraburkholderia    5
Burkholderia        4
Hydrogenophaga      2
Burkholderiaceae    1
Cupriavidus         1
Myxococcaceae       1
Caballeronia        1
Polaromonas         1
Pigmentiphaga       1
Alcaligenes         1
top-3 genera hold 11/18 (61%) of high-versatility genomes

topg = by_genus[by_genus['n_genomes'] >= 3].head(15).iloc[::-1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(topg.index, topg['mean_vers'], color='#fc8d59', edgecolor='k', linewidth=0.4)
ax.set_xlabel('mean capacities per genome (of 8)')
ax.set_title('Catabolic versatility by genus (genera with >=3 genomes)')
fig.tight_layout(); fig.savefig(FIG / '05_genus_versatility.png', dpi=150)
print('saved 05_genus_versatility.png'); plt.close(fig)

saved 05_genus_versatility.png

co.to_csv(DATA / 'cooccurrence_matrix.tsv', sep='\t', index=False)
print('wrote data/cooccurrence_matrix.tsv', co.shape)

wa = co[co['block'] == 'within-aromatic']
xb = co[co['block'] == 'cross-block']
wo = co[co['block'] == 'within-other']
both = int((has_arom & has_other).sum())
print('\n=== H2 VERDICT ===')
print(f'Scope: 8 catabolically-coherent compounds over {len(mat)} genomes '
      f'(of {NGEN} total).')
print(f'\nWithin-genome: versatility right-skewed, mean {vers.mean():.2f}, max {int(vers.max())}.')
print(f'\nWithin-aromatic pairs: mean phi {wa["phi"].mean():+.3f}, '
      f'{int((wa["q"]<0.05).sum())}/{len(wa)} significant (q<0.05).')
print('  -> EXPECTED: shared protocatechuate/catechol lower pathway, not independent modules.')
print(f'\nCross-block pairs (distinct lower pathways = real H2 test):')
print(f'  mean phi {xb["phi"].mean():+.3f}, {int((xb["q"]<0.05).sum())}/{len(xb)} significant.')
print(f'  genomes carrying BOTH an aromatic and a non-aromatic capacity: {both}')
if (xb['q'] < 0.05).any():
    sig = xb[xb['q'] < 0.05].sort_values('phi', ascending=False)
    print('  significant cross-block co-occurrences:')
    for _, r in sig.iterrows():
        print(f'    {r["a"]} + {r["b"]}: phi={r["phi"]:+.3f} (n11={r["n11"]}, q={r["q"]:.1e})')
else:
    print('  -> NO significant cross-block co-occurrence.')
print('\nInterpretation: H2 support among aromatics is largely mechanistic (one shared')
print('lower module). Genuine cross-module modularity is judged on the cross-block result')
print('above. Phylogenetic clustering: see 05_genus_versatility.png.')

wrote data/cooccurrence_matrix.tsv (28, 9)

=== H2 VERDICT ===
Scope: 8 catabolically-coherent compounds over 800 genomes (of 3109 total).

Within-genome: versatility right-skewed, mean 1.19, max 4.

Within-aromatic pairs: mean phi +0.021, 3/10 significant (q<0.05).
  -> EXPECTED: shared protocatechuate/catechol lower pathway, not independent modules.

Cross-block pairs (distinct lower pathways = real H2 test):
  mean phi +0.007, 2/15 significant.
  genomes carrying BOTH an aromatic and a non-aromatic capacity: 25
  significant cross-block co-occurrences:
    3-hydroxybenzoic acid + Phenylethylamine: phi=+0.057 (n11=16, q=3.1e-02)
    4-hydroxybenzaldehyde + Phenylethylamine: phi=+0.056 (n11=11, q=3.5e-02)

Interpretation: H2 support among aromatics is largely mechanistic (one shared
lower module). Genuine cross-module modularity is judged on the cross-block result
above. Phylogenetic clustering: see 05_genus_versatility.png.

05 Cooccurrence

NB05 — Catabolic Co-occurrence & Modularity (H2)¶

Genome x compound capacity matrix¶

Within-genome versatility¶

Pairwise co-occurrence across genomes¶

Phylogenetic concentration¶

H2 verdict¶