import pandas as pd, numpy as np, os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from dotenv import load_dotenv

DATA = Path('../data'); FIG = Path('../figures')
pd.set_option('display.max_columns', 40); pd.set_option('display.width', 220)

phylo = pd.read_csv(DATA / 'phylo_utilizer_map.tsv', sep='\t')
comp = pd.read_csv(DATA / 'compounds_selected.tsv', sep='\t')
src = dict(zip(comp['name'], comp['source_short']))
# implicated genera per compound (drop nulls / unresolved)
imp = (phylo[phylo['genus'].notna()][['name', 'npc_pathway', 'genus', 'certainty']]
       .drop_duplicates())
imp['source'] = imp['name'].map(src)
print('compounds with >=1 genus-resolved utilizer:', imp['name'].nunique())
print('distinct implicated genera:', imp['genus'].nunique())
print(imp.groupby('source')['name'].nunique().to_string())

compounds with >=1 genus-resolved utilizer: 8
distinct implicated genera: 86
source
groundwater    6
necromass      2

load_dotenv('/home/aparkin/BERIL-research-observatory/.env')
_tok = os.environ['KBASE_AUTH_TOKEN']
from pyspark.sql import SparkSession, functions as F
_url = f'sc://jupyter-aparkin.jupyterhub-prod:15002/;use_ssl=false;x-kbase-token={_tok}'
spark = SparkSession.builder.remote(_url).getOrCreate()
DB = 'enigma_coral'

def survey_genus_relab(counts_b, tax_b, survey):
    counts = (spark.table(f'{DB}.{counts_b}')
              .select('sdt_asv_name', 'sdt_community_name',
                      F.col('count_count_unit').cast('double').alias('cnt'))
              .filter('cnt > 0'))
    tax = (spark.table(f'{DB}.{tax_b}')
           .filter("taxonomic_level_sys_oterm_name = 'genus'")
           .select('sdt_asv_name', F.col('sdt_taxon_name').alias('genus')))
    tot = counts.groupBy('sdt_community_name').agg(F.sum('cnt').alias('tot'))
    gc = (counts.join(tax, 'sdt_asv_name')
          .groupBy('sdt_community_name', 'genus').agg(F.sum('cnt').alias('gcnt'))
          .join(tot, 'sdt_community_name'))
    gc = gc.withColumn('relab', F.col('gcnt') / F.col('tot'))
    pdf = gc.toPandas()
    pdf['survey'] = survey
    return pdf

sed = survey_genus_relab('ddt_brick0000459', 'ddt_brick0000458', 'SSO_sediment')
pmp = survey_genus_relab('ddt_brick0000462', 'ddt_brick0000461', 'SSO_pump_test')
fld = pd.concat([sed, pmp], ignore_index=True)
ncomm = fld.groupby('survey')['sdt_community_name'].nunique()
print('communities per survey:'); print(ncomm.to_string())
print('total SSO communities pooled:', fld['sdt_community_name'].nunique())
print('genus x community rows:', len(fld), '| distinct field genera:', fld['genus'].nunique())

communities per survey:
survey
SSO_pump_test    14
SSO_sediment     37
total SSO communities pooled: 51
genus x community rows: 9647 | distinct field genera: 1092

N_TOT = fld['sdt_community_name'].nunique()
gstat = (fld.groupby('genus')
         .agg(n_comm_present=('sdt_community_name', 'nunique'),
              mean_relab=('relab', 'mean'),
              max_relab=('relab', 'max')).reset_index())
gstat['prevalence'] = gstat['n_comm_present'] / N_TOT
gstat = gstat.sort_values('prevalence', ascending=False)
print('field genera with prevalence >= 0.5 (core SSO taxa):')
print(gstat[gstat['prevalence'] >= 0.5].head(20).to_string(index=False))

field genera with prevalence >= 0.5 (core SSO taxa):
                 genus  n_comm_present  mean_relab  max_relab  prevalence
            Nitrospira              46    0.003514   0.013841    0.901961
          Sphingomonas              46    0.004906   0.028094    0.901961
        Hyphomicrobium              45    0.003179   0.011040    0.882353
            Haliangium              44    0.002675   0.014747    0.862745
           Pseudomonas              44    0.005293   0.026363    0.862745
 Candidatus Solibacter              44    0.007699   0.050667    0.862745
         Rhodanobacter              43    0.028530   0.188870    0.843137
             Ellin6067              43    0.003374   0.022409    0.843137
          Nocardioides              42    0.001386   0.004486    0.823529
              Massilia              42    0.008612   0.121396    0.823529
          Bdellovibrio              41    0.004349   0.027599    0.803922
         Acinetobacter              41    0.007044   0.040734    0.803922
Candidatus Omnitrophus              40    0.021525   0.101997    0.784314
        Bradyrhizobium              39    0.004804   0.026693    0.764706
         Lacunisphaera              39    0.001469   0.005951    0.764706
        Rhizomicrobium              38    0.010547   0.080367    0.745098
         Mycobacterium              38    0.001390   0.010835    0.745098
      Pajaroellobacter              38    0.000871   0.003378    0.745098
                Afipia              38    0.003996   0.034231    0.745098
       Ignavibacterium              36    0.001785   0.014253    0.705882

m = imp.merge(gstat, on='genus', how='left')
m['field_detected'] = m['n_comm_present'].notna()
m['prevalence'] = m['prevalence'].fillna(0.0)
m['mean_relab'] = m['mean_relab'].fillna(0.0)

det = m['field_detected'].sum()
print(f'implicated genus-calls: {len(m)} | field-detected at SSO: {det} '
      f'({100*det/len(m):.0f}%)')
print('distinct implicated genera detected in SSO field:',
      m[m['field_detected']]['genus'].nunique(), '/', m['genus'].nunique())
print('\nfield-detected implicated genera by prevalence:')
gg = (m[m['field_detected']].groupby('genus')
      .agg(prevalence=('prevalence', 'first'), mean_relab=('mean_relab', 'first'),
           n_compounds=('name', 'nunique')).reset_index()
      .sort_values('prevalence', ascending=False))
print(gg.head(25).to_string(index=False))

implicated genus-calls: 156 | field-detected at SSO: 112 (72%)
distinct implicated genera detected in SSO field: 62 / 86

field-detected implicated genera by prevalence:
            genus  prevalence  mean_relab  n_compounds
     Sphingomonas    0.901961    0.004906            1
      Pseudomonas    0.862745    0.005293            3
    Rhodanobacter    0.843137    0.028530            1
     Nocardioides    0.823529    0.001386            1
         Massilia    0.823529    0.008612            1
           Afipia    0.745098    0.003996            1
    Mycobacterium    0.745098    0.001390            1
       Reyranella    0.666667    0.001495            2
       Xylophilus    0.666667    0.002518            1
  Novosphingobium    0.647059    0.002031            1
      Ramlibacter    0.647059    0.002978            2
Pseudarthrobacter    0.647059    0.001762            1
         Bacillus    0.627451    0.002797            2
       Acidovorax    0.607843    0.003048            5
     Streptomyces    0.568627    0.001346            2
      Rhodococcus    0.568627    0.002524            2
    Mesorhizobium    0.568627    0.002063            1
    Brevundimonas    0.568627    0.003198            1
      Caulobacter    0.549020    0.001710            3
      Curvibacter    0.549020    0.003151            1
      Sphingobium    0.529412    0.009450            2
       Rhodoferax    0.470588    0.009816            3
   Castellaniella    0.450980    0.004167            1
       Pedobacter    0.450980    0.001176            1
            Bosea    0.431373    0.000615            3

pc = (m.groupby(['name', 'npc_pathway', 'source'])
      .agg(n_genera=('genus', 'nunique'),
           n_field=('genus', lambda s: m.loc[s.index][m.loc[s.index, 'field_detected']]['genus'].nunique()),
           max_prev=('prevalence', 'max'),
           best_genus=('genus', 'first')).reset_index())
# recompute best field genus per compound cleanly
best = (m[m['field_detected']].sort_values('prevalence', ascending=False)
        .groupby('name').agg(top_field_genus=('genus', 'first'),
                             top_field_prev=('prevalence', 'first')).reset_index())
pc = pc.drop(columns=['best_genus']).merge(best, on='name', how='left')
pc['frac_field'] = pc['n_field'] / pc['n_genera']
pc = pc.sort_values('frac_field', ascending=False)
print(pc[['name', 'source', 'n_genera', 'n_field', 'frac_field',
          'top_field_genus', 'top_field_prev']].to_string(index=False))

                 name      source  n_genera  n_field  frac_field top_field_genus  top_field_prev
        Abscisic acid groundwater         1        1    1.000000  Microbacterium        0.019608
        phthalic acid   necromass        15       12    0.800000   Mycobacterium        0.745098
       salicylic acid groundwater        24       19    0.791667     Pseudomonas        0.862745
3-hydroxybenzoic acid groundwater        42       31    0.738095    Sphingomonas        0.901961
4-hydroxybenzaldehyde groundwater        34       24    0.705882     Pseudomonas        0.862745
             xanthine groundwater         6        4    0.666667          Afipia        0.745098
     Phenylethylamine groundwater        11        7    0.636364      Acidovorax        0.607843
    TEREPHTHALIC ACID   necromass        17       10    0.588235      Reyranella        0.666667

fig, ax = plt.subplots(figsize=(8, 4.5))
p = pc.sort_values('frac_field')
colors = ['#2c7fb8' if s == 'groundwater' else '#d95f0e' for s in p['source']]
ax.barh(p['name'], p['frac_field'], color=colors)
for i, (_, r) in enumerate(p.iterrows()):
    ax.text(r['frac_field'] + 0.01, i, f"{int(r['n_field'])}/{int(r['n_genera'])}",
            va='center', fontsize=8)
ax.set_xlabel('fraction of predicted utilizer genera detected in SSO field community')
ax.set_xlim(0, 1.15)
from matplotlib.patches import Patch
ax.legend(handles=[Patch(color='#2c7fb8', label='groundwater'),
                   Patch(color='#d95f0e', label='necromass')],
          title='compound source', loc='lower right')
ax.set_title('Predicted utilizers present in Oak Ridge SSO subsurface field community')
fig.tight_layout(); fig.savefig(FIG / '07_field_occurrence.png', dpi=150)
print('saved 07_field_occurrence.png'); plt.close(fig)

saved 07_field_occurrence.png

out = m[['name', 'npc_pathway', 'source', 'genus', 'certainty',
         'field_detected', 'n_comm_present', 'prevalence', 'mean_relab', 'max_relab']].copy()
out = out.sort_values(['name', 'prevalence'], ascending=[True, False]).reset_index(drop=True)
out.to_csv(DATA / 'environmental_atlas.tsv', sep='\t', index=False)
print('wrote data/environmental_atlas.tsv', out.shape)

print('\n=== deliverable (c) environmental headline ===')
print(f'SSO communities surveyed: {N_TOT} | implicated genera: {m["genus"].nunique()} | '
      f'detected in SSO field: {m[m["field_detected"]]["genus"].nunique()}')
print(f'compounds with >=1 field-present utilizer genus: '
      f'{pc[pc["n_field"] > 0]["name"].nunique()} / {pc["name"].nunique()}')

print('\n=== H3 verdict ===')
print('UNTESTABLE / CONFOUNDED: of 8 callable compounds only 2 are necromass '
      '(terephthalic + phthalic acid), both phthalate-class aromatics with '
      'Actinomycetota-heavy utilizers. Source is confounded with chemical class and '
      'the necromass arm is n=2; no source-contrast statistic is reported. '
      'Reported instead: honest SSO field-occurrence of predicted utilizer genera.')

wrote data/environmental_atlas.tsv (156, 10)

=== deliverable (c) environmental headline ===
SSO communities surveyed: 51 | implicated genera: 86 | detected in SSO field: 62
compounds with >=1 field-present utilizer genus: 8 / 8

=== H3 verdict ===
UNTESTABLE / CONFOUNDED: of 8 callable compounds only 2 are necromass (terephthalic + phthalic acid), both phthalate-class aromatics with Actinomycetota-heavy utilizers. Source is confounded with chemical class and the necromass arm is n=2; no source-contrast statistic is reported. Reported instead: honest SSO field-occurrence of predicted utilizer genera.

07 Environmental Atlas

NB07 — Environmental Occurrence Atlas (Deliverable c, environmental half)¶

SSO field survey: genus prevalence + relative abundance¶

Per-genus field statistics (pooled SSO)¶

Match predicted utilizer genera to SSO field occurrence¶

Per-compound: how many predicted utilizer genera are real SSO residents¶

Write deliverable (c) environmental table¶