import os
from dotenv import load_dotenv
import pandas as pd, numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from scipy.stats import mannwhitneyu

DATA = Path('../data'); FIG = Path('../figures')
pd.set_option('display.max_columns', 60); pd.set_option('display.width', 240)

phylo = pd.read_csv(DATA / 'phylo_utilizer_map.tsv', sep='\t')
IMPL = sorted(phylo['genus'].dropna().unique().tolist())
print('implicated genera:', len(IMPL))
# genus -> implicated-compound context (breadth + pathways), for the synthesis table
gctx = (phylo.groupby('genus')
        .agg(n_compounds=('name', 'nunique'),
             pathways=('npc_pathway', lambda s: ','.join(sorted(set(str(x) for x in s if pd.notna(x))))))
        .reset_index())
print(gctx.head().to_string(index=False))

implicated genera: 86
        genus  n_compounds                                  pathways
Achromobacter            3 Alkaloids,Shikimates and Phenylpropanoids
   Acidovorax            5 Alkaloids,Shikimates and Phenylpropanoids
       Afipia            1                                 Alkaloids
  Alcaligenes            3 Alkaloids,Shikimates and Phenylpropanoids
  Alteromonas            1           Shikimates and Phenylpropanoids

load_dotenv('/home/aparkin/BERIL-research-observatory/.env')
_tok = os.environ['KBASE_AUTH_TOKEN']
from pyspark.sql import SparkSession, functions as F
_url = f'sc://jupyter-aparkin.jupyterhub-prod:15002/;use_ssl=false;x-kbase-token={_tok}'
spark = SparkSession.builder.remote(_url).getOrCreate()
print('spark up')

spark up

# --- relative abundance for implicated genera (present rows only) ---
roll = spark.table('kbase.nmdc_arkin.covstats_taxonomy_rollup')
file_tot = roll.groupBy('file_id').agg(F.sum('total_tpm').alias('file_tpm'))
imp = roll.filter(F.col('genus').isin(IMPL)).select('file_id', 'genus', 'total_tpm')
relab = (imp.join(file_tot, 'file_id')
         .withColumn('relab', F.col('total_tpm') / F.col('file_tpm'))
         .select('file_id', 'genus', 'relab'))
pres = relab.toPandas()
print('present (genus x file) rows:', len(pres), '| distinct files:', pres['file_id'].nunique())

present (genus x file) rows: 3958057 | distinct files: 1719

# --- denominator = the covstats files that actually carry taxonomy (NOT all 6700 lookup rows) ---
# one row per taxonomy-bearing file, labeled with TWO ontology sets from biosample_set.
# NOTE: one covstats file (assembly) can link to several biosample ids; we keep one label/file.
roll_files = file_tot.select('file_id')
sfl = (spark.table('kbase.nmdc_arkin.sample_file_lookup')
       .select(F.col('covstats_file_id').alias('file_id'), 'sample_id').distinct())
bs = (spark.table('nmdc_metadata.biosample_set')
      .select(F.col('id').alias('sample_id'),
              'ecosystem', 'ecosystem_category', 'ecosystem_type',
              'ecosystem_subtype', 'specific_ecosystem',
              'env_broad_scale_term_name', 'env_local_scale_term_name', 'env_medium_term_name',
              'lat_lon_latitude', 'lat_lon_longitude', 'depth_has_numeric_value')
      .dropDuplicates(['sample_id']))
uni = (roll_files.join(sfl, 'file_id', 'left').join(bs, 'sample_id', 'left')
       .dropDuplicates(['file_id']).toPandas())
print('taxonomy-bearing files (denominator):', len(uni))
for c in ['env_medium_term_name', 'env_broad_scale_term_name', 'ecosystem_type']:
    print(f'  labeled [{c}]:', uni[c].notna().sum())

taxonomy-bearing files (denominator): 3825
  labeled [env_medium_term_name]: 3403
  labeled [env_broad_scale_term_name]: 3403
  labeled [ecosystem_type]: 3016

# --- macro env-class from ENVO env_medium (fallback to GOLD ecosystem_type) ---
def macro(med, eco):
    s = (str(med) if pd.notna(med) else (str(eco) if pd.notna(eco) else '')).lower()
    if not s: return 'unlabeled'
    if any(k in s for k in ['periphyt', 'epilith', 'epipsamm', 'epiphyt', 'biofilm', 'microbial mat']): return 'periphyton'
    if 'sediment' in s: return 'sediment'
    if 'soil' in s: return 'soil'
    if any(k in s for k in ['rhizosphere', 'root', 'phyllosphere', 'plant', 'litter', 'leaf']): return 'plant'
    if any(k in s for k in ['fecal', 'feces', 'gut', 'host', 'rumen', 'oral']): return 'host'
    if any(k in s for k in ['marine', 'ocean', 'sea water', 'seawater', 'saline', 'estuar']): return 'marine'
    if any(k in s for k in ['lake', 'fresh', 'river', 'pond', 'stream', 'groundwater', 'aquifer', 'surface water']): return 'freshwater'
    if 'water' in s: return 'water_other'
    return 'other'
uni['macro_env'] = [macro(m, e) for m, e in zip(uni['env_medium_term_name'], uni['ecosystem_type'])]
print(uni['macro_env'].value_counts().to_string())
# --- IMPORTANT: covstats rollup is per-SPECIES; sum species->genus per file before any max/outlier ---
presg = pres.groupby(['file_id', 'genus'], as_index=False)['relab'].sum()
presg = presg.merge(uni, on='file_id', how='left')
print('genus-aggregated (genus x file) rows:', len(presg))

macro_env
soil          2260
freshwater     723
periphyton     472
other          179
plant          119
sediment        42
unlabeled       30

genus-aggregated (genus x file) rows: 124677

N_class = uni.groupby('macro_env')['file_id'].nunique().to_dict()
g = (presg.dropna(subset=['macro_env'])
     .groupby(['genus', 'macro_env'])
     .agg(n_present=('file_id', 'nunique'), sum_relab=('relab', 'sum'),
          mean_present=('relab', 'mean'), max_relab=('relab', 'max')).reset_index())
g['n_class'] = g['macro_env'].map(N_class)
g['prevalence'] = g['n_present'] / g['n_class']
g['mean_relab_all'] = g['sum_relab'] / g['n_class']
g = g.drop(columns='sum_relab')
# the substantively interesting classes
MAIN = ['soil', 'freshwater', 'sediment', 'plant', 'host', 'periphyton', 'marine']
show = (g[g['macro_env'].isin(MAIN)]
        .sort_values('mean_relab_all', ascending=False).head(20))
print('top genus x env by mean relative abundance (all-sample):')
print(show[['genus', 'macro_env', 'n_present', 'n_class', 'prevalence', 'mean_relab_all', 'max_relab']]
      .to_string(index=False, float_format=lambda x: f'{x:.4g}'))

top genus x env by mean relative abundance (all-sample):
          genus  macro_env  n_present  n_class  prevalence  mean_relab_all  max_relab
    Rhizobacter periphyton        459      472      0.9725        0.009098    0.09897
     Variovorax periphyton        460      472      0.9746        0.009094    0.07491
    Polaromonas periphyton        458      472      0.9703        0.007648     0.1558
   Streptomyces       soil       1061     2260      0.4695        0.006589     0.1194
    Methylibium periphyton        456      472      0.9661          0.0063    0.07914
   Sphingomonas periphyton        457      472      0.9682        0.005786    0.07854
  Mycobacterium       soil       1046     2260      0.4628        0.005404     0.2204
 Hydrogenophaga periphyton        454      472      0.9619        0.004655     0.2776
     Rhodoferax periphyton        457      472      0.9682         0.00428    0.06539
    Ramlibacter periphyton        454      472      0.9619        0.003046    0.02731
    Pseudomonas periphyton        456      472      0.9661        0.003006     0.1759
     Roseateles periphyton        454      472      0.9619        0.002743    0.01898
Novosphingobium periphyton        453      472      0.9597        0.002638    0.03843
   Nocardioides       soil       1038     2260      0.4593        0.002533    0.07422
  Mesorhizobium periphyton        454      472      0.9619        0.002059    0.03125
     Acidovorax periphyton        451      472      0.9555        0.002012    0.01598
   Nocardioides periphyton        444      472      0.9407        0.001868     0.4286
  Mesorhizobium       soil       1049     2260      0.4642         0.00178     0.0335
     Variovorax      plant         19      119      0.1597        0.001758    0.04669
   Nocardioides      plant         19      119      0.1597        0.001738    0.04052

# soil vs freshwater rank test per genus (zeros included over class universe)
soil_files = uni.loc[uni['macro_env'] == 'soil', 'file_id'].tolist()
fw_files   = uni.loc[uni['macro_env'] == 'freshwater', 'file_id'].tolist()
pv = presg.pivot_table(index='file_id', columns='genus', values='relab', aggfunc='sum', fill_value=0.0)
def vec(files, genus):
    idx = pv.index.intersection(files)
    base = pd.Series(0.0, index=files)
    if genus in pv.columns:
        base.loc[idx] = pv.loc[idx, genus]
    return base.values
rows = []
for gen in IMPL:
    a, b = vec(soil_files, gen), vec(fw_files, gen)
    if (a > 0).sum() + (b > 0).sum() < 5:  # too sparse to test
        continue
    try:
        u, p = mannwhitneyu(a, b, alternative='two-sided')
    except ValueError:
        continue
    ms, mf = a.mean(), b.mean()
    rows.append((gen, ms, mf, np.log2((ms + 1e-9) / (mf + 1e-9)),
                 (a > 0).mean(), (b > 0).mean(), p))
enr = pd.DataFrame(rows, columns=['genus', 'mean_soil', 'mean_fresh', 'log2_soil_fresh',
                                  'prev_soil', 'prev_fresh', 'p'])
# BH correction
enr = enr.sort_values('p').reset_index(drop=True)
m = len(enr); enr['q'] = (enr['p'] * m / (enr.index + 1)).clip(upper=1.0)
enr['q'] = enr['q'][::-1].cummin()[::-1]
print(f'genera tested soil vs freshwater: {m} (n_soil={len(soil_files)}, n_fresh={len(fw_files)})')
print('\nstrongest soil-enriched (log2>0, q<0.05):')
print(enr[(enr['log2_soil_fresh'] > 0) & (enr['q'] < 0.05)].head(12)
      .to_string(index=False, float_format=lambda x: f'{x:.3g}'))
print('\nstrongest freshwater-enriched (log2<0, q<0.05):')
print(enr[(enr['log2_soil_fresh'] < 0) & (enr['q'] < 0.05)].sort_values('log2_soil_fresh').head(12)
      .to_string(index=False, float_format=lambda x: f'{x:.3g}'))

genera tested soil vs freshwater: 83 (n_soil=2260, n_fresh=723)

strongest soil-enriched (log2>0, q<0.05):
        genus  mean_soil  mean_fresh  log2_soil_fresh  prev_soil  prev_fresh        p        q
Mycobacterium     0.0054    0.000159             5.09      0.463       0.111 2.16e-74 1.79e-72
  Terriglobus   0.000955    2.43e-05             5.29      0.454      0.0982 4.29e-73 1.48e-71
  Nitrobacter   0.000356    1.29e-05             4.79      0.438       0.083 5.34e-73 1.48e-71
       Afipia    0.00137    5.01e-05             4.77      0.452      0.0982 7.49e-73 1.55e-71
   Tardiphaga   0.000344    1.88e-05             4.19      0.441      0.0913 7.78e-70 1.29e-68
 Streptomyces    0.00659    0.000694             3.25      0.469       0.115 1.09e-69 1.51e-68
Rhodanobacter   0.000434    2.48e-05             4.13      0.447       0.101 6.22e-69 7.38e-68
Mesorhizobium    0.00178    0.000193             3.21      0.464       0.115 9.29e-69 9.64e-68
 Caballeronia   0.000465    5.22e-05             3.15      0.457       0.105 4.44e-67 4.09e-66
 Nocardioides    0.00253    0.000231             3.45      0.459       0.111 5.74e-66 4.76e-65
 Arthrobacter   0.000643    8.63e-05              2.9      0.454       0.107 1.91e-64  1.4e-63
   Reyranella    0.00106    0.000183             2.53      0.457       0.105 2.02e-64  1.4e-63

strongest freshwater-enriched (log2<0, q<0.05):
         genus  mean_soil  mean_fresh  log2_soil_fresh  prev_soil  prev_fresh        p        q
    Cellvibrio   3.33e-05    0.000152            -2.19      0.407       0.111  6.1e-41 8.16e-41
   Curvibacter   2.65e-05    8.65e-05            -1.71      0.386       0.113 8.69e-32 9.49e-32
   Alcaligenes   7.35e-06    1.97e-05            -1.42       0.31      0.0885 7.57e-29 8.06e-29
    Rhodoferax   0.000137    0.000304            -1.15       0.44       0.119 7.01e-42 9.53e-42
   Hylemonella   6.92e-06    1.28e-05           -0.886      0.308      0.0996 4.85e-23 5.04e-23
Hydrogenophaga   8.37e-05    0.000124           -0.562      0.433       0.115 8.79e-40 1.14e-39
   Escherichia    4.3e-05    5.62e-05           -0.386      0.394       0.102 2.26e-40 2.98e-40
     Comamonas   4.29e-05    5.34e-05           -0.316      0.415       0.113  1.5e-37 1.81e-37
    Acidovorax   0.000112    0.000131            -0.23      0.437       0.119 1.32e-39 1.69e-39
   Bernardetia   1.24e-06    1.41e-06           -0.192       0.17      0.0664  1.4e-10  1.4e-10

TOPN = 3
out_rows = []
for gen, sub in presg.groupby('genus'):
    top = sub.sort_values('relab', ascending=False).head(TOPN)
    for _, r in top.iterrows():
        out_rows.append((gen, r['relab'], r['sample_id'], r['macro_env'],
                         r['env_medium_term_name'], r['env_broad_scale_term_name'],
                         r['ecosystem_type'], r['specific_ecosystem'],
                         r['lat_lon_latitude'], r['lat_lon_longitude']))
outl = pd.DataFrame(out_rows, columns=['genus', 'relab', 'sample_id', 'macro_env',
                    'env_medium', 'env_broad', 'gold_ecosystem_type', 'gold_specific',
                    'lat', 'lon'])
outl = outl.sort_values('relab', ascending=False).reset_index(drop=True)
print('global top outlier (genus, sample) by relative abundance:')
print(outl.head(20)[['genus', 'relab', 'macro_env', 'env_medium', 'gold_ecosystem_type', 'lat', 'lon']]
      .to_string(index=False, float_format=lambda x: f'{x:.3g}'))
print('\nwhere do the strongest single-genus spikes live? (macro_env of each genus top hit)')
best = outl.sort_values('relab', ascending=False).drop_duplicates('genus')
print(best['macro_env'].value_counts().to_string())

global top outlier (genus, sample) by relative abundance:
           genus  relab  macro_env   env_medium gold_ecosystem_type         lat         lon
    Nocardioides  0.429 periphyton   epipsammon                 NaN   39.758206 -102.447148
  Hydrogenophaga  0.278 periphyton    epiphyton                 NaN   39.758206 -102.447148
   Mycobacterium   0.22       soil         soil                Soil   42.427091  -72.229737
   Mycobacterium  0.212       soil         soil                Soil   45.850153 -121.989672
   Mycobacterium  0.196       soil         soil                Soil   42.478565  -72.259572
      Variovorax  0.188 freshwater stream water                 NaN   39.890674 -105.911293
    Sphingomonas  0.185       soil          NaN                Soil 28.72122816 83.91438789
Mucilaginibacter  0.184       soil          NaN                Soil     40.7988   -76.33928
     Pseudomonas  0.176 periphyton    epilithon                 NaN   68.669753 -149.143018
     Polaromonas  0.156 periphyton    epilithon                 NaN   68.669753 -149.143018
    Streptomyces  0.152      other          NaN            Volcanic   19.389305   -155.2485
    Sphingomonas  0.147       soil          NaN                Soil  -13.773333  -71.071389
     Polaromonas  0.137 periphyton    epilithon                 NaN   65.153224 -147.503973
     Polaromonas  0.121 periphyton    epilithon                 NaN   65.153224 -147.503973
    Streptomyces  0.119       soil         soil                Soil   37.082813 -119.743324
  Hydrogenophaga  0.117 periphyton    epiphyton                 NaN   39.758206 -102.447148
     Pseudomonas  0.106 periphyton    epilithon                 NaN   68.669753 -149.143018
     Curvibacter 0.0995 periphyton   epipsammon                 NaN   37.059719 -119.257549
    Burkholderia 0.0992 freshwater stream water                 NaN   39.890674 -105.911293
     Rhizobacter  0.099 periphyton   epipsammon                 NaN   37.059719 -119.257549

where do the strongest single-genus spikes live? (macro_env of each genus top hit)
macro_env
periphyton    34
soil          32
freshwater     8
other          7
plant          2

# IMPORTANT: PM taxonomy.name is mostly SPECIES-level ('Sphingomonas panacis'); the genus-rank
# reference entries carry ~0 abundance. Extract genus = first token, then sum species->genus per run.
rt = (spark.table('planetmicrobe.planetmicrobe.run_to_taxonomy')
      .select('run_id', 'tax_id', 'abundance'))
taxg = (spark.table('planetmicrobe.planetmicrobe.taxonomy')
        .withColumn('genus', F.split(F.col('name'), ' ').getItem(0))
        .select('tax_id', 'genus'))
rtg = (rt.join(taxg, 'tax_id').filter(F.col('genus').isin(IMPL))
       .groupBy('run_id', 'genus').agg(F.sum('abundance').alias('abundance')))
run = spark.table('planetmicrobe.planetmicrobe.run').select('run_id', 'experiment_id')
exp = spark.table('planetmicrobe.planetmicrobe.experiment').select('experiment_id', 'sample_id')
p2s = spark.table('planetmicrobe.planetmicrobe.project_to_sample').select('project_id', 'sample_id')
proj = spark.table('planetmicrobe.planetmicrobe.project').select('project_id', F.col('name').alias('project'))
s2e = spark.table('planetmicrobe.planetmicrobe.sample_to_sampling_event').select('sample_id', 'sampling_event_id')
se = spark.table('planetmicrobe.planetmicrobe.sampling_event').select('sampling_event_id', 'campaign_id')
camp = spark.table('planetmicrobe.planetmicrobe.campaign').select('campaign_id', F.col('name').alias('campaign'))
labels = (run.join(exp, 'experiment_id')
          .join(p2s, 'sample_id', 'left').join(proj, 'project_id', 'left')
          .join(s2e, 'sample_id', 'left').join(se, 'sampling_event_id', 'left')
          .join(camp, 'campaign_id', 'left')
          .select('run_id', 'project', 'campaign').dropDuplicates(['run_id']))
pmdf = rtg.join(labels, 'run_id', 'left').select('run_id', 'genus', 'abundance', 'project', 'campaign').toPandas()
n_runs_pm = spark.table('planetmicrobe.planetmicrobe.run_to_taxonomy').select('run_id').distinct().count()
# abundance is a real 0-1 fraction but zero-inflated; separate 'listed in run' from 'positive abundance'
pmdf['present'] = pmdf['abundance'] > 0
print('PM implicated-genus (run x genus) rows:', len(pmdf), '| runs w/ taxonomy:', n_runs_pm)
pm_g = (pmdf.groupby('genus')
        .agg(n_runs_listed=('run_id', 'nunique'),
             n_runs_pos=('present', 'sum'),
             mean_abund=('abundance', 'mean'),
             max_abund=('abundance', 'max')).reset_index())
pm_g['prevalence_pos'] = pm_g['n_runs_pos'] / n_runs_pm
print('\ntop implicated genera in marine (Planet Microbe), by mean abundance:')
print(pm_g.sort_values('mean_abund', ascending=False).head(15)
      .to_string(index=False, float_format=lambda x: f'{x:.3e}'))
print('\nMagnitude note: most implicated (terrestrial/freshwater) genera are detectable but')
print('low in open-ocean runs (~1e-3 to 1e-4); a few genuine marine members (e.g. Alteromonas)')
print('reach ~1e-2. So marine is mostly a NEGATIVE biome contrast, with marine-native exceptions.')
print('\nby project:')
print(pmdf['project'].value_counts().head(8).to_string())

PM implicated-genus (run x genus) rows: 20289 | runs w/ taxonomy: 302

top implicated genera in marine (Planet Microbe), by mean abundance:
           genus  n_runs_listed  n_runs_pos  mean_abund  max_abund  prevalence_pos
     Alteromonas            300         240   4.828e-02  1.000e+00       7.947e-01
     Pseudomonas            302         206   1.081e-02  5.434e-01       6.821e-01
    Sphingomonas            301         160   6.343e-03  2.747e-01       5.298e-01
     Sphingobium            301         140   3.244e-03  1.494e-01       4.636e-01
    Sphingopyxis            300         133   3.172e-03  2.199e-01       4.404e-01
      Acidovorax            300         119   2.370e-03  1.956e-01       3.940e-01
  Staphylococcus            301         126   2.222e-03  1.352e-01       4.172e-01
Paraburkholderia            300          54   1.431e-03  2.412e-01       1.788e-01
   Brevundimonas            301          76   1.345e-03  7.803e-02       2.517e-01
    Burkholderia            301         104   8.621e-04  1.855e-01       3.444e-01
    Streptomyces            302         102   7.847e-04  6.666e-02       3.377e-01
     Rhodococcus            300          77   5.032e-04  4.299e-02       2.550e-01
  Microbacterium            299          65   4.430e-04  6.592e-02       2.152e-01
 Novosphingobium            301          84   4.246e-04  4.105e-02       2.781e-01
   Mycobacterium            301          91   4.008e-04  2.676e-02       3.013e-01

Magnitude note: most implicated (terrestrial/freshwater) genera are detectable but
low in open-ocean runs (~1e-3 to 1e-4); a few genuine marine members (e.g. Alteromonas)
reach ~1e-2. So marine is mostly a NEGATIVE biome contrast, with marine-native exceptions.

by project:
project
OSD                               10194
GOS                                4723
HOT 194-215                        2992
HOT 144-166                        1088
HOT 194-215 Metatranscriptomes     1020
Tara Oceans                         272

# NMDC long rows (main classes only)
nmdc_atlas = g[g['macro_env'].isin(MAIN)][['genus', 'macro_env', 'n_present', 'n_class',
             'prevalence', 'mean_relab_all', 'mean_present', 'max_relab']].copy()
nmdc_atlas['source'] = 'NMDC'
nmdc_atlas = nmdc_atlas.rename(columns={'macro_env': 'env_class', 'mean_relab_all': 'mean_relab'})
# PM marine rows (n_present = runs with POSITIVE abundance, not merely listed)
pm_atlas = pm_g.rename(columns={'mean_abund': 'mean_relab', 'max_abund': 'max_relab',
                                'n_runs_pos': 'n_present', 'prevalence_pos': 'prevalence'}).copy()
pm_atlas['env_class'] = 'marine'; pm_atlas['source'] = 'PlanetMicrobe'
pm_atlas['n_class'] = n_runs_pm; pm_atlas['mean_present'] = pm_atlas['mean_relab']
cols = ['genus', 'env_class', 'source', 'n_present', 'n_class', 'prevalence',
        'mean_relab', 'mean_present', 'max_relab']
atlas = pd.concat([nmdc_atlas[cols], pm_atlas[cols]], ignore_index=True)
atlas = atlas.merge(gctx, on='genus', how='left')
atlas = atlas.sort_values(['genus', 'mean_relab'], ascending=[True, False]).reset_index(drop=True)
atlas.to_csv(DATA / 'env_atlas_global.tsv', sep='\t', index=False)
print('wrote data/env_atlas_global.tsv', atlas.shape)
# enrichment + outliers as companion tables
enr.merge(gctx, on='genus', how='left').to_csv(DATA / 'env_enrichment_soil_fresh.tsv', sep='\t', index=False)
outl.merge(gctx, on='genus', how='left').to_csv(DATA / 'env_outlier_samples.tsv', sep='\t', index=False)
print('wrote data/env_enrichment_soil_fresh.tsv', enr.shape)
print('wrote data/env_outlier_samples.tsv', outl.shape)

wrote data/env_atlas_global.tsv (400, 11)
wrote data/env_enrichment_soil_fresh.tsv (83, 8)
wrote data/env_outlier_samples.tsv (249, 10)

# Figure 1: genus x env-class mean relative-abundance heatmap (top genera by overall signal)
mat = atlas.pivot_table(index='genus', columns='env_class', values='mean_relab', aggfunc='max', fill_value=0)
order_classes = [c for c in ['soil', 'freshwater', 'sediment', 'plant', 'host', 'periphyton', 'marine'] if c in mat.columns]
mat = mat[order_classes]
top_g = mat.max(axis=1).sort_values(ascending=False).head(30).index
mat = mat.loc[top_g]
fig, ax = plt.subplots(figsize=(7, 9))
im = ax.imshow(np.log10(mat.values + 1e-5), aspect='auto', cmap='viridis')
ax.set_xticks(range(len(mat.columns))); ax.set_xticklabels(mat.columns, rotation=45, ha='right')
ax.set_yticks(range(len(mat.index))); ax.set_yticklabels(mat.index, fontsize=8)
ax.set_title('Implicated utilizer genera: mean relative abundance by environment\n(log10, NMDC + Planet Microbe)')
cb = fig.colorbar(im, ax=ax, shrink=0.6); cb.set_label('log10 mean relative abundance')
fig.tight_layout(); fig.savefig(FIG / '07b_env_atlas_heatmap.png', dpi=150); plt.close(fig)
print('saved 07b_env_atlas_heatmap.png', mat.shape)

saved 07b_env_atlas_heatmap.png (30, 5)

# Figure 2: soil-vs-freshwater enrichment volcano-ish (effect size vs significance)
fig, ax = plt.subplots(figsize=(7, 6))
sig = enr['q'] < 0.05
ax.scatter(enr.loc[~sig, 'log2_soil_fresh'], -np.log10(enr.loc[~sig, 'p'] + 1e-300),
           c='#bbbbbb', s=18, label='q>=0.05')
ax.scatter(enr.loc[sig, 'log2_soil_fresh'], -np.log10(enr.loc[sig, 'p'] + 1e-300),
           c='#d7301f', s=26, label='q<0.05')
for _, r in enr[sig].sort_values('p').head(10).iterrows():
    ax.annotate(r['genus'], (r['log2_soil_fresh'], -np.log10(r['p'] + 1e-300)),
                fontsize=7, xytext=(3, 3), textcoords='offset points')
ax.axvline(0, color='k', lw=0.6)
ax.set_xlabel('log2(mean relab soil / freshwater)   <-fresh  | soil->')
ax.set_ylabel('-log10 p (Mann-Whitney, exploratory)')
ax.set_title('Implicated genera: soil vs freshwater differential abundance (NMDC)')
ax.legend(fontsize=8)
fig.tight_layout(); fig.savefig(FIG / '07b_soil_fresh_enrichment.png', dpi=150); plt.close(fig)
print('saved 07b_soil_fresh_enrichment.png')

saved 07b_soil_fresh_enrichment.png

print('=' * 64)
print('NB07b GLOBAL ENVIRONMENTAL ATLAS — HEADLINE')
print('=' * 64)
print(f'implicated genera queried           : {len(IMPL)}')
print(f'NMDC taxonomy-bearing metagenomes   : {len(uni)} (denominator = covstats files w/ taxonomy)')
print(f'NMDC genera detected                : {presg["genus"].nunique()}  '
      f'(in {presg["file_id"].nunique()} metagenomes)')
lab = uni[uni["macro_env"] != "unlabeled"]["file_id"].nunique()
print(f'NMDC samples with env label         : {lab}/{len(uni)} '
      f'({100*lab/len(uni):.0f}%, two ontologies: ENVO triad + GOLD path)')
pm_pos = int((pm_g["n_runs_pos"] > 0).sum())
print(f'PM (marine) genera w/ positive abund: {pm_pos}/{pmdf["genus"].nunique()} listed  (in {n_runs_pm} runs)')
n_sig = int((enr['q'] < 0.05).sum())
print(f'soil-vs-freshwater genera tested     : {len(enr)}; q<0.05: {n_sig} (exploratory)')
print('\nCAVEATS (carried into the report):')
print('  - No compound measurements in any env dataset -> genus abundance is a BIOME PROXY,')
print('    not catabolic activity. Enrichment != the genus degrades the compound there.')
print('  - Relative abundance is compositional & zero-inflated; rank-test p-values exploratory.')
print('  - PM marine arm is small (302 runs) and gene-content-blind; presence/abundance only.')
print('  - Outlier mode (Arm B) is label-robust and the most defensible discovery signal.')
spark.stop()
print('\nspark stopped; NB07b complete')

================================================================
NB07b GLOBAL ENVIRONMENTAL ATLAS — HEADLINE
================================================================
implicated genera queried           : 86
NMDC taxonomy-bearing metagenomes   : 3825 (denominator = covstats files w/ taxonomy)
NMDC genera detected                : 83  (in 1719 metagenomes)
NMDC samples with env label         : 3795/3825 (99%, two ontologies: ENVO triad + GOLD path)
PM (marine) genera w/ positive abund: 68/68 listed  (in 302 runs)
soil-vs-freshwater genera tested     : 83; q<0.05: 83 (exploratory)

CAVEATS (carried into the report):
  - No compound measurements in any env dataset -> genus abundance is a BIOME PROXY,
    not catabolic activity. Enrichment != the genus degrades the compound there.
  - Relative abundance is compositional & zero-inflated; rank-test p-values exploratory.
  - PM marine arm is small (302 runs) and gene-content-blind; presence/abundance only.
  - Outlier mode (Arm B) is label-robust and the most defensible discovery signal.

spark stopped; NB07b complete

07B Environmental Atlas Global

NB07b — Global Environmental Atlas (broad cross-environment arm of deliverable c)¶

Arm A — NMDC genus relative abundance with sample-level environment labels¶

Arm A enrichment — abundance by environment class¶

Arm B — label-free outlier samples¶

Arm C — Planet Microbe marine contrast¶

Arm D — synthesis: the global environmental atlas¶