from pathlib import Path
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multitest import multipletests

DATA_DIR = Path('../data')
presence = pd.read_parquet(DATA_DIR / 'cohort_og_presence.parquet')
cohort = pd.read_csv(DATA_DIR / 'cohort_assignments.tsv', sep='\t')

anchor_ids = set(cohort.loc[cohort['cohort_class']=='anchor_deep_clay', 'genome_id'])
baseline_ids = set(cohort.loc[cohort['cohort_class']=='soil_baseline', 'genome_id'])
n_anchor = len(anchor_ids); n_base = len(baseline_ids)
print(f'anchor n={n_anchor}, baseline n={n_base}')

# Build per-OG counts
presence['is_anchor'] = presence['genome_id'].isin(anchor_ids)
presence['is_base']   = presence['genome_id'].isin(baseline_ids)
og_counts = (presence.groupby('og_id')
             .agg(pos_anchor=('is_anchor', 'sum'),
                  pos_base=('is_base', 'sum'))
             .reset_index())
print(f'unique OGs: {len(og_counts)}')

anchor n=10, baseline n=62
unique OGs: 14109

# Fisher's exact per OG (vectorized via apply)
def fisher_row(row, n_a=n_anchor, n_b=n_base):
    a = int(row['pos_anchor']); b = int(row['pos_base'])
    table = [[a, n_a - a], [b, n_b - b]]
    odds, p = stats.fisher_exact(table, alternative='two-sided')
    return pd.Series({'odds_ratio': odds, 'p_value': p})

stats_df = og_counts.apply(fisher_row, axis=1)
og = pd.concat([og_counts, stats_df], axis=1)
og['rate_anchor'] = og['pos_anchor'] / n_anchor
og['rate_base']   = og['pos_base']   / n_base
og['fold_diff']   = og['rate_anchor'] / og['rate_base'].replace(0, np.nan)
og['p_BH'] = multipletests(og['p_value'].fillna(1).values, method='fdr_bh')[1]
print(f'OGs total: {len(og)}; raw p<0.05: {(og["p_value"]<0.05).sum()}; q<0.05: {(og["p_BH"]<0.05).sum()}')

OGs total: 14109; raw p<0.05: 2415; q<0.05: 611

enriched = og[
    (og['p_BH'] < 0.05) &
    (og['rate_anchor'] > og['rate_base']) &
    (og['pos_anchor'] >= 3) &
    ((og['fold_diff'].fillna(np.inf) >= 3) | (og['rate_base'] == 0))
].sort_values(['p_BH','rate_anchor'], ascending=[True, False]).copy()
print(f'anchor-enriched OGs (q<0.05, fold>=3, n_anchor>=3): {len(enriched)}')

depleted = og[
    (og['p_BH'] < 0.05) &
    (og['rate_anchor'] < og['rate_base']) &
    (og['pos_base'] >= 5)
].sort_values('p_BH').copy()
print(f'anchor-depleted OGs: {len(depleted)}')

enriched.head(20)

anchor-enriched OGs (q<0.05, fold>=3, n_anchor>=3): 547
anchor-depleted OGs: 26

out_path = DATA_DIR / 'og_enrichment.tsv'
enriched.to_csv(out_path, sep='\t', index=False)
print(f'wrote {len(enriched)} enriched OGs to {out_path}')

out_dep = DATA_DIR / 'og_depletion.tsv'
depleted.to_csv(out_dep, sep='\t', index=False)
print(f'wrote {len(depleted)} depleted OGs to {out_dep}')

# Also write the full per-OG table for downstream NB04
og.to_parquet(DATA_DIR / 'og_per_test.parquet', index=False)
print(f'wrote full per-OG table ({len(og)} rows) to og_per_test.parquet')

wrote 547 enriched OGs to ../data/og_enrichment.tsv
wrote 26 depleted OGs to ../data/og_depletion.tsv
wrote full per-OG table (14109 rows) to og_per_test.parquet

	og_id	pos_anchor	pos_base	odds_ratio	p_value	rate_anchor	rate_base	fold_diff	p_BH
3606	1UIFM	7	0	inf	8.146033e-08	0.7	0.000000	NaN	0.000383
4863	1UYVZ	7	0	inf	8.146033e-08	0.7	0.000000	NaN	0.000383
8268	1VDS4	7	0	inf	8.146033e-08	0.7	0.000000	NaN	0.000383
1177	1TQFI	9	5	102.600000	2.184155e-07	0.9	0.080645	11.160000	0.000514
5291	1V0WU	9	5	102.600000	2.184155e-07	0.9	0.080645	11.160000	0.000514
1970	1TRYW	8	2	120.000000	1.598547e-07	0.8	0.032258	24.800000	0.000514
13128	COG1977	10	11	inf	6.577922e-07	1.0	0.177419	5.636364	0.000928
5801	1V2WV	7	1	142.333333	6.253647e-07	0.7	0.016129	43.400000	0.000928
6773	1V6QD	7	1	142.333333	6.253647e-07	0.7	0.016129	43.400000	0.000928
14055	arCOG04899	7	1	142.333333	6.253647e-07	0.7	0.016129	43.400000	0.000928
1864	1TRR9	8	4	58.000000	1.658702e-06	0.8	0.064516	12.400000	0.001232
2249	1TSKR	8	4	58.000000	1.658702e-06	0.8	0.064516	12.400000	0.001232
3736	1UJAW	8	4	58.000000	1.658702e-06	0.8	0.064516	12.400000	0.001232
8709	1VFGN	8	4	58.000000	1.658702e-06	0.8	0.064516	12.400000	0.001232
3648	1UIRV	6	0	inf	1.344095e-06	0.6	0.000000	NaN	0.001232
5196	1V0EP	6	0	inf	1.344095e-06	0.6	0.000000	NaN	0.001232
8110	1VCU0	6	0	inf	1.344095e-06	0.6	0.000000	NaN	0.001232
9786	1VMJ0	6	0	inf	1.344095e-06	0.6	0.000000	NaN	0.001232
10835	1W0RM	6	0	inf	1.344095e-06	0.6	0.000000	NaN	0.001232
4662	1UY3F	9	8	60.750000	2.529780e-06	0.9	0.129032	6.975000	0.001656

03 Og Enrichment

NB03 — Per-OG Cultivation-Habitat Enrichment Test (H1)¶

Filter to anchor-enriched + significant + min-support¶