import pandas as pd, numpy as np
from scipy import stats
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt, warnings
warnings.filterwarnings('ignore')

otu_16s = pd.read_csv('data/16S/otu_table_filtered.tsv', sep='\t', index_col=0)
otu_its = pd.read_csv('data/ITS/otu_table_filtered.tsv', sep='\t', index_col=0)
tax_16s = pd.read_csv('data/16S/taxonomy.tsv', sep='\t', index_col=0)
tax_its = pd.read_csv('data/ITS/taxonomy.tsv', sep='\t', index_col=0)
meta = pd.read_csv('data/sample_metadata.tsv', sep='\t').set_index('sample_id')

def clr(df):
    c = df.values.astype(float) + 0.5
    lc = np.log(c); gm = lc.mean(axis=1, keepdims=True)
    return pd.DataFrame(lc - gm, index=df.index, columns=df.columns)

def get_genus(otu, tax):
    if otu in tax.index and 'Genus' in tax.columns:
        v = tax.loc[otu, 'Genus']
        return str(v) if pd.notna(v) else 'Unclassified'
    return 'Unclassified'

def da_test(otu_df, g1s, g2s, tax_df):
    c = clr(otu_df)
    rows = []
    for otu in otu_df.columns:
        v1, v2 = c.loc[g1s, otu].values, c.loc[g2s, otu].values
        effect = v2.mean() - v1.mean()
        try: _, pw = stats.mannwhitneyu(v1, v2, alternative='two-sided')
        except: pw = 1.0
        ra1 = otu_df.loc[g1s, otu].mean() / otu_df.loc[g1s].sum(axis=1).mean() * 100
        ra2 = otu_df.loc[g2s, otu].mean() / otu_df.loc[g2s].sum(axis=1).mean() * 100
        rows.append({'OTU': otu, 'Genus': get_genus(otu, tax_df),
                     'mean_RA_g1': ra1, 'mean_RA_g2': ra2, 'CLR_effect': effect, 'p_wilcox': pw})
    df = pd.DataFrame(rows).sort_values('p_wilcox')
    n = len(df); df['rank'] = range(1, n+1)
    df['p_adj'] = np.minimum(1, df['p_wilcox'] * n / df['rank'])
    for i in range(n-2, -1, -1):
        df.iloc[i, df.columns.get_loc('p_adj')] = min(df.iloc[i]['p_adj'], df.iloc[i+1]['p_adj'])
    return df.sort_values('CLR_effect', ascending=False)

contrasts = [('Base_vs_L','Base','L'), ('Base_vs_LC','Base','LC'), ('L_vs_LC','L','LC'),
             ('LL_vs_LCL','L-L','LC-L'), ('LLC_vs_LCLC','L-LC','LC-LC')]

for marker, otu, tax in [('16S', otu_16s, tax_16s), ('ITS', otu_its, tax_its)]:
    print(f"\n{'='*60}\n  {marker} Differential Abundance\n{'='*60}")
    for name, g1, g2 in contrasts:
        s1 = [s for s in otu.index if meta.loc[s,'group_label']==g1]
        s2 = [s for s in otu.index if meta.loc[s,'group_label']==g2]
        if len(s1)<2 or len(s2)<2: continue
        res = da_test(otu, s1, s2, tax)
        res.to_csv(f'data/{marker}/DA_{name}.tsv', sep='\t', index=False)
        sig = len(res[res['p_adj']<0.05])
        print(f"\n  {g1} vs {g2}: {len(res)} OTUs tested, {sig} significant (FDR<0.05)")
        print(f"  Top enriched in {g2}: {res.head(3)[['Genus','CLR_effect']].to_string(index=False)}")

============================================================
  16S Differential Abundance
============================================================

  Base vs L: 1793 OTUs tested, 0 significant (FDR<0.05)
  Top enriched in L:          Genus  CLR_effect
      Acinetobacter       12.34
        Pseudomonas       11.52
     Flavobacterium       11.17

  Base vs LC: 1793 OTUs tested, 0 significant (FDR<0.05)
  Top enriched in LC:          Genus  CLR_effect
      Acinetobacter       13.25
          Aeromonas       12.19
          Aeromonas       11.41

  L vs LC: 1793 OTUs tested, 0 significant (FDR<0.05)
  Top enriched in LC:           Genus  CLR_effect
         Shewanella        6.62
       Unclassified        6.55
  Hafnia-Obesumbact        6.51

  L-L vs LC-L: 1793 OTUs tested, 0 significant (FDR<0.05)
  Top enriched in LC-L:          Genus  CLR_effect
      Acinetobacter        8.17
        Pseudomonas        6.18
   Sphingobacterium        5.81

  L-LC vs LC-LC: 1793 OTUs tested, 0 significant (FDR<0.05)
  Top enriched in LC-LC:          Genus  CLR_effect
         Shewanella        6.36
         Pedobacter        5.03
  Hafnia-Obesumbact        4.56

============================================================
  ITS Differential Abundance
============================================================

  Base vs L: 440 OTUs tested, 0 significant (FDR<0.05)
  Top enriched in L:      Genus  CLR_effect
      Fusicolla       14.03
     Malassezia        8.76
       Fusarium        7.63

  Base vs LC: 440 OTUs tested, 0 significant (FDR<0.05)
  Top enriched in LC:          Genus  CLR_effect
       Unclassified       13.59
       Unclassified        7.93
        Aspergillus        7.49

  L vs LC: 440 OTUs tested, 0 significant (FDR<0.05)
  Top enriched in LC:          Genus  CLR_effect
       Unclassified       10.25
      Chrysosporium        8.33
       Unclassified        4.59

  L-L vs LC-L: 440 OTUs tested, 0 significant (FDR<0.05)
  L-LC vs LC-LC: 440 OTUs tested, 0 significant (FDR<0.05)

# Targeted genera heatmaps
for marker, otu_df, tax_df, targets in [
    ('16S', otu_16s, tax_16s, ['Pseudomonas','Acinetobacter','Aeromonas','Enterobacter','Flavobacterium','Comamonas','Sphingomonas','Rhodococcus','Aminobacter','Azospirillum','Arthrobacter','Peredibacter','Shewanella','Sphingobacterium','Methylotenera']),
    ('ITS', otu_its, tax_its, ['Fusarium','Fusicolla','Chrysosporium','Aspergillus','Malassezia','Pleurotus','Preussia','Mortierella','Albifimbria','Chaetomium','Gibellulopsis','Alternaria'])
]:
    ra = otu_df.div(otu_df.sum(axis=1), axis=0) * 100
    gmap = {o: get_genus(o, tax_df) for o in ra.columns}
    ra.columns = [gmap.get(c,'Unclassified') for c in ra.columns]
    ra = ra.T.groupby(ra.columns).sum().T
    targets = [t for t in targets if t in ra.columns]
    gm = pd.DataFrame([ra.loc[[s for s in ra.index if meta.loc[s,'group_label']==g], targets].mean()
                        for g in ['Base','L','LC','L-L','LC-L','L-LC','LC-LC']],
                       index=['Base','L','LC','L-L','LC-L','L-LC','LC-LC'])
    fig, ax = plt.subplots(figsize=(14, 6))
    im = ax.imshow(gm.values, cmap='YlOrRd', aspect='auto')
    ax.set_xticks(range(len(targets))); ax.set_xticklabels(targets, rotation=45, ha='right')
    ax.set_yticks(range(7)); ax.set_yticklabels(['Base','L','LC','L-L','LC-L','L-LC','LC-LC'])
    for i in range(7):
        for j in range(len(targets)):
            v = gm.iloc[i,j]; c = 'white' if v > gm.values.max()*0.6 else 'black'
            ax.text(j, i, f'{v:.1f}', ha='center', va='center', fontsize=8, color=c)
    plt.colorbar(im, ax=ax, label='Mean RA (%)')
    ax.set_title(f'{marker} Key Genera Heatmap')
    plt.tight_layout(); plt.savefig(f'figures/{marker}_targeted_heatmap.png', dpi=150); plt.close()

# Volcano plot
da = pd.read_csv('data/16S/DA_Base_vs_L.tsv', sep='\t')
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(da['CLR_effect'], -np.log10(da['p_wilcox']), c='grey', alpha=0.5, s=20)
top = da[da['CLR_effect'].abs() > 8]
for _, r in top.iterrows():
    c = '#EE6677' if r['CLR_effect'] > 0 else '#4477AA'
    ax.scatter(r['CLR_effect'], -np.log10(r['p_wilcox']), c=c, s=50, edgecolors='black', zorder=5)
    ax.annotate(str(r['Genus']), (r['CLR_effect'], -np.log10(r['p_wilcox'])), fontsize=7)
ax.axhline(-np.log10(0.05), color='red', ls='--', alpha=0.5)
ax.set_xlabel('CLR Effect'); ax.set_ylabel('-log10(p)')
ax.set_title('16S: Base vs Lignin Volcano Plot')
plt.tight_layout(); plt.savefig('figures/16S_volcano_base_vs_L.png', dpi=150); plt.close()
print("Saved heatmaps and volcano plot")

Saved heatmaps and volcano plot

Nb05 Differential Abundance

NB05: Differential Abundance¶

Heatmaps and Volcano Plot¶