Nb05 Differential Abundance
Jupyter notebook from the Lignin Enrichment and Ecological Memory in Microbial Communities project.
NB05: Differential Abundance¶
ALDEx2-style CLR + Wilcoxon rank-sum test with BH-FDR correction.
Contrasts: Base vs L, Base vs LC, L vs LC, L-L vs LC-L (history), L-LC vs LC-LC (history)
In [1]:
import pandas as pd, numpy as np
from scipy import stats
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt, warnings
warnings.filterwarnings('ignore')
otu_16s = pd.read_csv('data/16S/otu_table_filtered.tsv', sep='\t', index_col=0)
otu_its = pd.read_csv('data/ITS/otu_table_filtered.tsv', sep='\t', index_col=0)
tax_16s = pd.read_csv('data/16S/taxonomy.tsv', sep='\t', index_col=0)
tax_its = pd.read_csv('data/ITS/taxonomy.tsv', sep='\t', index_col=0)
meta = pd.read_csv('data/sample_metadata.tsv', sep='\t').set_index('sample_id')
In [1]:
def clr(df):
c = df.values.astype(float) + 0.5
lc = np.log(c); gm = lc.mean(axis=1, keepdims=True)
return pd.DataFrame(lc - gm, index=df.index, columns=df.columns)
def get_genus(otu, tax):
if otu in tax.index and 'Genus' in tax.columns:
v = tax.loc[otu, 'Genus']
return str(v) if pd.notna(v) else 'Unclassified'
return 'Unclassified'
def da_test(otu_df, g1s, g2s, tax_df):
c = clr(otu_df)
rows = []
for otu in otu_df.columns:
v1, v2 = c.loc[g1s, otu].values, c.loc[g2s, otu].values
effect = v2.mean() - v1.mean()
try: _, pw = stats.mannwhitneyu(v1, v2, alternative='two-sided')
except: pw = 1.0
ra1 = otu_df.loc[g1s, otu].mean() / otu_df.loc[g1s].sum(axis=1).mean() * 100
ra2 = otu_df.loc[g2s, otu].mean() / otu_df.loc[g2s].sum(axis=1).mean() * 100
rows.append({'OTU': otu, 'Genus': get_genus(otu, tax_df),
'mean_RA_g1': ra1, 'mean_RA_g2': ra2, 'CLR_effect': effect, 'p_wilcox': pw})
df = pd.DataFrame(rows).sort_values('p_wilcox')
n = len(df); df['rank'] = range(1, n+1)
df['p_adj'] = np.minimum(1, df['p_wilcox'] * n / df['rank'])
for i in range(n-2, -1, -1):
df.iloc[i, df.columns.get_loc('p_adj')] = min(df.iloc[i]['p_adj'], df.iloc[i+1]['p_adj'])
return df.sort_values('CLR_effect', ascending=False)
contrasts = [('Base_vs_L','Base','L'), ('Base_vs_LC','Base','LC'), ('L_vs_LC','L','LC'),
('LL_vs_LCL','L-L','LC-L'), ('LLC_vs_LCLC','L-LC','LC-LC')]
for marker, otu, tax in [('16S', otu_16s, tax_16s), ('ITS', otu_its, tax_its)]:
print(f"\n{'='*60}\n {marker} Differential Abundance\n{'='*60}")
for name, g1, g2 in contrasts:
s1 = [s for s in otu.index if meta.loc[s,'group_label']==g1]
s2 = [s for s in otu.index if meta.loc[s,'group_label']==g2]
if len(s1)<2 or len(s2)<2: continue
res = da_test(otu, s1, s2, tax)
res.to_csv(f'data/{marker}/DA_{name}.tsv', sep='\t', index=False)
sig = len(res[res['p_adj']<0.05])
print(f"\n {g1} vs {g2}: {len(res)} OTUs tested, {sig} significant (FDR<0.05)")
print(f" Top enriched in {g2}: {res.head(3)[['Genus','CLR_effect']].to_string(index=False)}")
============================================================
16S Differential Abundance
============================================================
Base vs L: 1793 OTUs tested, 0 significant (FDR<0.05)
Top enriched in L: Genus CLR_effect
Acinetobacter 12.34
Pseudomonas 11.52
Flavobacterium 11.17
Base vs LC: 1793 OTUs tested, 0 significant (FDR<0.05)
Top enriched in LC: Genus CLR_effect
Acinetobacter 13.25
Aeromonas 12.19
Aeromonas 11.41
L vs LC: 1793 OTUs tested, 0 significant (FDR<0.05)
Top enriched in LC: Genus CLR_effect
Shewanella 6.62
Unclassified 6.55
Hafnia-Obesumbact 6.51
L-L vs LC-L: 1793 OTUs tested, 0 significant (FDR<0.05)
Top enriched in LC-L: Genus CLR_effect
Acinetobacter 8.17
Pseudomonas 6.18
Sphingobacterium 5.81
L-LC vs LC-LC: 1793 OTUs tested, 0 significant (FDR<0.05)
Top enriched in LC-LC: Genus CLR_effect
Shewanella 6.36
Pedobacter 5.03
Hafnia-Obesumbact 4.56
============================================================
ITS Differential Abundance
============================================================
Base vs L: 440 OTUs tested, 0 significant (FDR<0.05)
Top enriched in L: Genus CLR_effect
Fusicolla 14.03
Malassezia 8.76
Fusarium 7.63
Base vs LC: 440 OTUs tested, 0 significant (FDR<0.05)
Top enriched in LC: Genus CLR_effect
Unclassified 13.59
Unclassified 7.93
Aspergillus 7.49
L vs LC: 440 OTUs tested, 0 significant (FDR<0.05)
Top enriched in LC: Genus CLR_effect
Unclassified 10.25
Chrysosporium 8.33
Unclassified 4.59
L-L vs LC-L: 440 OTUs tested, 0 significant (FDR<0.05)
L-LC vs LC-LC: 440 OTUs tested, 0 significant (FDR<0.05)
Heatmaps and Volcano Plot¶
In [1]:
# Targeted genera heatmaps
for marker, otu_df, tax_df, targets in [
('16S', otu_16s, tax_16s, ['Pseudomonas','Acinetobacter','Aeromonas','Enterobacter','Flavobacterium','Comamonas','Sphingomonas','Rhodococcus','Aminobacter','Azospirillum','Arthrobacter','Peredibacter','Shewanella','Sphingobacterium','Methylotenera']),
('ITS', otu_its, tax_its, ['Fusarium','Fusicolla','Chrysosporium','Aspergillus','Malassezia','Pleurotus','Preussia','Mortierella','Albifimbria','Chaetomium','Gibellulopsis','Alternaria'])
]:
ra = otu_df.div(otu_df.sum(axis=1), axis=0) * 100
gmap = {o: get_genus(o, tax_df) for o in ra.columns}
ra.columns = [gmap.get(c,'Unclassified') for c in ra.columns]
ra = ra.T.groupby(ra.columns).sum().T
targets = [t for t in targets if t in ra.columns]
gm = pd.DataFrame([ra.loc[[s for s in ra.index if meta.loc[s,'group_label']==g], targets].mean()
for g in ['Base','L','LC','L-L','LC-L','L-LC','LC-LC']],
index=['Base','L','LC','L-L','LC-L','L-LC','LC-LC'])
fig, ax = plt.subplots(figsize=(14, 6))
im = ax.imshow(gm.values, cmap='YlOrRd', aspect='auto')
ax.set_xticks(range(len(targets))); ax.set_xticklabels(targets, rotation=45, ha='right')
ax.set_yticks(range(7)); ax.set_yticklabels(['Base','L','LC','L-L','LC-L','L-LC','LC-LC'])
for i in range(7):
for j in range(len(targets)):
v = gm.iloc[i,j]; c = 'white' if v > gm.values.max()*0.6 else 'black'
ax.text(j, i, f'{v:.1f}', ha='center', va='center', fontsize=8, color=c)
plt.colorbar(im, ax=ax, label='Mean RA (%)')
ax.set_title(f'{marker} Key Genera Heatmap')
plt.tight_layout(); plt.savefig(f'figures/{marker}_targeted_heatmap.png', dpi=150); plt.close()
# Volcano plot
da = pd.read_csv('data/16S/DA_Base_vs_L.tsv', sep='\t')
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(da['CLR_effect'], -np.log10(da['p_wilcox']), c='grey', alpha=0.5, s=20)
top = da[da['CLR_effect'].abs() > 8]
for _, r in top.iterrows():
c = '#EE6677' if r['CLR_effect'] > 0 else '#4477AA'
ax.scatter(r['CLR_effect'], -np.log10(r['p_wilcox']), c=c, s=50, edgecolors='black', zorder=5)
ax.annotate(str(r['Genus']), (r['CLR_effect'], -np.log10(r['p_wilcox'])), fontsize=7)
ax.axhline(-np.log10(0.05), color='red', ls='--', alpha=0.5)
ax.set_xlabel('CLR Effect'); ax.set_ylabel('-log10(p)')
ax.set_title('16S: Base vs Lignin Volcano Plot')
plt.tight_layout(); plt.savefig('figures/16S_volcano_base_vs_L.png', dpi=150); plt.close()
print("Saved heatmaps and volcano plot")
Saved heatmaps and volcano plot