Nb02 Composition Overview
Jupyter notebook from the Lignin Enrichment and Ecological Memory in Microbial Communities project.
NB02: Composition Overview, Rarefaction, and Replicate Consistency¶
- Pre-registered OTU filtering (prevalence >= 2, abundance >= 10)
- Taxonomy bar plots (phylum and genus)
- Rarefaction curves
- Rarefaction to even depth (16S: 24,000; ITS: 5,900)
- Replicate consistency via Bray-Curtis dendrograms
In [1]:
import pandas as pd, numpy as np
from scipy.spatial.distance import squareform, pdist
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt
otu_16s = pd.read_csv('data/16S/otu_table.tsv', sep='\t', index_col=0)
otu_its = pd.read_csv('data/ITS/otu_table.tsv', sep='\t', index_col=0)
tax_16s = pd.read_csv('data/16S/taxonomy.tsv', sep='\t', index_col=0)
tax_its = pd.read_csv('data/ITS/taxonomy.tsv', sep='\t', index_col=0)
meta = pd.read_csv('data/sample_metadata.tsv', sep='\t').set_index('sample_id')
group_order = ['Base', 'L', 'LC', 'L-L', 'LC-L', 'L-LC', 'LC-LC']
print(f"16S: {otu_16s.shape}, ITS: {otu_its.shape}")
16S: (21, 3392), ITS: (21, 893)
OTU Filtering¶
In [1]:
def filter_otus(df, min_prev=2, min_abund=10):
keep = ((df > 0).sum(axis=0) >= min_prev) & (df.sum(axis=0) >= min_abund)
return df.loc[:, keep]
filt_16s = filter_otus(otu_16s)
filt_its = filter_otus(otu_its)
its_depths = filt_its.sum(axis=1)
low = its_depths[its_depths < 1000]
print(f"ITS samples <1000 reads: {dict(low)}")
filt_its = filt_its.drop(low.index)
print(f"16S: {otu_16s.shape[1]} -> {filt_16s.shape[1]} OTUs")
print(f"ITS: {otu_its.shape[1]} -> {filt_its.shape[1]} OTUs ({filt_its.shape[0]} samples)")
filt_16s.to_csv('data/16S/otu_table_filtered.tsv', sep='\t')
filt_its.to_csv('data/ITS/otu_table_filtered.tsv', sep='\t')
ITS samples <1000 reads: {'LL_1': 74}
16S: 3392 -> 1793 OTUs
ITS: 893 -> 440 OTUs (20 samples)
Taxonomy Bar Plots¶
In [1]:
def genus_ra(otu_df, tax_df, level='Genus'):
ra = otu_df.div(otu_df.sum(axis=1), axis=0) * 100
mapping = {}
for otu in ra.columns:
if otu in tax_df.index and level in tax_df.columns:
v = tax_df.loc[otu, level]
mapping[otu] = str(v) if pd.notna(v) else 'Unclassified'
else:
mapping[otu] = 'Unclassified'
ra.columns = [mapping.get(c, 'Unclassified') for c in ra.columns]
return ra.T.groupby(ra.columns).sum().T
for marker, otu_df, tax_df in [('16S', filt_16s, tax_16s), ('ITS', filt_its, tax_its)]:
for level in ['Phylum', 'Genus']:
ra_level = genus_ra(otu_df, tax_df, level)
group_means = []
for g in group_order:
samps = [s for s in ra_level.index if meta.loc[s, 'group_label'] == g]
if samps: group_means.append(ra_level.loc[samps].mean())
gra = pd.DataFrame(group_means, index=group_order)
top = gra.sum().nlargest(10).index.tolist()
plot_data = gra[top].copy()
plot_data['Other'] = gra.drop(columns=top, errors='ignore').sum(axis=1)
fig, ax = plt.subplots(figsize=(12, 6))
plot_data.plot(kind='bar', stacked=True, ax=ax, width=0.7)
ax.set_ylabel('Relative Abundance (%)')
ax.set_title(f'{marker} {level}-level Composition')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
plt.tight_layout()
plt.savefig(f'figures/{marker}_{level.lower()}_barplot.png', dpi=150, bbox_inches='tight')
plt.close()
print("Saved bar plots")
Saved bar plots
Rarefaction Curves¶
In [1]:
def rarefaction(otu_df, steps=20, n_draws=10):
max_d = int(otu_df.sum(axis=1).max())
depths = np.linspace(100, max_d, steps).astype(int)
curves = {}
for s in otu_df.index:
row = otu_df.loc[s].values.astype(int)
total = row.sum()
obs_list = []
for d in depths:
if d > total: obs_list.append(np.nan); continue
obs = [len(set(np.random.choice(len(row), d, replace=True, p=row/total))) for _ in range(n_draws)]
obs_list.append(np.mean(obs))
curves[s] = obs_list
return depths, curves
np.random.seed(42)
gc = dict(zip(group_order, ['#4477AA','#EE6677','#CCBB44','#228833','#66CCEE','#AA3377','#BBBBBB']))
for marker, df in [('16S', filt_16s), ('ITS', filt_its)]:
depths, curves = rarefaction(df)
fig, ax = plt.subplots(figsize=(10, 6))
for s, c in curves.items():
ax.plot(depths, c, color=gc[meta.loc[s,'group_label']], alpha=0.7)
for g in group_order: ax.plot([],[], color=gc[g], label=g)
ax.set_xlabel('Depth'); ax.set_ylabel('Observed OTUs')
ax.set_title(f'{marker} Rarefaction'); ax.legend(); ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(f'figures/{marker}_rarefaction.png', dpi=150, bbox_inches='tight')
plt.close()
print(f"16S min depth: {int(filt_16s.sum(axis=1).min())}, rarefaction: 24,000")
print(f"ITS min depth: {int(filt_its.sum(axis=1).min())}, rarefaction: 5,900")
16S min depth: 24287, rarefaction: 24,000 ITS min depth: 5964, rarefaction: 5,900
Rarefaction¶
In [1]:
def rarefy(df, depth, seed=42):
np.random.seed(seed)
out = pd.DataFrame(0, index=df.index, columns=df.columns)
for s in df.index:
c = df.loc[s].values.astype(int)
if c.sum() < depth: continue
pool = np.repeat(np.arange(len(c)), c)
chosen = np.random.choice(pool, depth, replace=False)
u, cnt = np.unique(chosen, return_counts=True)
out.loc[s, df.columns[u]] = cnt
return out
rare_16s = rarefy(filt_16s, 24000)
rare_its = rarefy(filt_its, 5900)
rare_16s.to_csv('data/16S/otu_table_rarefied.tsv', sep='\t')
rare_its.to_csv('data/ITS/otu_table_rarefied.tsv', sep='\t')
print(f"16S rarefied: {rare_16s.shape}"); print(f"ITS rarefied: {rare_its.shape}")
16S rarefied: (21, 1793) ITS rarefied: (20, 440)
Replicate Consistency¶
In [1]:
for marker, df in [('16S', rare_16s), ('ITS', rare_its)]:
ra = df.div(df.sum(axis=1), axis=0)
dm = squareform(pdist(ra.values, metric='braycurtis'))
pd.DataFrame(dm, index=df.index, columns=df.index).to_csv(f'data/{marker}/bray_curtis_distance.tsv', sep='\t')
groups = [meta.loc[s,'group_label'] for s in df.index]
w = [dm[i,j] for i in range(len(groups)) for j in range(i+1,len(groups)) if groups[i]==groups[j]]
b = [dm[i,j] for i in range(len(groups)) for j in range(i+1,len(groups)) if groups[i]!=groups[j]]
print(f"{marker}: within={np.mean(w):.3f}±{np.std(w):.3f}, between={np.mean(b):.3f}±{np.std(b):.3f}, ratio={np.mean(b)/np.mean(w):.2f}x")
Z = linkage(squareform(dm), method='average')
fig, ax = plt.subplots(figsize=(12, 6))
dendrogram(Z, labels=[f"{s} ({meta.loc[s,'group_label']})" for s in df.index], ax=ax, leaf_rotation=90)
ax.set_title(f'{marker} Dendrogram'); ax.set_ylabel('Bray-Curtis')
plt.tight_layout(); plt.savefig(f'figures/{marker}_dendrogram.png', dpi=150); plt.close()
16S: within=0.091±0.066, between=0.646±0.245, ratio=7.11x ITS: within=0.652±0.331, between=0.960±0.109, ratio=1.47x