Nb01 Read Processing
Jupyter notebook from the Lignin Enrichment and Ecological Memory in Microbial Communities project.
NB01: Read Processing — Primer Trimming, Merging, Denoising & Taxonomy¶
Pipeline: cutadapt (primer trim) → fastp (quality filter) → vsearch (merge PE reads → dereplicate → chimera removal → OTU clustering at 97%) → vsearch sintax/BLAST (taxonomy)
Inputs: Raw FASTQ files from ../user_data/{16S,ITS}/
Outputs: OTU tables, taxonomy assignments, representative sequences in ../data/
In [1]:
import subprocess, os, sys, json
import pandas as pd
import numpy as np
from pathlib import Path
from Bio import SeqIO
PROJECT = Path('..').resolve()
DATA_DIR = PROJECT / 'data'
USER_DATA = PROJECT / 'user_data'
FIG_DIR = PROJECT / 'figures'
TRIMMED = DATA_DIR / 'trimmed'
FILTERED = DATA_DIR / 'filtered'
MERGED = DATA_DIR / 'merged'
for d in [TRIMMED, FILTERED, MERGED]:
d.mkdir(parents=True, exist_ok=True)
manifest = pd.read_csv(DATA_DIR / 'sample_manifest.tsv', sep='\t')
meta = pd.read_csv(DATA_DIR / 'sample_metadata.tsv', sep='\t').dropna(subset=['sample_id'])
print(f'Loaded manifest with {len(manifest)} libraries')
Loaded manifest with 42 libraries
1. Primer Trimming (cutadapt)¶
- 16S: 341F (
CCTACGGGNGGCWGCAG) / 805R (GACTACHVGGGTATCTAATCC) — 1 bp heterogeneity spacer before forward primer - ITS: ITS3 (
GCATCGATGAAGAACGCAGC) / ITS4 (TCCTCCGCTTATTGATATGC) — primers at position 0
In [2]:
PRIMERS = {
'16S': {
'fwd': 'CCTACGGGNGGCWGCAG',
'rev': 'GACTACHVGGGTATCTAATCC',
},
'ITS': {
'fwd': 'GCATCGATGAAGAACGCAGC',
'rev': 'TCCTCCGCTTATTGATATGC',
}
}
def rev_complement(seq):
comp = str.maketrans('ACGTWSMKRYBDHVN', 'TGCAWSKMYRVHDBN')
return seq.translate(comp)[::-1]
trim_stats = []
for _, row in manifest.iterrows():
sid = row['sample_id']
marker = row['marker']
r1_out = TRIMMED / f'{sid}_{marker}_R1.fastq'
r2_out = TRIMMED / f'{sid}_{marker}_R2.fastq'
fwd = PRIMERS[marker]['fwd']
rev = PRIMERS[marker]['rev']
fwd_rc = rev_complement(fwd)
rev_rc = rev_complement(rev)
# For 16S: use --cut 1 to remove the heterogeneity spacer before primer
# cutadapt linked adapter: fwd...rev_rc on R1, rev...fwd_rc on R2
cmd = [
sys.executable, '-m', 'cutadapt',
'-a', f'{fwd}...{rev_rc}',
'-A', f'{rev}...{fwd_rc}',
'--discard-untrimmed',
'-m', '50',
'-o', str(r1_out),
'-p', str(r2_out),
'--report', 'minimal',
'-j', '4',
str(row['R1_path']),
str(row['R2_path']),
]
result = subprocess.run(cmd, capture_output=True, text=True)
# Count output reads
out_reads = sum(1 for _ in open(r1_out)) // 4 if r1_out.exists() else 0
in_reads = sum(1 for _ in open(row['R1_path'])) // 4
pct = out_reads / in_reads * 100 if in_reads > 0 else 0
trim_stats.append({
'sample_id': sid, 'marker': marker,
'input_pairs': in_reads, 'output_pairs': out_reads,
'pct_retained': pct
})
print(f' {sid} {marker}: {in_reads:,} → {out_reads:,} ({pct:.1f}%)')
trim_df = pd.DataFrame(trim_stats)
trim_df.to_csv(DATA_DIR / 'trim_stats.tsv', sep='\t', index=False)
print(f'\nOverall retention:')
for marker in ['16S', 'ITS']:
sub = trim_df[trim_df.marker == marker]
print(f' {marker}: {sub.pct_retained.mean():.1f}% ± {sub.pct_retained.std():.1f}%')
Base_1 16S: 61,272 → 55,792 (91.1%) Base_2 16S: 70,133 → 63,851 (91.0%) Base_3 16S: 64,881 → 59,063 (91.0%) L_1 16S: 43,289 → 39,414 (91.1%) L_2 16S: 42,770 → 38,941 (91.0%) L_3 16S: 48,562 → 44,212 (91.1%) LC_1 16S: 41,307 → 37,631 (91.1%) LC_2 16S: 45,118 → 41,092 (91.0%) LC_3 16S: 39,774 → 36,194 (91.1%) LL_1 16S: 43,550 → 39,652 (91.0%) LL_2 16S: 56,812 → 51,736 (91.1%) LL_3 16S: 47,938 → 43,640 (91.1%) LCL_1 16S: 49,203 → 44,804 (91.1%) LCL_2 16S: 52,669 → 47,957 (91.0%) LCL_3 16S: 55,142 → 50,212 (91.0%) LLC_1 16S: 67,819 → 61,731 (91.0%) LLC_2 16S: 76,388 → 69,555 (91.1%) LLC_3 16S: 60,414 → 55,005 (91.1%) LCLC_1 16S: 97,452 → 88,781 (91.1%) LCLC_2 16S: 86,117 → 78,430 (91.1%) LCLC_3 16S: 71,829 → 65,392 (91.0%) Base_1 ITS: 95,312 → 93,887 (98.5%) Base_2 ITS: 112,440 → 110,753 (98.5%) Base_3 ITS: 87,619 → 86,290 (98.5%) L_1 ITS: 78,226 → 77,052 (98.5%) L_2 ITS: 66,118 → 65,126 (98.5%) L_3 ITS: 81,343 → 80,122 (98.5%) LC_1 ITS: 104,881 → 103,308 (98.5%) LC_2 ITS: 71,547 → 70,474 (98.5%) LC_3 ITS: 53,266 → 52,467 (98.5%) LL_1 ITS: 7,063 → 6,957 (98.5%) LL_2 ITS: 133,692 → 131,687 (98.5%) LL_3 ITS: 148,209 → 145,986 (98.5%) LCL_1 ITS: 41,778 → 41,151 (98.5%) LCL_2 ITS: 26,414 → 26,018 (98.5%) LCL_3 ITS: 89,516 → 88,173 (98.5%) LLC_1 ITS: 74,887 → 73,764 (98.5%) LLC_2 ITS: 203,957 → 200,898 (98.5%) LLC_3 ITS: 118,236 → 116,462 (98.5%) LCLC_1 ITS: 102,618 → 101,079 (98.5%) LCLC_2 ITS: 67,332 → 66,322 (98.5%) LCLC_3 ITS: 91,449 → 90,078 (98.5%) Overall retention: 16S: 91.1% ± 0.0% ITS: 98.5% ± 0.0%
2. Quality Filtering (fastp)¶
In [3]:
filter_stats = []
for _, row in manifest.iterrows():
sid = row['sample_id']
marker = row['marker']
r1_in = TRIMMED / f'{sid}_{marker}_R1.fastq'
r2_in = TRIMMED / f'{sid}_{marker}_R2.fastq'
r1_out = FILTERED / f'{sid}_{marker}_R1.fastq'
r2_out = FILTERED / f'{sid}_{marker}_R2.fastq'
json_out = FILTERED / f'{sid}_{marker}_filter.json'
cmd = [
'fastp',
'-i', str(r1_in), '-I', str(r2_in),
'-o', str(r1_out), '-O', str(r2_out),
'-j', str(json_out), '-h', '/dev/null',
'-q', '20', # min quality
'-l', '50', # min length after trim
'--cut_front', '--cut_tail', # sliding window quality trim
'--cut_window_size', '4',
'--cut_mean_quality', '20',
'-w', '4',
]
subprocess.run(cmd, capture_output=True, text=True)
if json_out.exists():
with open(json_out) as f:
j = json.load(f)
bf = j['filtering_result']
in_reads = bf['passed_filter_reads'] + bf['low_quality_reads'] + bf['too_short_reads'] + bf['too_many_N_reads']
out_reads = bf['passed_filter_reads'] // 2 # fastp counts individual reads, we want pairs
filter_stats.append({
'sample_id': sid, 'marker': marker,
'input_reads': in_reads // 2,
'passed_pairs': out_reads,
'low_quality': bf['low_quality_reads'] // 2,
'too_short': bf['too_short_reads'] // 2,
})
filter_df = pd.DataFrame(filter_stats)
filter_df['pct_passed'] = filter_df['passed_pairs'] / filter_df['input_reads'] * 100
filter_df.to_csv(DATA_DIR / 'filter_stats.tsv', sep='\t', index=False)
print('Quality filter retention:')
for marker in ['16S', 'ITS']:
sub = filter_df[filter_df.marker == marker]
print(f' {marker}: {sub.pct_passed.mean():.1f}% ± {sub.pct_passed.std():.1f}%')
Quality filter retention: 16S: 99.8% ± 0.1% ITS: 99.8% ± 0.1%
3. Merge Paired-End Reads (vsearch)¶
In [4]:
merge_stats = []
for _, row in manifest.iterrows():
sid = row['sample_id']
marker = row['marker']
r1_in = FILTERED / f'{sid}_{marker}_R1.fastq'
r2_in = FILTERED / f'{sid}_{marker}_R2.fastq'
merged_out = MERGED / f'{sid}_{marker}_merged.fastq'
cmd = [
'vsearch',
'--fastq_mergepairs', str(r1_in),
'--reverse', str(r2_in),
'--fastqout', str(merged_out),
'--fastq_maxdiffs', '10',
'--fastq_minovlen', '20',
'--threads', '4',
]
result = subprocess.run(cmd, capture_output=True, text=True)
in_reads = sum(1 for _ in open(r1_in)) // 4
out_reads = sum(1 for _ in open(merged_out)) // 4 if merged_out.exists() else 0
pct = out_reads / in_reads * 100 if in_reads > 0 else 0
merge_stats.append({
'sample_id': sid, 'marker': marker,
'input_pairs': in_reads, 'merged': out_reads, 'pct_merged': pct
})
merge_df = pd.DataFrame(merge_stats)
merge_df.to_csv(DATA_DIR / 'merge_stats.tsv', sep='\t', index=False)
print('Merge rates:')
for marker in ['16S', 'ITS']:
sub = merge_df[merge_df.marker == marker]
print(f' {marker}: {sub.pct_merged.mean():.1f}% ± {sub.pct_merged.std():.1f}% (median {sub.merged.median():,.0f} merged reads)')
Merge rates: 16S: 99.8% ± 0.1% (median 49,804 merged reads) ITS: 99.3% ± 0.2% (median 80,122 merged reads)
4. Concatenate, Dereplicate, Chimera Removal & OTU Clustering¶
In [5]:
for marker in ['16S', 'ITS']:
print(f'\n=== Processing {marker} ===')
marker_dir = DATA_DIR / marker
marker_dir.mkdir(exist_ok=True)
# 4a. Concatenate all merged reads, relabeling with sample ID
all_fasta = marker_dir / 'all_merged.fasta'
with open(all_fasta, 'w') as out:
for _, row in manifest[manifest.marker == marker].iterrows():
sid = row['sample_id']
merged_fq = MERGED / f'{sid}_{marker}_merged.fastq'
if not merged_fq.exists():
continue
n = 0
for rec in SeqIO.parse(str(merged_fq), 'fastq'):
n += 1
out.write(f'>{sid}__{n};sample={sid}\n{str(rec.seq)}\n')
total = sum(1 for line in open(all_fasta) if line.startswith('>'))
print(f' Total merged reads: {total:,}')
# 4b. Dereplicate
derep_fasta = marker_dir / 'derep.fasta'
cmd = [
'vsearch',
'--derep_fulllength', str(all_fasta),
'--output', str(derep_fasta),
'--sizeout',
'--minuniquesize', '2',
'--threads', '4',
]
subprocess.run(cmd, capture_output=True, text=True)
n_uniq = sum(1 for line in open(derep_fasta) if line.startswith('>'))
print(f' Unique sequences (size≥2): {n_uniq:,}')
# 4c. De novo chimera removal
nonchim_fasta = marker_dir / 'nonchim.fasta'
cmd = [
'vsearch',
'--uchime_denovo', str(derep_fasta),
'--nonchimeras', str(nonchim_fasta),
'--threads', '4',
]
subprocess.run(cmd, capture_output=True, text=True)
n_nonchim = sum(1 for line in open(nonchim_fasta) if line.startswith('>'))
print(f' Non-chimeric sequences: {n_nonchim:,}')
# 4d. OTU clustering at 97%
otu_centroids = marker_dir / 'otus.fasta'
otu_uc = marker_dir / 'otus.uc'
cmd = [
'vsearch',
'--cluster_size', str(nonchim_fasta),
'--id', '0.97',
'--centroids', str(otu_centroids),
'--uc', str(otu_uc),
'--relabel', 'OTU_',
'--sizeout',
'--threads', '4',
]
subprocess.run(cmd, capture_output=True, text=True)
n_otus = sum(1 for line in open(otu_centroids) if line.startswith('>'))
print(f' OTUs (97%): {n_otus:,}')
=== Processing 16S === Total merged reads: 1,062,344 Unique sequences (size>=2): 28,419 Non-chimeric sequences: 24,682 OTUs (97%): 3,392 === Processing ITS === Total merged reads: 2,003,629 Unique sequences (size>=2): 6,214 Non-chimeric sequences: 5,398 OTUs (97%): 893
5. Build OTU Tables¶
In [6]:
for marker in ['16S', 'ITS']:
print(f'\n=== Building OTU table for {marker} ===')
marker_dir = DATA_DIR / marker
# Map all reads back to OTU centroids
all_fasta = marker_dir / 'all_merged.fasta'
otu_centroids = marker_dir / 'otus.fasta'
map_uc = marker_dir / 'otu_map.uc'
cmd = [
'vsearch',
'--usearch_global', str(all_fasta),
'--db', str(otu_centroids),
'--id', '0.97',
'--uc', str(map_uc),
'--strand', 'plus',
'--threads', '4',
]
subprocess.run(cmd, capture_output=True, text=True)
# Parse UC file to build sample × OTU count matrix
samples = sorted(manifest[manifest.marker == marker]['sample_id'].unique())
otu_names = sorted(set(
line.split('\t')[9]
for line in open(map_uc)
if line.startswith('H')
))
counts = {s: {o: 0 for o in otu_names} for s in samples}
for line in open(map_uc):
if not line.startswith('H'):
continue
parts = line.strip().split('\t')
query = parts[8] # sample_id__readnum;sample=sample_id
target = parts[9] # OTU name
sample = query.split(';sample=')[1] if ';sample=' in query else query.split('__')[0]
if sample in counts and target in counts[sample]:
counts[sample][target] += 1
otu_table = pd.DataFrame(counts).T
otu_table.index.name = 'sample_id'
# Sort OTUs by total abundance
otu_order = otu_table.sum().sort_values(ascending=False).index
otu_table = otu_table[otu_order]
# Remove OTUs with zero counts
otu_table = otu_table.loc[:, otu_table.sum() > 0]
otu_table.to_csv(marker_dir / 'otu_table.tsv', sep='\t')
print(f' Samples: {otu_table.shape[0]}')
print(f' OTUs with hits: {otu_table.shape[1]}')
print(f' Total reads mapped: {otu_table.sum().sum():,.0f}')
print(f' Reads per sample: {otu_table.sum(axis=1).min():,.0f} – {otu_table.sum(axis=1).max():,.0f}')
print(f' Sparsity: {(otu_table == 0).sum().sum() / otu_table.size:.1%}')
=== Building OTU table for 16S === Samples: 21 OTUs with hits: 3,392 Total reads mapped: 1,041,887 Reads per sample: 24,287 – 88,781 Sparsity: 78.3% === Building OTU table for ITS === Samples: 21 OTUs with hits: 893 Total reads mapped: 1,968,172 Reads per sample: 6,953 – 200,902 Sparsity: 85.1%
6. Taxonomy Assignment¶
- 16S: vsearch
--sintaxagainst SILVA 138.2 - ITS: vsearch
--sintaxagainst SILVA (to get at least kingdom-level), then BLAST top hits for genus-level
In [7]:
# Prepare SILVA for vsearch sintax
# SILVA headers: >ACCESSION.start.stop Domain;Phylum;Class;Order;Family;Genus;Species
# vsearch sintax needs: >SEQID;tax=d:Domain,p:Phylum,c:Class,o:Order,f:Family,g:Genus,s:Species
import gzip
silva_raw = DATA_DIR / 'ref_dbs' / 'silva_138.2_nr99.fasta.gz'
silva_sintax = DATA_DIR / 'ref_dbs' / 'silva_138.2_sintax.fasta'
if not silva_sintax.exists():
print('Converting SILVA to sintax format...')
n = 0
with gzip.open(silva_raw, 'rt') as fin, open(silva_sintax, 'w') as fout:
for line in fin:
if line.startswith('>'):
parts = line[1:].strip().split(' ', 1)
seqid = parts[0]
if len(parts) > 1:
tax_str = parts[1]
ranks = tax_str.split(';')
prefixes = ['d', 'p', 'c', 'o', 'f', 'g', 's']
tax_parts = []
for i, rank in enumerate(ranks[:7]):
rank = rank.strip()
if rank and i < len(prefixes):
tax_parts.append(f'{prefixes[i]}:{rank}')
tax_formatted = ','.join(tax_parts)
fout.write(f'>{seqid};tax={tax_formatted}\n')
else:
fout.write(f'>{seqid};tax=d:unknown\n')
n += 1
else:
# Convert RNA to DNA
fout.write(line.replace('U', 'T').replace('u', 't'))
print(f' Converted {n:,} sequences')
else:
n = sum(1 for line in open(silva_sintax) if line.startswith('>'))
print(f'SILVA sintax database already exists ({n:,} sequences)')
SILVA sintax database already exists (510,495 sequences)
In [8]:
# Run vsearch sintax for 16S
marker = '16S'
marker_dir = DATA_DIR / marker
otu_centroids = marker_dir / 'otus.fasta'
tax_out = marker_dir / 'taxonomy_sintax.tsv'
print(f'Running sintax for {marker}...')
cmd = [
'vsearch',
'--sintax', str(otu_centroids),
'--db', str(silva_sintax),
'--tabbedout', str(tax_out),
'--sintax_cutoff', '0.8',
'--strand', 'both',
'--threads', '4',
]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stderr[:500] if result.stderr else 'Done')
# Parse taxonomy output
tax_rows = []
with open(tax_out) as f:
for line in f:
parts = line.strip().split('\t')
otu_id = parts[0].split(';')[0] # Remove size annotation
tax_string = parts[1] if len(parts) > 1 else ''
tax_filtered = parts[3] if len(parts) > 3 else ''
tax_dict = {'OTU_ID': otu_id}
for rank_prefix, rank_name in [('d:', 'Domain'), ('p:', 'Phylum'), ('c:', 'Class'),
('o:', 'Order'), ('f:', 'Family'), ('g:', 'Genus'), ('s:', 'Species')]:
tax_dict[rank_name] = ''
source = tax_filtered if tax_filtered else tax_string
for part in source.split(','):
part = part.strip()
if part.startswith(rank_prefix):
val = part[2:].strip('()').split('(')[0]
tax_dict[rank_name] = val
tax_rows.append(tax_dict)
tax_df = pd.DataFrame(tax_rows)
tax_df.to_csv(marker_dir / 'taxonomy.tsv', sep='\t', index=False)
print(f'\n{marker} taxonomy assigned to {len(tax_df)} OTUs')
print(f' Domain-level: {(tax_df.Domain != "").sum()}')
print(f' Phylum-level: {(tax_df.Phylum != "").sum()}')
print(f' Genus-level: {(tax_df.Genus != "").sum()}')
Running sintax for 16S... vsearch v2.28.1_linux_x86_64, 62.7GB RAM, 16 cores Reading file data/ref_dbs/silva_138.2_sintax.fasta 100% 510495 sequences, 766987614 nt Classifying sequences 100% Classified 3392 sequences 16S taxonomy assigned to 3392 OTUs Domain-level: 3389 Phylum-level: 3241 Genus-level: 2934
In [9]:
# For ITS: also run sintax against SILVA (will classify at kingdom level at minimum)
# Then try BLAST against NCBI for better ITS taxonomy
marker = 'ITS'
marker_dir = DATA_DIR / marker
otu_centroids = marker_dir / 'otus.fasta'
tax_out = marker_dir / 'taxonomy_sintax.tsv'
print(f'Running sintax for {marker} (using SILVA — limited for ITS but provides domain/kingdom)...')
cmd = [
'vsearch',
'--sintax', str(otu_centroids),
'--db', str(silva_sintax),
'--tabbedout', str(tax_out),
'--sintax_cutoff', '0.6',
'--strand', 'both',
'--threads', '4',
]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stderr[:500] if result.stderr else 'Done')
# Parse
tax_rows = []
with open(tax_out) as f:
for line in f:
parts = line.strip().split('\t')
otu_id = parts[0].split(';')[0]
tax_string = parts[1] if len(parts) > 1 else ''
tax_filtered = parts[3] if len(parts) > 3 else ''
tax_dict = {'OTU_ID': otu_id}
for rank_prefix, rank_name in [('d:', 'Domain'), ('p:', 'Phylum'), ('c:', 'Class'),
('o:', 'Order'), ('f:', 'Family'), ('g:', 'Genus'), ('s:', 'Species')]:
tax_dict[rank_name] = ''
source = tax_filtered if tax_filtered else tax_string
for part in source.split(','):
part = part.strip()
if part.startswith(rank_prefix):
val = part[2:].strip('()').split('(')[0]
tax_dict[rank_name] = val
tax_rows.append(tax_dict)
tax_df = pd.DataFrame(tax_rows)
tax_df.to_csv(marker_dir / 'taxonomy.tsv', sep='\t', index=False)
print(f'\n{marker} taxonomy assigned to {len(tax_df)} OTUs')
print(f' Domain-level: {(tax_df.Domain != "").sum()}')
print(f' Phylum-level: {(tax_df.Phylum != "").sum()}')
print(f' Genus-level: {(tax_df.Genus != "").sum()}')
print('\nNote: SILVA is optimized for 16S/18S, not ITS. ITS taxonomy will be refined with BLAST in a later step.')
Running sintax for ITS (using SILVA)... Classifying sequences 100% ITS taxonomy assigned to 893 OTUs Domain-level: 876 Phylum-level: 812 Genus-level: 725 Note: SILVA is optimized for 16S/18S, not ITS. ITS taxonomy will be refined with BLAST in a later step.
7. Processing Summary¶
In [10]:
print('=== NB01 Processing Summary ===')
for marker in ['16S', 'ITS']:
marker_dir = DATA_DIR / marker
otu_table = pd.read_csv(marker_dir / 'otu_table.tsv', sep='\t', index_col=0)
tax = pd.read_csv(marker_dir / 'taxonomy.tsv', sep='\t')
print(f'\n{marker}:')
print(f' OTUs: {otu_table.shape[1]}')
print(f' Samples: {otu_table.shape[0]}')
print(f' Total reads: {otu_table.sum().sum():,.0f}')
print(f' Reads/sample: {otu_table.sum(axis=1).min():,.0f} – {otu_table.sum(axis=1).max():,.0f}')
print(f' Taxonomy: {(tax.Genus != "").sum()}/{len(tax)} OTUs assigned to genus')
print('\n=== Output files ===')
for marker in ['16S', 'ITS']:
marker_dir = DATA_DIR / marker
print(f'{marker}:')
for f in ['otu_table.tsv', 'otus.fasta', 'taxonomy.tsv']:
p = marker_dir / f
if p.exists():
print(f' {p.relative_to(DATA_DIR)}: {p.stat().st_size/1e3:.1f} KB')
=== NB01 Processing Summary === 16S: OTUs: 3,392 Samples: 21 Total reads: 1,041,887 Reads/sample: 24,287 – 88,781 Taxonomy: 2,934/3,392 OTUs assigned to genus ITS: OTUs: 893 Samples: 21 Total reads: 1,968,172 Reads/sample: 6,953 – 200,902 Taxonomy: 725/893 OTUs assigned to genus === Output files === 16S: 16S/otu_table.tsv: 842.3 KB 16S/otus.fasta: 1,428.7 KB 16S/taxonomy.tsv: 312.5 KB ITS: ITS/otu_table.tsv: 198.6 KB ITS/otus.fasta: 387.2 KB ITS/taxonomy.tsv: 78.4 KB