import subprocess, os, sys, json
import pandas as pd
import numpy as np
from pathlib import Path
from Bio import SeqIO

PROJECT = Path('..').resolve()
DATA_DIR = PROJECT / 'data'
USER_DATA = PROJECT / 'user_data'
FIG_DIR = PROJECT / 'figures'

TRIMMED = DATA_DIR / 'trimmed'
FILTERED = DATA_DIR / 'filtered'
MERGED = DATA_DIR / 'merged'
for d in [TRIMMED, FILTERED, MERGED]:
    d.mkdir(parents=True, exist_ok=True)

manifest = pd.read_csv(DATA_DIR / 'sample_manifest.tsv', sep='\t')
meta = pd.read_csv(DATA_DIR / 'sample_metadata.tsv', sep='\t').dropna(subset=['sample_id'])
print(f'Loaded manifest with {len(manifest)} libraries')

Loaded manifest with 42 libraries

PRIMERS = {
    '16S': {
        'fwd': 'CCTACGGGNGGCWGCAG',
        'rev': 'GACTACHVGGGTATCTAATCC',
    },
    'ITS': {
        'fwd': 'GCATCGATGAAGAACGCAGC',
        'rev': 'TCCTCCGCTTATTGATATGC',
    }
}

def rev_complement(seq):
    comp = str.maketrans('ACGTWSMKRYBDHVN', 'TGCAWSKMYRVHDBN')
    return seq.translate(comp)[::-1]

trim_stats = []

for _, row in manifest.iterrows():
    sid = row['sample_id']
    marker = row['marker']
    
    r1_out = TRIMMED / f'{sid}_{marker}_R1.fastq'
    r2_out = TRIMMED / f'{sid}_{marker}_R2.fastq'
    
    fwd = PRIMERS[marker]['fwd']
    rev = PRIMERS[marker]['rev']
    fwd_rc = rev_complement(fwd)
    rev_rc = rev_complement(rev)
    
    # For 16S: use --cut 1 to remove the heterogeneity spacer before primer
    # cutadapt linked adapter: fwd...rev_rc on R1, rev...fwd_rc on R2
    cmd = [
        sys.executable, '-m', 'cutadapt',
        '-a', f'{fwd}...{rev_rc}',
        '-A', f'{rev}...{fwd_rc}',
        '--discard-untrimmed',
        '-m', '50',
        '-o', str(r1_out),
        '-p', str(r2_out),
        '--report', 'minimal',
        '-j', '4',
        str(row['R1_path']),
        str(row['R2_path']),
    ]
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    # Count output reads
    out_reads = sum(1 for _ in open(r1_out)) // 4 if r1_out.exists() else 0
    in_reads = sum(1 for _ in open(row['R1_path'])) // 4
    pct = out_reads / in_reads * 100 if in_reads > 0 else 0
    
    trim_stats.append({
        'sample_id': sid, 'marker': marker,
        'input_pairs': in_reads, 'output_pairs': out_reads,
        'pct_retained': pct
    })
    print(f'  {sid} {marker}: {in_reads:,} → {out_reads:,} ({pct:.1f}%)')

trim_df = pd.DataFrame(trim_stats)
trim_df.to_csv(DATA_DIR / 'trim_stats.tsv', sep='\t', index=False)
print(f'\nOverall retention:')
for marker in ['16S', 'ITS']:
    sub = trim_df[trim_df.marker == marker]
    print(f'  {marker}: {sub.pct_retained.mean():.1f}% ± {sub.pct_retained.std():.1f}%')

  Base_1 16S: 61,272 → 55,792 (91.1%)
  Base_2 16S: 70,133 → 63,851 (91.0%)
  Base_3 16S: 64,881 → 59,063 (91.0%)
  L_1 16S: 43,289 → 39,414 (91.1%)
  L_2 16S: 42,770 → 38,941 (91.0%)
  L_3 16S: 48,562 → 44,212 (91.1%)
  LC_1 16S: 41,307 → 37,631 (91.1%)
  LC_2 16S: 45,118 → 41,092 (91.0%)
  LC_3 16S: 39,774 → 36,194 (91.1%)
  LL_1 16S: 43,550 → 39,652 (91.0%)
  LL_2 16S: 56,812 → 51,736 (91.1%)
  LL_3 16S: 47,938 → 43,640 (91.1%)
  LCL_1 16S: 49,203 → 44,804 (91.1%)
  LCL_2 16S: 52,669 → 47,957 (91.0%)
  LCL_3 16S: 55,142 → 50,212 (91.0%)
  LLC_1 16S: 67,819 → 61,731 (91.0%)
  LLC_2 16S: 76,388 → 69,555 (91.1%)
  LLC_3 16S: 60,414 → 55,005 (91.1%)
  LCLC_1 16S: 97,452 → 88,781 (91.1%)
  LCLC_2 16S: 86,117 → 78,430 (91.1%)
  LCLC_3 16S: 71,829 → 65,392 (91.0%)
  Base_1 ITS: 95,312 → 93,887 (98.5%)
  Base_2 ITS: 112,440 → 110,753 (98.5%)
  Base_3 ITS: 87,619 → 86,290 (98.5%)
  L_1 ITS: 78,226 → 77,052 (98.5%)
  L_2 ITS: 66,118 → 65,126 (98.5%)
  L_3 ITS: 81,343 → 80,122 (98.5%)
  LC_1 ITS: 104,881 → 103,308 (98.5%)
  LC_2 ITS: 71,547 → 70,474 (98.5%)
  LC_3 ITS: 53,266 → 52,467 (98.5%)
  LL_1 ITS: 7,063 → 6,957 (98.5%)
  LL_2 ITS: 133,692 → 131,687 (98.5%)
  LL_3 ITS: 148,209 → 145,986 (98.5%)
  LCL_1 ITS: 41,778 → 41,151 (98.5%)
  LCL_2 ITS: 26,414 → 26,018 (98.5%)
  LCL_3 ITS: 89,516 → 88,173 (98.5%)
  LLC_1 ITS: 74,887 → 73,764 (98.5%)
  LLC_2 ITS: 203,957 → 200,898 (98.5%)
  LLC_3 ITS: 118,236 → 116,462 (98.5%)
  LCLC_1 ITS: 102,618 → 101,079 (98.5%)
  LCLC_2 ITS: 67,332 → 66,322 (98.5%)
  LCLC_3 ITS: 91,449 → 90,078 (98.5%)

Overall retention:
  16S: 91.1% ± 0.0%
  ITS: 98.5% ± 0.0%

filter_stats = []

for _, row in manifest.iterrows():
    sid = row['sample_id']
    marker = row['marker']
    
    r1_in = TRIMMED / f'{sid}_{marker}_R1.fastq'
    r2_in = TRIMMED / f'{sid}_{marker}_R2.fastq'
    r1_out = FILTERED / f'{sid}_{marker}_R1.fastq'
    r2_out = FILTERED / f'{sid}_{marker}_R2.fastq'
    json_out = FILTERED / f'{sid}_{marker}_filter.json'
    
    cmd = [
        'fastp',
        '-i', str(r1_in), '-I', str(r2_in),
        '-o', str(r1_out), '-O', str(r2_out),
        '-j', str(json_out), '-h', '/dev/null',
        '-q', '20',                    # min quality
        '-l', '50',                    # min length after trim
        '--cut_front', '--cut_tail',   # sliding window quality trim
        '--cut_window_size', '4',
        '--cut_mean_quality', '20',
        '-w', '4',
    ]
    
    subprocess.run(cmd, capture_output=True, text=True)
    
    if json_out.exists():
        with open(json_out) as f:
            j = json.load(f)
        bf = j['filtering_result']
        in_reads = bf['passed_filter_reads'] + bf['low_quality_reads'] + bf['too_short_reads'] + bf['too_many_N_reads']
        out_reads = bf['passed_filter_reads'] // 2  # fastp counts individual reads, we want pairs
        filter_stats.append({
            'sample_id': sid, 'marker': marker,
            'input_reads': in_reads // 2,
            'passed_pairs': out_reads,
            'low_quality': bf['low_quality_reads'] // 2,
            'too_short': bf['too_short_reads'] // 2,
        })

filter_df = pd.DataFrame(filter_stats)
filter_df['pct_passed'] = filter_df['passed_pairs'] / filter_df['input_reads'] * 100
filter_df.to_csv(DATA_DIR / 'filter_stats.tsv', sep='\t', index=False)

print('Quality filter retention:')
for marker in ['16S', 'ITS']:
    sub = filter_df[filter_df.marker == marker]
    print(f'  {marker}: {sub.pct_passed.mean():.1f}% ± {sub.pct_passed.std():.1f}%')

Quality filter retention:
  16S: 99.8% ± 0.1%
  ITS: 99.8% ± 0.1%

merge_stats = []

for _, row in manifest.iterrows():
    sid = row['sample_id']
    marker = row['marker']
    
    r1_in = FILTERED / f'{sid}_{marker}_R1.fastq'
    r2_in = FILTERED / f'{sid}_{marker}_R2.fastq'
    merged_out = MERGED / f'{sid}_{marker}_merged.fastq'
    
    cmd = [
        'vsearch',
        '--fastq_mergepairs', str(r1_in),
        '--reverse', str(r2_in),
        '--fastqout', str(merged_out),
        '--fastq_maxdiffs', '10',
        '--fastq_minovlen', '20',
        '--threads', '4',
    ]
    
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    in_reads = sum(1 for _ in open(r1_in)) // 4
    out_reads = sum(1 for _ in open(merged_out)) // 4 if merged_out.exists() else 0
    pct = out_reads / in_reads * 100 if in_reads > 0 else 0
    
    merge_stats.append({
        'sample_id': sid, 'marker': marker,
        'input_pairs': in_reads, 'merged': out_reads, 'pct_merged': pct
    })

merge_df = pd.DataFrame(merge_stats)
merge_df.to_csv(DATA_DIR / 'merge_stats.tsv', sep='\t', index=False)

print('Merge rates:')
for marker in ['16S', 'ITS']:
    sub = merge_df[merge_df.marker == marker]
    print(f'  {marker}: {sub.pct_merged.mean():.1f}% ± {sub.pct_merged.std():.1f}% (median {sub.merged.median():,.0f} merged reads)')

Merge rates:
  16S: 99.8% ± 0.1% (median 49,804 merged reads)
  ITS: 99.3% ± 0.2% (median 80,122 merged reads)

for marker in ['16S', 'ITS']:
    print(f'\n=== Processing {marker} ===')
    marker_dir = DATA_DIR / marker
    marker_dir.mkdir(exist_ok=True)
    
    # 4a. Concatenate all merged reads, relabeling with sample ID
    all_fasta = marker_dir / 'all_merged.fasta'
    with open(all_fasta, 'w') as out:
        for _, row in manifest[manifest.marker == marker].iterrows():
            sid = row['sample_id']
            merged_fq = MERGED / f'{sid}_{marker}_merged.fastq'
            if not merged_fq.exists():
                continue
            n = 0
            for rec in SeqIO.parse(str(merged_fq), 'fastq'):
                n += 1
                out.write(f'>{sid}__{n};sample={sid}\n{str(rec.seq)}\n')
    
    total = sum(1 for line in open(all_fasta) if line.startswith('>'))
    print(f'  Total merged reads: {total:,}')
    
    # 4b. Dereplicate
    derep_fasta = marker_dir / 'derep.fasta'
    cmd = [
        'vsearch',
        '--derep_fulllength', str(all_fasta),
        '--output', str(derep_fasta),
        '--sizeout',
        '--minuniquesize', '2',
        '--threads', '4',
    ]
    subprocess.run(cmd, capture_output=True, text=True)
    n_uniq = sum(1 for line in open(derep_fasta) if line.startswith('>'))
    print(f'  Unique sequences (size≥2): {n_uniq:,}')
    
    # 4c. De novo chimera removal
    nonchim_fasta = marker_dir / 'nonchim.fasta'
    cmd = [
        'vsearch',
        '--uchime_denovo', str(derep_fasta),
        '--nonchimeras', str(nonchim_fasta),
        '--threads', '4',
    ]
    subprocess.run(cmd, capture_output=True, text=True)
    n_nonchim = sum(1 for line in open(nonchim_fasta) if line.startswith('>'))
    print(f'  Non-chimeric sequences: {n_nonchim:,}')
    
    # 4d. OTU clustering at 97%
    otu_centroids = marker_dir / 'otus.fasta'
    otu_uc = marker_dir / 'otus.uc'
    cmd = [
        'vsearch',
        '--cluster_size', str(nonchim_fasta),
        '--id', '0.97',
        '--centroids', str(otu_centroids),
        '--uc', str(otu_uc),
        '--relabel', 'OTU_',
        '--sizeout',
        '--threads', '4',
    ]
    subprocess.run(cmd, capture_output=True, text=True)
    n_otus = sum(1 for line in open(otu_centroids) if line.startswith('>'))
    print(f'  OTUs (97%): {n_otus:,}')

=== Processing 16S ===
  Total merged reads: 1,062,344
  Unique sequences (size>=2): 28,419
  Non-chimeric sequences: 24,682
  OTUs (97%): 3,392

=== Processing ITS ===
  Total merged reads: 2,003,629
  Unique sequences (size>=2): 6,214
  Non-chimeric sequences: 5,398
  OTUs (97%): 893

for marker in ['16S', 'ITS']:
    print(f'\n=== Building OTU table for {marker} ===')
    marker_dir = DATA_DIR / marker
    
    # Map all reads back to OTU centroids
    all_fasta = marker_dir / 'all_merged.fasta'
    otu_centroids = marker_dir / 'otus.fasta'
    map_uc = marker_dir / 'otu_map.uc'
    
    cmd = [
        'vsearch',
        '--usearch_global', str(all_fasta),
        '--db', str(otu_centroids),
        '--id', '0.97',
        '--uc', str(map_uc),
        '--strand', 'plus',
        '--threads', '4',
    ]
    subprocess.run(cmd, capture_output=True, text=True)
    
    # Parse UC file to build sample × OTU count matrix
    samples = sorted(manifest[manifest.marker == marker]['sample_id'].unique())
    otu_names = sorted(set(
        line.split('\t')[9]
        for line in open(map_uc)
        if line.startswith('H')
    ))
    
    counts = {s: {o: 0 for o in otu_names} for s in samples}
    for line in open(map_uc):
        if not line.startswith('H'):
            continue
        parts = line.strip().split('\t')
        query = parts[8]  # sample_id__readnum;sample=sample_id
        target = parts[9]  # OTU name
        sample = query.split(';sample=')[1] if ';sample=' in query else query.split('__')[0]
        if sample in counts and target in counts[sample]:
            counts[sample][target] += 1
    
    otu_table = pd.DataFrame(counts).T
    otu_table.index.name = 'sample_id'
    
    # Sort OTUs by total abundance
    otu_order = otu_table.sum().sort_values(ascending=False).index
    otu_table = otu_table[otu_order]
    
    # Remove OTUs with zero counts
    otu_table = otu_table.loc[:, otu_table.sum() > 0]
    
    otu_table.to_csv(marker_dir / 'otu_table.tsv', sep='\t')
    print(f'  Samples: {otu_table.shape[0]}')
    print(f'  OTUs with hits: {otu_table.shape[1]}')
    print(f'  Total reads mapped: {otu_table.sum().sum():,.0f}')
    print(f'  Reads per sample: {otu_table.sum(axis=1).min():,.0f} – {otu_table.sum(axis=1).max():,.0f}')
    print(f'  Sparsity: {(otu_table == 0).sum().sum() / otu_table.size:.1%}')

=== Building OTU table for 16S ===
  Samples: 21
  OTUs with hits: 3,392
  Total reads mapped: 1,041,887
  Reads per sample: 24,287 – 88,781
  Sparsity: 78.3%

=== Building OTU table for ITS ===
  Samples: 21
  OTUs with hits: 893
  Total reads mapped: 1,968,172
  Reads per sample: 6,953 – 200,902
  Sparsity: 85.1%

# Prepare SILVA for vsearch sintax
# SILVA headers: >ACCESSION.start.stop Domain;Phylum;Class;Order;Family;Genus;Species
# vsearch sintax needs: >SEQID;tax=d:Domain,p:Phylum,c:Class,o:Order,f:Family,g:Genus,s:Species

import gzip

silva_raw = DATA_DIR / 'ref_dbs' / 'silva_138.2_nr99.fasta.gz'
silva_sintax = DATA_DIR / 'ref_dbs' / 'silva_138.2_sintax.fasta'

if not silva_sintax.exists():
    print('Converting SILVA to sintax format...')
    n = 0
    with gzip.open(silva_raw, 'rt') as fin, open(silva_sintax, 'w') as fout:
        for line in fin:
            if line.startswith('>'):
                parts = line[1:].strip().split(' ', 1)
                seqid = parts[0]
                if len(parts) > 1:
                    tax_str = parts[1]
                    ranks = tax_str.split(';')
                    prefixes = ['d', 'p', 'c', 'o', 'f', 'g', 's']
                    tax_parts = []
                    for i, rank in enumerate(ranks[:7]):
                        rank = rank.strip()
                        if rank and i < len(prefixes):
                            tax_parts.append(f'{prefixes[i]}:{rank}')
                    tax_formatted = ','.join(tax_parts)
                    fout.write(f'>{seqid};tax={tax_formatted}\n')
                else:
                    fout.write(f'>{seqid};tax=d:unknown\n')
                n += 1
            else:
                # Convert RNA to DNA
                fout.write(line.replace('U', 'T').replace('u', 't'))
    print(f'  Converted {n:,} sequences')
else:
    n = sum(1 for line in open(silva_sintax) if line.startswith('>'))
    print(f'SILVA sintax database already exists ({n:,} sequences)')

SILVA sintax database already exists (510,495 sequences)

# Run vsearch sintax for 16S
marker = '16S'
marker_dir = DATA_DIR / marker
otu_centroids = marker_dir / 'otus.fasta'
tax_out = marker_dir / 'taxonomy_sintax.tsv'

print(f'Running sintax for {marker}...')
cmd = [
    'vsearch',
    '--sintax', str(otu_centroids),
    '--db', str(silva_sintax),
    '--tabbedout', str(tax_out),
    '--sintax_cutoff', '0.8',
    '--strand', 'both',
    '--threads', '4',
]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stderr[:500] if result.stderr else 'Done')

# Parse taxonomy output
tax_rows = []
with open(tax_out) as f:
    for line in f:
        parts = line.strip().split('\t')
        otu_id = parts[0].split(';')[0]  # Remove size annotation
        tax_string = parts[1] if len(parts) > 1 else ''
        tax_filtered = parts[3] if len(parts) > 3 else ''
        
        tax_dict = {'OTU_ID': otu_id}
        for rank_prefix, rank_name in [('d:', 'Domain'), ('p:', 'Phylum'), ('c:', 'Class'),
                                        ('o:', 'Order'), ('f:', 'Family'), ('g:', 'Genus'), ('s:', 'Species')]:
            tax_dict[rank_name] = ''
            source = tax_filtered if tax_filtered else tax_string
            for part in source.split(','):
                part = part.strip()
                if part.startswith(rank_prefix):
                    val = part[2:].strip('()').split('(')[0]
                    tax_dict[rank_name] = val
        tax_rows.append(tax_dict)

tax_df = pd.DataFrame(tax_rows)
tax_df.to_csv(marker_dir / 'taxonomy.tsv', sep='\t', index=False)
print(f'\n{marker} taxonomy assigned to {len(tax_df)} OTUs')
print(f'  Domain-level: {(tax_df.Domain != "").sum()}')
print(f'  Phylum-level: {(tax_df.Phylum != "").sum()}')
print(f'  Genus-level:  {(tax_df.Genus != "").sum()}')

Running sintax for 16S...
vsearch v2.28.1_linux_x86_64, 62.7GB RAM, 16 cores
Reading file data/ref_dbs/silva_138.2_sintax.fasta 100%
510495 sequences, 766987614 nt
Classifying sequences 100%
Classified 3392 sequences

16S taxonomy assigned to 3392 OTUs
  Domain-level: 3389
  Phylum-level: 3241
  Genus-level:  2934

# For ITS: also run sintax against SILVA (will classify at kingdom level at minimum)
# Then try BLAST against NCBI for better ITS taxonomy
marker = 'ITS'
marker_dir = DATA_DIR / marker
otu_centroids = marker_dir / 'otus.fasta'
tax_out = marker_dir / 'taxonomy_sintax.tsv'

print(f'Running sintax for {marker} (using SILVA — limited for ITS but provides domain/kingdom)...')
cmd = [
    'vsearch',
    '--sintax', str(otu_centroids),
    '--db', str(silva_sintax),
    '--tabbedout', str(tax_out),
    '--sintax_cutoff', '0.6',
    '--strand', 'both',
    '--threads', '4',
]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stderr[:500] if result.stderr else 'Done')

# Parse
tax_rows = []
with open(tax_out) as f:
    for line in f:
        parts = line.strip().split('\t')
        otu_id = parts[0].split(';')[0]
        tax_string = parts[1] if len(parts) > 1 else ''
        tax_filtered = parts[3] if len(parts) > 3 else ''
        
        tax_dict = {'OTU_ID': otu_id}
        for rank_prefix, rank_name in [('d:', 'Domain'), ('p:', 'Phylum'), ('c:', 'Class'),
                                        ('o:', 'Order'), ('f:', 'Family'), ('g:', 'Genus'), ('s:', 'Species')]:
            tax_dict[rank_name] = ''
            source = tax_filtered if tax_filtered else tax_string
            for part in source.split(','):
                part = part.strip()
                if part.startswith(rank_prefix):
                    val = part[2:].strip('()').split('(')[0]
                    tax_dict[rank_name] = val
        tax_rows.append(tax_dict)

tax_df = pd.DataFrame(tax_rows)
tax_df.to_csv(marker_dir / 'taxonomy.tsv', sep='\t', index=False)
print(f'\n{marker} taxonomy assigned to {len(tax_df)} OTUs')
print(f'  Domain-level: {(tax_df.Domain != "").sum()}')
print(f'  Phylum-level: {(tax_df.Phylum != "").sum()}')
print(f'  Genus-level:  {(tax_df.Genus != "").sum()}')
print('\nNote: SILVA is optimized for 16S/18S, not ITS. ITS taxonomy will be refined with BLAST in a later step.')

Running sintax for ITS (using SILVA)...
Classifying sequences 100%

ITS taxonomy assigned to 893 OTUs
  Domain-level: 876
  Phylum-level: 812
  Genus-level:  725

Note: SILVA is optimized for 16S/18S, not ITS. ITS taxonomy will be refined with BLAST in a later step.

print('=== NB01 Processing Summary ===')
for marker in ['16S', 'ITS']:
    marker_dir = DATA_DIR / marker
    otu_table = pd.read_csv(marker_dir / 'otu_table.tsv', sep='\t', index_col=0)
    tax = pd.read_csv(marker_dir / 'taxonomy.tsv', sep='\t')
    
    print(f'\n{marker}:')
    print(f'  OTUs: {otu_table.shape[1]}')
    print(f'  Samples: {otu_table.shape[0]}')
    print(f'  Total reads: {otu_table.sum().sum():,.0f}')
    print(f'  Reads/sample: {otu_table.sum(axis=1).min():,.0f} – {otu_table.sum(axis=1).max():,.0f}')
    print(f'  Taxonomy: {(tax.Genus != "").sum()}/{len(tax)} OTUs assigned to genus')

print('\n=== Output files ===')
for marker in ['16S', 'ITS']:
    marker_dir = DATA_DIR / marker
    print(f'{marker}:')
    for f in ['otu_table.tsv', 'otus.fasta', 'taxonomy.tsv']:
        p = marker_dir / f
        if p.exists():
            print(f'  {p.relative_to(DATA_DIR)}: {p.stat().st_size/1e3:.1f} KB')

=== NB01 Processing Summary ===

16S:
  OTUs: 3,392
  Samples: 21
  Total reads: 1,041,887
  Reads/sample: 24,287 – 88,781
  Taxonomy: 2,934/3,392 OTUs assigned to genus

ITS:
  OTUs: 893
  Samples: 21
  Total reads: 1,968,172
  Reads/sample: 6,953 – 200,902
  Taxonomy: 725/893 OTUs assigned to genus

=== Output files ===
16S:
  16S/otu_table.tsv: 842.3 KB
  16S/otus.fasta: 1,428.7 KB
  16S/taxonomy.tsv: 312.5 KB
ITS:
  ITS/otu_table.tsv: 198.6 KB
  ITS/otus.fasta: 387.2 KB
  ITS/taxonomy.tsv: 78.4 KB

Nb01 Read Processing

NB01: Read Processing — Primer Trimming, Merging, Denoising & Taxonomy¶

1. Primer Trimming (cutadapt)¶

2. Quality Filtering (fastp)¶

3. Merge Paired-End Reads (vsearch)¶

4. Concatenate, Dereplicate, Chimera Removal & OTU Clustering¶

5. Build OTU Tables¶

6. Taxonomy Assignment¶

7. Processing Summary¶