06 Exemplar Blast
Jupyter notebook from the Annotation-Gap Discovery via Phenotype-Fitness-Pangenome-Gapfilling Integration project.
NB06: Exemplar BLAST and Triangulated Gene Assignment¶
Project: Annotation-Gap Discovery
Goal: Use DIAMOND BLAST against Swiss-Prot exemplar sequences to find homologs
for gapfilled reactions, then triangulate all evidence for final confidence-scored
gene-reaction assignments.
Inputs: NB03 candidates, NB04 Bakta/GapMind, NB05 fitness/co-occurrence
Outputs: blast_hits.tsv, reaction_gene_candidates.tsv (final master table)
Requires: DIAMOND BLAST, UniProt API access, BERDL JupyterHub
import pandas as pd
import numpy as np
import os, subprocess, time, requests
from io import StringIO
DATA_DIR = '../data'
FIG_DIR = '../figures'
os.makedirs(FIG_DIR, exist_ok=True)
os.makedirs(f'{DATA_DIR}/blast', exist_ok=True)
# Load all prior outputs
candidates = pd.read_csv(f'{DATA_DIR}/annotation_gap_candidates.tsv', sep='\t')
bakta_alts = pd.read_csv(f'{DATA_DIR}/bakta_ec_alternatives.tsv', sep='\t')
gf_details = pd.read_csv(f'{DATA_DIR}/gapfill_reaction_details.tsv', sep='\t')
gf_results = pd.read_csv(f'{DATA_DIR}/carbon_source_gapfill_results.tsv', sep='\t')
manifest = pd.read_csv(f'{DATA_DIR}/genome_manifest.tsv', sep='\t')
concordance = pd.read_csv(f'{DATA_DIR}/gapmind_concordance.tsv', sep='\t')
fitness_support = pd.read_csv(f'{DATA_DIR}/fitness_support.tsv', sep='\t')
cooccurrence = pd.read_csv(f'{DATA_DIR}/cooccurrence_candidates.tsv', sep='\t')
cand_enhanced = pd.read_csv(f'{DATA_DIR}/candidates_enhanced.tsv', sep='\t')
enzymatic = gf_details[gf_details['rxn_type'] == 'metabolic'].copy()
print(f'Enzymatic reactions: {len(enzymatic)}')
print(f'NB03 candidates: {len(candidates)}')
print(f'NB04 Bakta alternatives: {len(bakta_alts)}')
print(f'NB05 fitness profiles: {len(fitness_support)}')
print(f'NB05 co-occurrence: {len(cooccurrence)}')
Enzymatic reactions: 201 NB03 candidates: 107 NB04 Bakta alternatives: 1459 NB05 fitness profiles: 71 NB05 co-occurrence: 201
1. Download Swiss-Prot Exemplar Sequences¶
For each unique EC from gapfilled reactions, download up to 5 reviewed
Swiss-Prot sequences from UniProt. These serve as exemplar queries for BLAST.
# Collect unique ECs from gapfilled reactions
ec_to_rxns = {}
for _, row in enzymatic[enzymatic['ec_numbers'].notna()].iterrows():
for ec in str(row['ec_numbers']).replace('|', ';').split(';'):
ec = ec.strip()
if ec and not ec.endswith('-'):
ec_to_rxns.setdefault(ec, set()).add(row['rxn_id_base'])
# Also include partial ECs (ending in -) with a broader query
partial_ecs = set()
for _, row in enzymatic[enzymatic['ec_numbers'].notna()].iterrows():
for ec in str(row['ec_numbers']).replace('|', ';').split(';'):
ec = ec.strip()
if ec and ec.endswith('-'):
partial_ecs.add(ec)
ec_to_rxns.setdefault(ec, set()).add(row['rxn_id_base'])
all_ecs = sorted(ec_to_rxns.keys())
print(f'Unique ECs (complete): {len(all_ecs) - len(partial_ecs)}')
print(f'Partial ECs (X.X.X.-): {len(partial_ecs)}')
print(f'Total ECs to query: {len(all_ecs)}')
# Download Swiss-Prot exemplars (up to 5 per EC, bacteria preferred)
exemplar_fasta = f'{DATA_DIR}/blast/exemplar_sequences.fasta'
ec_seq_count = {}
total_seqs = 0
failed_ecs = []
with open(exemplar_fasta, 'w') as fout:
for i, ec in enumerate(all_ecs):
try:
# Prefer bacterial sequences
url = f'https://rest.uniprot.org/uniprotkb/search?query=ec:{ec}+AND+reviewed:true+AND+taxonomy_id:2&format=fasta&size=5'
r = requests.get(url, timeout=30)
if r.status_code == 200 and r.text.strip():
# Tag sequences with EC in header
for line in r.text.strip().split('\n'):
if line.startswith('>'):
fout.write(f'{line} [EC:{ec}]\n')
else:
fout.write(f'{line}\n')
n = r.text.count('>')
ec_seq_count[ec] = n
total_seqs += n
else:
# Fallback: any organism
url2 = f'https://rest.uniprot.org/uniprotkb/search?query=ec:{ec}+AND+reviewed:true&format=fasta&size=3'
r2 = requests.get(url2, timeout=30)
if r2.status_code == 200 and r2.text.strip():
for line in r2.text.strip().split('\n'):
if line.startswith('>'):
fout.write(f'{line} [EC:{ec}]\n')
else:
fout.write(f'{line}\n')
n = r2.text.count('>')
ec_seq_count[ec] = n
total_seqs += n
else:
failed_ecs.append(ec)
ec_seq_count[ec] = 0
except Exception as e:
failed_ecs.append(ec)
ec_seq_count[ec] = 0
if (i + 1) % 20 == 0:
print(f' Downloaded {i+1}/{len(all_ecs)} ECs, {total_seqs} sequences so far')
time.sleep(0.5)
print(f'\nTotal exemplar sequences: {total_seqs}')
print(f'ECs with sequences: {sum(1 for v in ec_seq_count.values() if v > 0)}')
print(f'ECs with no Swiss-Prot hit: {len(failed_ecs)}')
if failed_ecs:
print(f' Failed ECs: {failed_ecs[:10]}...')
Unique ECs (complete): 77 Partial ECs (X.X.X.-): 7 Total ECs to query: 84
Downloaded 20/84 ECs, 75 sequences so far
Downloaded 40/84 ECs, 169 sequences so far
Downloaded 60/84 ECs, 252 sequences so far
Downloaded 80/84 ECs, 311 sequences so far
Total exemplar sequences: 328 ECs with sequences: 75 ECs with no Swiss-Prot hit: 9 Failed ECs: ['1.2.7.2', '1.3.1.0', '2.7.7.10', '2.A.1.6.-', '3.6.3.25', '4.1.2.5', '4.99.1.1', '5.4.2.5', '5.4.99.3']...
2. Build DIAMOND Database and Run BLAST¶
# Concatenate all target proteomes into one FASTA with org prefix
target_fasta = f'{DATA_DIR}/blast/all_proteomes.fasta'
with open(target_fasta, 'w') as fout:
for org_id in manifest['orgId'].unique():
prot_file = f'{DATA_DIR}/proteomes/{org_id}.fasta'
if os.path.exists(prot_file):
with open(prot_file) as fin:
for line in fin:
if line.startswith('>'):
locus_id = line[1:].strip().split()[0]
fout.write(f'>{org_id}|{locus_id}\n')
else:
fout.write(line)
n_target = sum(1 for line in open(target_fasta) if line.startswith('>'))
print(f'Target proteome: {n_target:,} sequences')
# Build DIAMOND database
db_path = f'{DATA_DIR}/blast/target_db'
result = subprocess.run(
['diamond', 'makedb', '--in', target_fasta, '--db', db_path],
capture_output=True, text=True
)
print(f'DIAMOND makedb: {"OK" if result.returncode == 0 else "FAILED"}')
if result.returncode != 0:
print(result.stderr)
Target proteome: 67,377 sequences
DIAMOND makedb: OK
# Run DIAMOND BLAST
blast_out = f'{DATA_DIR}/blast/blast_results.tsv'
result = subprocess.run([
'diamond', 'blastp',
'--query', exemplar_fasta,
'--db', db_path,
'--outfmt', '6', 'qseqid', 'sseqid', 'pident', 'length', 'mismatch',
'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qcovhsp',
'--evalue', '1e-5',
'--max-target-seqs', '20',
'--id', '25',
'--query-cover', '50',
'--out', blast_out,
'--threads', '4',
], capture_output=True, text=True)
print(f'DIAMOND blastp: {"OK" if result.returncode == 0 else "FAILED"}')
if result.returncode != 0:
print(result.stderr)
else:
n_hits = sum(1 for _ in open(blast_out))
print(f'Total BLAST hits: {n_hits:,}')
DIAMOND blastp: OK Total BLAST hits: 2,990
# Parse BLAST results
blast_cols = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
'gapopen', 'qstart', 'qend', 'sstart', 'send',
'evalue', 'bitscore', 'qcovhsp']
blast_df = pd.read_csv(blast_out, sep='\t', names=blast_cols)
# Extract EC from query header and orgId|locusId from subject
import re
def extract_ec_from_qseqid(qseqid):
# Look up EC from the exemplar FASTA headers
return ec_from_query.get(qseqid, None)
# Build qseqid → EC mapping from the exemplar FASTA
ec_from_query = {}
with open(exemplar_fasta) as f:
for line in f:
if line.startswith('>'):
parts = line.strip()
# qseqid is the accession (sp|XXXX|NAME)
qid = parts[1:].split()[0]
ec_match = re.search(r'\[EC:([^\]]+)\]', parts)
if ec_match:
ec_from_query[qid] = ec_match.group(1)
blast_df['ec'] = blast_df['qseqid'].map(ec_from_query)
blast_df['target_org'] = blast_df['sseqid'].apply(lambda x: x.split('|')[0])
blast_df['target_locus'] = blast_df['sseqid'].apply(lambda x: x.split('|')[1] if '|' in x else x)
# Apply quality thresholds
blast_df['quality'] = 'low'
blast_df.loc[
(blast_df['pident'] >= 30) & (blast_df['qcovhsp'] >= 70) & (blast_df['evalue'] <= 1e-10),
'quality'
] = 'high'
blast_df.loc[
(blast_df['quality'] == 'low') &
(blast_df['pident'] >= 25) & (blast_df['qcovhsp'] >= 50) & (blast_df['evalue'] <= 1e-5),
'quality'
] = 'medium'
print(f'BLAST hits parsed: {len(blast_df):,}')
print(f'With EC mapping: {blast_df["ec"].notna().sum():,}')
print(f'\nQuality distribution:')
print(blast_df['quality'].value_counts().to_string())
print(f'\nTarget organisms hit:')
print(blast_df['target_org'].value_counts().to_string())
BLAST hits parsed: 2,990 With EC mapping: 2,990 Quality distribution: quality high 2505 medium 485 Target organisms hit: target_org Koxy 298 Keio 285 Cup4G11 284 Smeli 272 PV4 217 HerbieS 213 MR1 205 azobra 200 Marino 194 Dyella79 192 Phaeo 182 Korea 174 Dino 145 Btheta 129
# Map BLAST hits to gapfilled reactions
blast_to_rxn = []
for _, hit in blast_df[blast_df['ec'].notna()].iterrows():
rxns = ec_to_rxns.get(hit['ec'], set())
for rxn_id in rxns:
# Check if this reaction is gapfilled in this organism
is_gapfilled = len(enzymatic[
(enzymatic['orgId'] == hit['target_org']) &
(enzymatic['rxn_id_base'] == rxn_id)
]) > 0
if is_gapfilled:
blast_to_rxn.append({
'orgId': hit['target_org'],
'rxn_id_base': rxn_id,
'locusId': hit['target_locus'],
'blast_ec': hit['ec'],
'pident': hit['pident'],
'evalue': hit['evalue'],
'qcovhsp': hit['qcovhsp'],
'bitscore': hit['bitscore'],
'blast_quality': hit['quality'],
'qseqid': hit['qseqid'],
})
blast_rxn = pd.DataFrame(blast_to_rxn).drop_duplicates()
# Keep best hit per organism × reaction × locus
if len(blast_rxn) > 0:
blast_best = blast_rxn.sort_values('bitscore', ascending=False).drop_duplicates(
subset=['orgId', 'rxn_id_base', 'locusId'], keep='first'
)
else:
blast_best = blast_rxn
print(f'BLAST hits mapped to gapfilled reactions: {len(blast_rxn):,}')
print(f'Best hits (deduplicated): {len(blast_best):,}')
print(f'Unique reaction-organism pairs with BLAST hits: {blast_best[["orgId","rxn_id_base"]].drop_duplicates().shape[0]}')
print(f'\nBLAST quality for gapfilled reaction hits:')
if len(blast_best) > 0:
print(blast_best['blast_quality'].value_counts().to_string())
BLAST hits mapped to gapfilled reactions: 304 Best hits (deduplicated): 154 Unique reaction-organism pairs with BLAST hits: 70 BLAST quality for gapfilled reaction hits: blast_quality high 123 medium 31
3. Triangulate All Evidence¶
Combine evidence from all notebooks for final gene-reaction assignments:
- NB03: EC match + fitness + conservation
- NB04: GapMind concordance + Bakta EC alternatives
- NB05: Fitness specificity + co-occurrence
- NB06: BLAST homology
# Build master evidence table for each reaction-organism pair
all_rxn_pairs = enzymatic[['orgId', 'rxn_id_base', 'name', 'ec_numbers']].drop_duplicates()
# NB03 evidence
nb03_resolved = set(zip(candidates['orgId'], candidates['rxn_id_base']))
nb03_high = set(zip(
candidates[candidates['confidence'] == 'high']['orgId'],
candidates[candidates['confidence'] == 'high']['rxn_id_base']
))
nb03_medium = set(zip(
candidates[candidates['confidence'] == 'medium']['orgId'],
candidates[candidates['confidence'] == 'medium']['rxn_id_base']
))
# NB04 evidence
nb04_resolved = set(zip(bakta_alts['orgId'], bakta_alts['rxn_id_base']))
# NB05 fitness specificity
spec_lookup = {}
if len(fitness_support) > 0:
for _, row in fitness_support.iterrows():
key = (row['orgId'], row['rxn_id_base'])
if key not in spec_lookup or abs(row.get('z_score', 0)) > abs(spec_lookup[key].get('z_score', 0)):
spec_lookup[key] = row.to_dict()
# NB05 co-occurrence
coocc_lookup = {}
for _, row in cooccurrence.iterrows():
coocc_lookup[(row['orgId'], row['rxn_id_base'])] = row['gap_evidence']
# NB04 GapMind concordance
gapmind_lookup = {}
for _, row in concordance.iterrows():
if pd.notna(row.get('score_category')):
gapmind_lookup[(row['orgId'], row.get('gapmind_pathway', ''))] = row['score_category']
# NB06 BLAST evidence
blast_resolved = set()
blast_high = set()
if len(blast_best) > 0:
blast_resolved = set(zip(blast_best['orgId'], blast_best['rxn_id_base']))
blast_high_df = blast_best[blast_best['blast_quality'] == 'high']
blast_high = set(zip(blast_high_df['orgId'], blast_high_df['rxn_id_base']))
# Build master table
master = []
for _, row in all_rxn_pairs.iterrows():
key = (row['orgId'], row['rxn_id_base'])
has_nb03 = key in nb03_resolved
has_nb04 = key in nb04_resolved
has_blast = key in blast_resolved
# Evidence count
evidence_streams = sum([has_nb03, has_nb04, has_blast])
# Co-occurrence gap evidence
gap_ev = coocc_lookup.get(key, 'unknown')
# Fitness specificity
spec = spec_lookup.get(key, {})
z_score = spec.get('z_score', np.nan)
# Final confidence
if key in nb03_high or (has_blast and key in blast_high and (has_nb03 or has_nb04)):
final_conf = 'high'
elif key in nb03_medium or (has_blast and key in blast_high) or (has_nb03 and has_nb04):
final_conf = 'medium'
elif has_nb03 or has_nb04 or has_blast:
final_conf = 'low'
else:
final_conf = 'unresolved'
master.append({
'orgId': row['orgId'],
'rxn_id_base': row['rxn_id_base'],
'name': row['name'],
'ec_numbers': row['ec_numbers'],
'has_nb03_candidate': has_nb03,
'has_nb04_bakta': has_nb04,
'has_blast_hit': has_blast,
'evidence_streams': evidence_streams,
'gap_evidence': gap_ev,
'fitness_z_score': z_score,
'final_confidence': final_conf,
})
master_df = pd.DataFrame(master)
print('=== TRIANGULATED EVIDENCE SUMMARY ===')
print(f'\nTotal enzymatic reaction-organism pairs: {len(master_df)}')
print(f'\nFinal confidence distribution:')
print(master_df['final_confidence'].value_counts().to_string())
print(f'\nEvidence streams per pair:')
print(master_df['evidence_streams'].value_counts().sort_index().to_string())
print(f'\nResolution by source:')
print(f' NB03 (EC+fitness+conservation): {master_df["has_nb03_candidate"].sum()}')
print(f' NB04 (Bakta alternative EC): {master_df["has_nb04_bakta"].sum()}')
print(f' NB06 (BLAST homology): {master_df["has_blast_hit"].sum()}')
=== TRIANGULATED EVIDENCE SUMMARY === Total enzymatic reaction-organism pairs: 201 Final confidence distribution: final_confidence unresolved 105 high 44 low 33 medium 19 Evidence streams per pair: evidence_streams 0 105 1 49 2 47 Resolution by source: NB03 (EC+fitness+conservation): 51 NB04 (Bakta alternative EC): 22 NB06 (BLAST homology): 70
# Before/after comparison
total = len(master_df)
nb03_only = master_df['has_nb03_candidate'].sum()
nb03_nb04 = (master_df['has_nb03_candidate'] | master_df['has_nb04_bakta']).sum()
all_sources = (master_df['final_confidence'] != 'unresolved').sum()
print('=== RESOLUTION RATE PROGRESSION ===')
print(f'After NB03 (EC match): {nb03_only:3d} / {total} ({100*nb03_only/total:.1f}%)')
print(f'After NB04 (+ Bakta): {nb03_nb04:3d} / {total} ({100*nb03_nb04/total:.1f}%)')
print(f'After NB06 (+ BLAST): {all_sources:3d} / {total} ({100*all_sources/total:.1f}%)')
print(f'Still unresolved: {total - all_sources:3d} / {total} ({100*(total-all_sources)/total:.1f}%)')
print(f'\nHigh confidence assignments: {(master_df["final_confidence"] == "high").sum()}')
print(f'Medium confidence assignments: {(master_df["final_confidence"] == "medium").sum()}')
print(f'Low confidence assignments: {(master_df["final_confidence"] == "low").sum()}')
# Per-organism resolution
print(f'\nPer-organism resolution rates:')
org_resolution = master_df.groupby('orgId').agg(
total=('final_confidence', 'count'),
resolved=('final_confidence', lambda x: (x != 'unresolved').sum()),
high=('final_confidence', lambda x: (x == 'high').sum()),
).reset_index()
org_resolution['pct_resolved'] = 100 * org_resolution['resolved'] / org_resolution['total']
org_resolution = org_resolution.sort_values('pct_resolved', ascending=False)
print(org_resolution.to_string(index=False))
=== RESOLUTION RATE PROGRESSION ===
After NB03 (EC match): 51 / 201 (25.4%)
After NB04 (+ Bakta): 73 / 201 (36.3%)
After NB06 (+ BLAST): 96 / 201 (47.8%)
Still unresolved: 105 / 201 (52.2%)
High confidence assignments: 44
Medium confidence assignments: 19
Low confidence assignments: 33
Per-organism resolution rates:
orgId total resolved high pct_resolved
Koxy 7 5 2 71.428571
Marino 12 8 4 66.666667
azobra 21 13 4 61.904762
HerbieS 17 10 7 58.823529
Keio 7 4 4 57.142857
Korea 20 10 3 50.000000
MR1 8 4 2 50.000000
Smeli 14 7 4 50.000000
Dino 21 10 3 47.619048
PV4 7 3 2 42.857143
Cup4G11 12 5 2 41.666667
Dyella79 19 7 3 36.842105
Phaeo 21 7 3 33.333333
Btheta 15 3 1 20.000000
4. Visualization¶
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
# Panel 1: Resolution rate progression
stages = ['NB03\n(EC match)', 'NB03+NB04\n(+ Bakta)', 'NB03-06\n(+ BLAST)']
rates = [100*nb03_only/total, 100*nb03_nb04/total, 100*all_sources/total]
colors = ['steelblue', '#2ecc71', '#e74c3c']
axes[0].bar(stages, rates, color=colors)
axes[0].set_ylabel('Resolution Rate (%)')
axes[0].set_title('Annotation Gap Resolution\nProgression')
for i, v in enumerate(rates):
axes[0].text(i, v + 1, f'{v:.0f}%', ha='center', fontweight='bold')
axes[0].set_ylim(0, 105)
# Panel 2: Evidence Venn-like bar chart
only_nb03 = (master_df['has_nb03_candidate'] & ~master_df['has_nb04_bakta'] & ~master_df['has_blast_hit']).sum()
only_nb04 = (~master_df['has_nb03_candidate'] & master_df['has_nb04_bakta'] & ~master_df['has_blast_hit']).sum()
only_blast = (~master_df['has_nb03_candidate'] & ~master_df['has_nb04_bakta'] & master_df['has_blast_hit']).sum()
multi = (master_df['evidence_streams'] >= 2).sum()
unresolved = (master_df['final_confidence'] == 'unresolved').sum()
labels = ['EC only', 'Bakta only', 'BLAST only', 'Multi-evidence', 'Unresolved']
vals = [only_nb03, only_nb04, only_blast, multi, unresolved]
bar_colors = ['steelblue', '#2ecc71', '#e74c3c', '#9b59b6', '#95a5a6']
axes[1].barh(labels, vals, color=bar_colors)
axes[1].set_xlabel('Reaction-Organism Pairs')
axes[1].set_title('Evidence Source Distribution')
for i, v in enumerate(vals):
axes[1].text(v + 0.5, i, str(v), va='center')
# Panel 3: Final confidence distribution
conf_order = ['high', 'medium', 'low', 'unresolved']
conf_colors = ['#2ecc71', '#f39c12', '#e74c3c', '#95a5a6']
conf_counts = master_df['final_confidence'].value_counts().reindex(conf_order, fill_value=0)
axes[2].pie(conf_counts.values, labels=conf_counts.index, colors=conf_colors,
autopct='%1.0f%%', startangle=90)
axes[2].set_title('Final Confidence Distribution')
plt.tight_layout()
plt.savefig(f'{FIG_DIR}/nb06_triangulated_evidence.png', dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved: {FIG_DIR}/nb06_triangulated_evidence.png')
Saved: ../figures/nb06_triangulated_evidence.png
5. Save Final Outputs¶
# Save BLAST hits
if len(blast_best) > 0:
blast_best.to_csv(f'{DATA_DIR}/blast_hits.tsv', sep='\t', index=False)
print(f'Saved blast_hits.tsv: {len(blast_best)} rows')
else:
pd.DataFrame().to_csv(f'{DATA_DIR}/blast_hits.tsv', sep='\t', index=False)
print('Saved blast_hits.tsv: 0 rows')
# Save master candidate table
master_df.to_csv(f'{DATA_DIR}/reaction_gene_candidates.tsv', sep='\t', index=False)
print(f'Saved reaction_gene_candidates.tsv: {len(master_df)} rows')
# Save per-organism summary
org_resolution.to_csv(f'{DATA_DIR}/organism_resolution_summary.tsv', sep='\t', index=False)
print(f'Saved organism_resolution_summary.tsv: {len(org_resolution)} rows')
print()
print('=' * 60)
print('NB06 FINAL SUMMARY')
print('=' * 60)
print(f'Swiss-Prot exemplar sequences: {total_seqs}')
print(f'BLAST hits (quality filtered): {len(blast_best) if len(blast_best) > 0 else 0}')
print(f'BLAST hits → gapfilled rxns: {len(blast_best[["orgId","rxn_id_base"]].drop_duplicates()) if len(blast_best) > 0 else 0}')
print(f'---')
print(f'Total reaction-organism pairs: {total}')
print(f'Resolved (any evidence): {all_sources} ({100*all_sources/total:.1f}%)')
print(f' High confidence: {(master_df["final_confidence"] == "high").sum()}')
print(f' Medium confidence: {(master_df["final_confidence"] == "medium").sum()}')
print(f' Low confidence: {(master_df["final_confidence"] == "low").sum()}')
print(f'Unresolved: {total - all_sources} ({100*(total-all_sources)/total:.1f}%)')
print(f'---')
print(f'Improvement over NB03 baseline: {100*nb03_only/total:.1f}% → {100*all_sources/total:.1f}%')
print('=' * 60)
Saved blast_hits.tsv: 154 rows Saved reaction_gene_candidates.tsv: 201 rows Saved organism_resolution_summary.tsv: 14 rows ============================================================ NB06 FINAL SUMMARY ============================================================ Swiss-Prot exemplar sequences: 328 BLAST hits (quality filtered): 154 BLAST hits → gapfilled rxns: 70 --- Total reaction-organism pairs: 201 Resolved (any evidence): 96 (47.8%) High confidence: 44 Medium confidence: 19 Low confidence: 33 Unresolved: 105 (52.2%) --- Improvement over NB03 baseline: 25.4% → 47.8% ============================================================