08 Figures Report
Jupyter notebook from the Annotation-Gap Discovery via Phenotype-Fitness-Pangenome-Gapfilling Integration project.
NB08: Publication Figures and Summary Report¶
Project: Annotation-Gap Discovery
Goal: Generate final publication-quality figures and summary tables
for the REPORT.md synthesis.
Inputs: All prior notebook outputs (NB01–NB07)
Outputs: Figures in figures/, summary tables in data/
In [1]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import warnings
warnings.filterwarnings('ignore')
plt.rcParams.update({
'figure.dpi': 150,
'savefig.dpi': 300,
'font.size': 10,
'axes.titlesize': 12,
'axes.labelsize': 11,
'figure.facecolor': 'white',
})
DATA_DIR = '../data'
FIG_DIR = '../figures'
# Load all outputs
master = pd.read_csv(f'{DATA_DIR}/reaction_gene_candidates.tsv', sep='\t')
candidates = pd.read_csv(f'{DATA_DIR}/annotation_gap_candidates.tsv', sep='\t')
gf_details = pd.read_csv(f'{DATA_DIR}/gapfill_reaction_details.tsv', sep='\t')
manifest = pd.read_csv(f'{DATA_DIR}/genome_manifest.tsv', sep='\t')
cv = pd.read_csv(f'{DATA_DIR}/cross_validation.tsv', sep='\t')
delta = pd.read_csv(f'{DATA_DIR}/delta_metrics.tsv', sep='\t')
ko_df = pd.read_csv(f'{DATA_DIR}/knockout_validation.tsv', sep='\t')
gpr_df = pd.read_csv(f'{DATA_DIR}/gpr_insertions.tsv', sep='\t')
baseline_fba = pd.read_csv(f'{DATA_DIR}/baseline_fba_results.tsv', sep='\t')
gapmind_conc = pd.read_csv(f'{DATA_DIR}/gapmind_concordance.tsv', sep='\t')
blast_hits = pd.read_csv(f'{DATA_DIR}/blast_hits.tsv', sep='\t')
presence = pd.read_csv(f'{DATA_DIR}/presence_absence_matrix.tsv', sep='\t')
enzymatic = gf_details[gf_details['rxn_type'] == 'metabolic'].copy()
print(f'Loaded all datasets.')
print(f'Master: {len(master)} reaction-organism pairs')
print(f'Resolved: {(master["final_confidence"] != "unresolved").sum()} ({100*(master["final_confidence"] != "unresolved").sum()/len(master):.1f}%)')
Loaded all datasets. Master: 201 reaction-organism pairs Resolved: 96 (47.8%)
Figure 1: Pipeline Overview — Resolution by Evidence Stream¶
In [2]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Panel A: Stacked bar of resolution progression
stages = ['NB03\nEC match', 'NB04\n+ Bakta', 'NB06\n+ BLAST']
nb03_only = master['has_nb03_candidate'].sum()
nb04_add = (master['has_nb04_bakta'] & ~master['has_nb03_candidate']).sum()
blast_add = (master['has_blast_hit'] & ~master['has_nb03_candidate'] & ~master['has_nb04_bakta']).sum()
cumulative = [nb03_only, nb03_only + nb04_add, nb03_only + nb04_add + blast_add]
unresolved = [len(master) - c for c in cumulative]
colors_resolved = ['#2196F3', '#4CAF50', '#FF9800']
bottom = [0, nb03_only, nb03_only + nb04_add]
heights = [nb03_only, nb04_add, blast_add]
ax = axes[0]
for i in range(3):
ax.bar(0, heights[i], bottom=bottom[i], color=colors_resolved[i],
label=stages[i].replace('\n', ' '), edgecolor='white', linewidth=0.5)
ax.bar(0, unresolved[-1], bottom=cumulative[-1], color='#E0E0E0',
label='Unresolved', edgecolor='white', linewidth=0.5)
ax.set_xlim(-0.6, 0.6)
ax.set_ylabel('Reaction-organism pairs')
ax.set_title('A. Cumulative Resolution')
ax.set_xticks([])
ax.legend(loc='upper right', fontsize=9)
ax.axhline(y=len(master)*0.30, color='red', linestyle='--', alpha=0.7, linewidth=1)
ax.text(0.35, len(master)*0.30 + 2, 'H1 threshold (30%)', color='red', fontsize=8)
for i in range(3):
mid = bottom[i] + heights[i]/2
if heights[i] > 5:
ax.text(0, mid, f'{heights[i]}', ha='center', va='center',
fontweight='bold', color='white', fontsize=11)
# Panel B: Confidence breakdown
ax = axes[1]
conf_counts = master['final_confidence'].value_counts()
conf_order = ['high', 'medium', 'low', 'unresolved']
conf_colors = {'high': '#1B5E20', 'medium': '#4CAF50', 'low': '#A5D6A7', 'unresolved': '#E0E0E0'}
vals = [conf_counts.get(c, 0) for c in conf_order]
labels = ['High', 'Medium', 'Low', 'Unresolved']
colors = [conf_colors[c] for c in conf_order]
wedges, texts, autotexts = ax.pie(vals, labels=labels, colors=colors,
autopct='%1.1f%%', startangle=90,
textprops={'fontsize': 10})
ax.set_title('B. Confidence Distribution')
plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig1_resolution_overview.png', bbox_inches='tight')
plt.show()
print(f'Saved fig1_resolution_overview.png')
Saved fig1_resolution_overview.png
Figure 2: Evidence Stream Cross-Validation¶
In [3]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Panel A: Leave-one-out bar chart
ax = axes[0]
configs = ['Full\npipeline', 'w/o NB03\n(EC)', 'w/o NB04\n(Bakta)', 'w/o NB06\n(BLAST)']
loo_vals = cv[cv['configuration'].isin(['full', 'no_nb03', 'no_nb04', 'no_blast'])]['pct'].values
bar_colors = ['#1565C0', '#64B5F6', '#64B5F6', '#64B5F6']
bars = ax.bar(configs, loo_vals, color=bar_colors, edgecolor='white', width=0.6)
ax.axhline(y=30, color='red', linestyle='--', alpha=0.7)
ax.text(3.3, 31, 'H1: 30%', color='red', fontsize=8)
ax.set_ylabel('Resolution rate (%)')
ax.set_title('A. Leave-One-Out Cross-Validation')
ax.set_ylim(0, 55)
for bar, val in zip(bars, loo_vals):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
f'{val:.1f}%', ha='center', va='bottom', fontsize=9, fontweight='bold')
# Panel B: Evidence stream Venn (approximation as grouped bars)
ax = axes[1]
only_labels = ['NB03\nonly', 'NB04\nonly', 'BLAST\nonly']
only_vals = cv[cv['configuration'].isin(['only_nb03', 'only_nb04', 'only_blast'])]['pct'].values
only_colors = ['#2196F3', '#4CAF50', '#FF9800']
bars2 = ax.bar(only_labels, only_vals, color=only_colors, edgecolor='white', width=0.5)
ax.set_ylabel('Resolution rate (%)')
ax.set_title('B. Single-Stream Resolution')
ax.set_ylim(0, 45)
for bar, val in zip(bars2, only_vals):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
f'{val:.1f}%', ha='center', va='bottom', fontsize=9, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig2_cross_validation.png', bbox_inches='tight')
plt.show()
print(f'Saved fig2_cross_validation.png')
Saved fig2_cross_validation.png
Figure 3: Per-Organism Resolution Heatmap¶
In [4]:
# Build per-organism × confidence matrix
org_conf = master.groupby(['orgId', 'final_confidence']).size().unstack(fill_value=0)
conf_order = ['high', 'medium', 'low', 'unresolved']
for c in conf_order:
if c not in org_conf.columns:
org_conf[c] = 0
org_conf = org_conf[conf_order]
# Sort by total resolved (descending)
org_conf['resolved'] = org_conf['high'] + org_conf['medium'] + org_conf['low']
org_conf['total'] = org_conf[conf_order].sum(axis=1)
org_conf['pct_resolved'] = 100 * org_conf['resolved'] / org_conf['total']
org_conf = org_conf.sort_values('pct_resolved', ascending=True)
fig, ax = plt.subplots(figsize=(10, 7))
orgs = org_conf.index.tolist()
y_pos = range(len(orgs))
conf_colors = {'high': '#1B5E20', 'medium': '#4CAF50', 'low': '#A5D6A7', 'unresolved': '#E0E0E0'}
left = np.zeros(len(orgs))
for conf in conf_order:
vals = org_conf[conf].values
ax.barh(y_pos, vals, left=left, color=conf_colors[conf], label=conf.title(),
edgecolor='white', linewidth=0.5)
left += vals
ax.set_yticks(y_pos)
ax.set_yticklabels(orgs)
ax.set_xlabel('Number of gapfilled reactions')
ax.set_title('Resolution by Organism')
ax.legend(loc='lower right', fontsize=9)
# Add percentage labels
for i, org in enumerate(orgs):
pct = org_conf.loc[org, 'pct_resolved']
total = org_conf.loc[org, 'total']
ax.text(total + 0.3, i, f'{pct:.0f}%', va='center', fontsize=9, color='#333')
plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig3_organism_resolution.png', bbox_inches='tight')
plt.show()
print(f'Saved fig3_organism_resolution.png')
Saved fig3_organism_resolution.png
Figure 4: BLAST Hit Quality Distribution¶
In [5]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Panel A: Identity vs coverage scatter
ax = axes[0]
if len(blast_hits) > 0:
ax.scatter(blast_hits['pident'], blast_hits['qcovhsp'],
c=np.log10(blast_hits['evalue'].clip(lower=1e-300)),
cmap='RdYlGn_r', alpha=0.6, s=30, edgecolors='none')
ax.axvline(x=30, color='red', linestyle='--', alpha=0.5, label='High threshold')
ax.axhline(y=70, color='red', linestyle='--', alpha=0.5)
ax.axvline(x=25, color='orange', linestyle=':', alpha=0.5, label='Medium threshold')
ax.axhline(y=50, color='orange', linestyle=':', alpha=0.5)
ax.set_xlabel('Percent identity')
ax.set_ylabel('Query coverage (%)')
ax.set_title('A. BLAST Hit Quality')
ax.legend(fontsize=8)
# Panel B: Hits per reaction (top reactions)
ax = axes[1]
if len(blast_hits) > 0:
rxn_hits = blast_hits.groupby('rxn_id_base').size().sort_values(ascending=False).head(15)
rxn_hits.plot(kind='barh', ax=ax, color='#FF9800', edgecolor='white')
ax.set_xlabel('Number of BLAST hits')
ax.set_title('B. Top Reactions by BLAST Hits')
ax.invert_yaxis()
plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig4_blast_quality.png', bbox_inches='tight')
plt.show()
print(f'Saved fig4_blast_quality.png')
Saved fig4_blast_quality.png
Figure 5: GapMind Concordance with Gapfilling¶
In [6]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Panel A: GapMind score_category distribution for gapfilled cases
ax = axes[0]
if len(gapmind_conc) > 0:
cat_counts = gapmind_conc['score_category'].value_counts()
cat_order = ['not_present', 'steps_missing_medium', 'steps_missing_low',
'likely_complete', 'complete']
cat_colors = {'not_present': '#D32F2F', 'steps_missing_medium': '#FF9800',
'steps_missing_low': '#FFC107', 'likely_complete': '#8BC34A',
'complete': '#4CAF50'}
existing = [c for c in cat_order if c in cat_counts.index]
vals = [cat_counts[c] for c in existing]
colors = [cat_colors.get(c, '#999') for c in existing]
ax.barh(existing, vals, color=colors, edgecolor='white')
ax.set_xlabel('Count')
ax.set_title('A. GapMind Categories for\nGapfilled Pathways')
# Panel B: Concordance — GapMind agrees vs disagrees with gapfilling
ax = axes[1]
if 'concordant' in gapmind_conc.columns:
conc_counts = gapmind_conc['concordant'].value_counts()
labels = ['Concordant\n(gap confirmed)', 'Discordant\n(gap not confirmed)']
vals = [conc_counts.get(True, 0), conc_counts.get(False, 0)]
colors = ['#4CAF50', '#FF9800']
ax.bar(labels, vals, color=colors, edgecolor='white', width=0.5)
ax.set_ylabel('Count')
ax.set_title('B. GapMind-Gapfill Concordance')
for i, v in enumerate(vals):
ax.text(i, v + 1, str(v), ha='center', fontweight='bold')
else:
ax.text(0.5, 0.5, 'No concordance\ndata available', ha='center', va='center',
transform=ax.transAxes, fontsize=14, color='#999')
ax.set_title('B. GapMind-Gapfill Concordance')
plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig5_gapmind_concordance.png', bbox_inches='tight')
plt.show()
print(f'Saved fig5_gapmind_concordance.png')
Saved fig5_gapmind_concordance.png
Figure 6: Gene Cluster Conservation Patterns¶
In [7]:
fig, ax = plt.subplots(figsize=(8, 5))
if len(presence) > 0:
org_cols = [c for c in presence.columns if c not in ['bakta_ec', 'ec_number', 'rxn_id_base', 'gene_cluster_id']]
if org_cols:
n_orgs = presence[org_cols].astype(int).sum(axis=1)
bins = range(1, len(org_cols) + 2)
ax.hist(n_orgs, bins=bins, color='#2196F3', edgecolor='white', alpha=0.8,
align='left')
ax.set_xlabel('Number of organisms with EC')
ax.set_ylabel('Number of EC groups')
ax.set_title('EC Conservation Across 14 Organisms')
ax.axvline(x=len(org_cols), color='red', linestyle='--', alpha=0.5)
ax.text(len(org_cols) - 0.5, ax.get_ylim()[1] * 0.9, 'Core',
color='red', fontsize=9, ha='right')
else:
ax.text(0.5, 0.5, 'No presence/absence data', ha='center', va='center',
transform=ax.transAxes, fontsize=14, color='#999')
plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig6_conservation.png', bbox_inches='tight')
plt.show()
print(f'Saved fig6_conservation.png')
Saved fig6_conservation.png
Summary Tables¶
In [8]:
# Table 1: Overall metrics
print('=' * 60)
print('TABLE 1: PROJECT METRICS')
print('=' * 60)
metrics = delta.set_index('metric')['value']
print(f'Total reaction-organism pairs: {int(metrics["total_pairs"])}')
print(f'Resolved (any confidence): {int(metrics["resolved"])} ({metrics["resolution_rate"]:.1f}%)')
print(f' High confidence: {int(metrics["high_conf"])}')
print(f' Medium confidence: {int(metrics["medium_conf"])}')
print(f' Low confidence: {int(metrics["low_conf"])}')
print(f'Unresolved: {int(metrics["unresolved"])}')
print(f'GPR rules inserted: {int(metrics["gpr_inserted"])}')
print()
# Table 2: Per-organism summary
print('=' * 60)
print('TABLE 2: PER-ORGANISM RESOLUTION')
print('=' * 60)
org_summary = master.groupby('orgId').agg(
total=('final_confidence', 'count'),
resolved=('final_confidence', lambda x: (x != 'unresolved').sum()),
high=('final_confidence', lambda x: (x == 'high').sum()),
medium=('final_confidence', lambda x: (x == 'medium').sum()),
low=('final_confidence', lambda x: (x == 'low').sum()),
evidence_streams=('evidence_streams', 'mean'),
).reset_index()
org_summary['pct'] = (100 * org_summary['resolved'] / org_summary['total']).round(1)
org_summary = org_summary.sort_values('pct', ascending=False)
print(org_summary.to_string(index=False))
print()
# Table 3: Top resolved reactions
print('=' * 60)
print('TABLE 3: TOP HIGH-CONFIDENCE ASSIGNMENTS')
print('=' * 60)
high_conf = master[master['final_confidence'] == 'high'].copy()
if len(high_conf) > 0:
high_conf_summary = high_conf.groupby('rxn_id_base').agg(
n_organisms=('orgId', 'nunique'),
mean_streams=('evidence_streams', 'mean'),
).reset_index().sort_values('n_organisms', ascending=False)
print(high_conf_summary.head(15).to_string(index=False))
print()
# Table 4: Cross-validation summary
print('=' * 60)
print('TABLE 4: EVIDENCE STREAM CROSS-VALIDATION')
print('=' * 60)
print(cv.to_string(index=False))
============================================================
TABLE 1: PROJECT METRICS
============================================================
Total reaction-organism pairs: 201
Resolved (any confidence): 96 (47.8%)
High confidence: 44
Medium confidence: 19
Low confidence: 33
Unresolved: 105
GPR rules inserted: 23
============================================================
TABLE 2: PER-ORGANISM RESOLUTION
============================================================
orgId total resolved high medium low evidence_streams pct
Koxy 7 5 2 1 2 1.142857 71.4
Marino 12 8 4 3 1 1.000000 66.7
azobra 21 13 4 1 8 0.857143 61.9
HerbieS 17 10 7 2 1 1.000000 58.8
Keio 7 4 4 0 0 1.142857 57.1
Korea 20 10 3 4 3 0.650000 50.0
MR1 8 4 2 1 1 0.750000 50.0
Smeli 14 7 4 1 2 0.857143 50.0
Dino 21 10 3 1 6 0.619048 47.6
PV4 7 3 2 0 1 0.714286 42.9
Cup4G11 12 5 2 1 2 0.583333 41.7
Dyella79 19 7 3 1 3 0.526316 36.8
Phaeo 21 7 3 2 2 0.476190 33.3
Btheta 15 3 1 1 1 0.266667 20.0
============================================================
TABLE 3: TOP HIGH-CONFIDENCE ASSIGNMENTS
============================================================
rxn_id_base n_organisms mean_streams
rxn02185 9 2.0
rxn03436 9 2.0
rxn15947 6 2.0
rxn15949 5 2.0
rxn17392 3 2.0
rxn16048 2 2.0
rxn01539 1 2.0
rxn00704 1 2.0
rxn10204 1 2.0
rxn04794 1 2.0
rxn02312 1 2.0
rxn01753 1 2.0
rxn15270 1 2.0
rxn19764 1 2.0
rxn19766 1 2.0
============================================================
TABLE 4: EVIDENCE STREAM CROSS-VALIDATION
============================================================
configuration resolved pct
full 96 47.761194
no_nb03 86 42.786070
no_nb04 80 39.800995
no_blast 73 36.318408
only_nb03 51 25.373134
only_nb04 22 10.945274
only_blast 70 34.825871
Final Summary Statistics for REPORT.md¶
In [9]:
print('=' * 60)
print('FINAL PROJECT SUMMARY')
print('=' * 60)
print()
print('--- Study Design ---')
print(f'Organisms studied: {manifest["orgId"].nunique()}')
n_cs = gf_details['carbon_source'].nunique()
print(f'Carbon sources tested: {n_cs}')
print(f'Total gapfilled reactions: {len(gf_details)}')
print(f'Enzymatic (metabolic) reactions: {len(enzymatic)}')
print(f'Unique reaction-organism pairs: {len(master)}')
print()
print('--- Evidence Integration ---')
print(f'NB03 EC match: {master["has_nb03_candidate"].sum()} pairs ({100*master["has_nb03_candidate"].sum()/len(master):.1f}%)')
print(f'NB04 Bakta alternatives: {master["has_nb04_bakta"].sum()} pairs ({100*master["has_nb04_bakta"].sum()/len(master):.1f}%)')
print(f'NB06 BLAST homology: {master["has_blast_hit"].sum()} pairs ({100*master["has_blast_hit"].sum()/len(master):.1f}%)')
print(f'Multi-stream support: {(master["evidence_streams"] >= 2).sum()} pairs')
print()
print('--- Resolution ---')
total = len(master)
resolved = (master['final_confidence'] != 'unresolved').sum()
print(f'Overall resolution rate: {100*resolved/total:.1f}% ({resolved}/{total})')
print(f' High confidence: {(master["final_confidence"]=="high").sum()}')
print(f' Medium confidence: {(master["final_confidence"]=="medium").sum()}')
print(f' Low confidence: {(master["final_confidence"]=="low").sum()}')
print(f' Unresolved: {(master["final_confidence"]=="unresolved").sum()}')
print()
print('--- Hypothesis ---')
rate = 100*resolved/total
if rate > 30:
print(f'H1 SUPPORTED: {rate:.1f}% > 30% threshold')
else:
print(f'H0 NOT REJECTED: {rate:.1f}% < 30% threshold')
print()
print('--- Key Insight ---')
print('No single evidence stream achieves >35% resolution alone.')
print('Triangulation (EC + Bakta + BLAST) is essential, boosting')
print(f'resolution from 25.4% to {rate:.1f}%.')
print(f'{(master["final_confidence"]=="unresolved").sum()} pairs remain unresolved')
print('and represent genuine annotation gaps for experimental follow-up.')
print()
print('=' * 60)
print('NB08 COMPLETE')
print('=' * 60)
============================================================ FINAL PROJECT SUMMARY ============================================================ --- Study Design --- Organisms studied: 14 Carbon sources tested: 18 Total gapfilled reactions: 219 Enzymatic (metabolic) reactions: 201 Unique reaction-organism pairs: 201 --- Evidence Integration --- NB03 EC match: 51 pairs (25.4%) NB04 Bakta alternatives: 22 pairs (10.9%) NB06 BLAST homology: 70 pairs (34.8%) Multi-stream support: 47 pairs --- Resolution --- Overall resolution rate: 47.8% (96/201) High confidence: 44 Medium confidence: 19 Low confidence: 33 Unresolved: 105 --- Hypothesis --- H1 SUPPORTED: 47.8% > 30% threshold --- Key Insight --- No single evidence stream achieves >35% resolution alone. Triangulation (EC + Bakta + BLAST) is essential, boosting resolution from 25.4% to 47.8%. 105 pairs remain unresolved and represent genuine annotation gaps for experimental follow-up. ============================================================ NB08 COMPLETE ============================================================