import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import warnings
warnings.filterwarnings('ignore')

plt.rcParams.update({
    'figure.dpi': 150,
    'savefig.dpi': 300,
    'font.size': 10,
    'axes.titlesize': 12,
    'axes.labelsize': 11,
    'figure.facecolor': 'white',
})

DATA_DIR = '../data'
FIG_DIR = '../figures'

# Load all outputs
master = pd.read_csv(f'{DATA_DIR}/reaction_gene_candidates.tsv', sep='\t')
candidates = pd.read_csv(f'{DATA_DIR}/annotation_gap_candidates.tsv', sep='\t')
gf_details = pd.read_csv(f'{DATA_DIR}/gapfill_reaction_details.tsv', sep='\t')
manifest = pd.read_csv(f'{DATA_DIR}/genome_manifest.tsv', sep='\t')
cv = pd.read_csv(f'{DATA_DIR}/cross_validation.tsv', sep='\t')
delta = pd.read_csv(f'{DATA_DIR}/delta_metrics.tsv', sep='\t')
ko_df = pd.read_csv(f'{DATA_DIR}/knockout_validation.tsv', sep='\t')
gpr_df = pd.read_csv(f'{DATA_DIR}/gpr_insertions.tsv', sep='\t')
baseline_fba = pd.read_csv(f'{DATA_DIR}/baseline_fba_results.tsv', sep='\t')
gapmind_conc = pd.read_csv(f'{DATA_DIR}/gapmind_concordance.tsv', sep='\t')
blast_hits = pd.read_csv(f'{DATA_DIR}/blast_hits.tsv', sep='\t')
presence = pd.read_csv(f'{DATA_DIR}/presence_absence_matrix.tsv', sep='\t')

enzymatic = gf_details[gf_details['rxn_type'] == 'metabolic'].copy()

print(f'Loaded all datasets.')
print(f'Master: {len(master)} reaction-organism pairs')
print(f'Resolved: {(master["final_confidence"] != "unresolved").sum()} ({100*(master["final_confidence"] != "unresolved").sum()/len(master):.1f}%)')

Loaded all datasets.
Master: 201 reaction-organism pairs
Resolved: 96 (47.8%)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Panel A: Stacked bar of resolution progression
stages = ['NB03\nEC match', 'NB04\n+ Bakta', 'NB06\n+ BLAST']
nb03_only = master['has_nb03_candidate'].sum()
nb04_add = (master['has_nb04_bakta'] & ~master['has_nb03_candidate']).sum()
blast_add = (master['has_blast_hit'] & ~master['has_nb03_candidate'] & ~master['has_nb04_bakta']).sum()

cumulative = [nb03_only, nb03_only + nb04_add, nb03_only + nb04_add + blast_add]
unresolved = [len(master) - c for c in cumulative]

colors_resolved = ['#2196F3', '#4CAF50', '#FF9800']
bottom = [0, nb03_only, nb03_only + nb04_add]
heights = [nb03_only, nb04_add, blast_add]

ax = axes[0]
for i in range(3):
    ax.bar(0, heights[i], bottom=bottom[i], color=colors_resolved[i],
           label=stages[i].replace('\n', ' '), edgecolor='white', linewidth=0.5)
ax.bar(0, unresolved[-1], bottom=cumulative[-1], color='#E0E0E0',
       label='Unresolved', edgecolor='white', linewidth=0.5)

ax.set_xlim(-0.6, 0.6)
ax.set_ylabel('Reaction-organism pairs')
ax.set_title('A. Cumulative Resolution')
ax.set_xticks([])
ax.legend(loc='upper right', fontsize=9)
ax.axhline(y=len(master)*0.30, color='red', linestyle='--', alpha=0.7, linewidth=1)
ax.text(0.35, len(master)*0.30 + 2, 'H1 threshold (30%)', color='red', fontsize=8)

for i in range(3):
    mid = bottom[i] + heights[i]/2
    if heights[i] > 5:
        ax.text(0, mid, f'{heights[i]}', ha='center', va='center',
                fontweight='bold', color='white', fontsize=11)

# Panel B: Confidence breakdown
ax = axes[1]
conf_counts = master['final_confidence'].value_counts()
conf_order = ['high', 'medium', 'low', 'unresolved']
conf_colors = {'high': '#1B5E20', 'medium': '#4CAF50', 'low': '#A5D6A7', 'unresolved': '#E0E0E0'}
vals = [conf_counts.get(c, 0) for c in conf_order]
labels = ['High', 'Medium', 'Low', 'Unresolved']
colors = [conf_colors[c] for c in conf_order]

wedges, texts, autotexts = ax.pie(vals, labels=labels, colors=colors,
                                   autopct='%1.1f%%', startangle=90,
                                   textprops={'fontsize': 10})
ax.set_title('B. Confidence Distribution')

plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig1_resolution_overview.png', bbox_inches='tight')
plt.show()
print(f'Saved fig1_resolution_overview.png')

Saved fig1_resolution_overview.png

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Panel A: Leave-one-out bar chart
ax = axes[0]
configs = ['Full\npipeline', 'w/o NB03\n(EC)', 'w/o NB04\n(Bakta)', 'w/o NB06\n(BLAST)']
loo_vals = cv[cv['configuration'].isin(['full', 'no_nb03', 'no_nb04', 'no_blast'])]['pct'].values
bar_colors = ['#1565C0', '#64B5F6', '#64B5F6', '#64B5F6']

bars = ax.bar(configs, loo_vals, color=bar_colors, edgecolor='white', width=0.6)
ax.axhline(y=30, color='red', linestyle='--', alpha=0.7)
ax.text(3.3, 31, 'H1: 30%', color='red', fontsize=8)
ax.set_ylabel('Resolution rate (%)')
ax.set_title('A. Leave-One-Out Cross-Validation')
ax.set_ylim(0, 55)

for bar, val in zip(bars, loo_vals):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
            f'{val:.1f}%', ha='center', va='bottom', fontsize=9, fontweight='bold')

# Panel B: Evidence stream Venn (approximation as grouped bars)
ax = axes[1]
only_labels = ['NB03\nonly', 'NB04\nonly', 'BLAST\nonly']
only_vals = cv[cv['configuration'].isin(['only_nb03', 'only_nb04', 'only_blast'])]['pct'].values
only_colors = ['#2196F3', '#4CAF50', '#FF9800']

bars2 = ax.bar(only_labels, only_vals, color=only_colors, edgecolor='white', width=0.5)
ax.set_ylabel('Resolution rate (%)')
ax.set_title('B. Single-Stream Resolution')
ax.set_ylim(0, 45)

for bar, val in zip(bars2, only_vals):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
            f'{val:.1f}%', ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig2_cross_validation.png', bbox_inches='tight')
plt.show()
print(f'Saved fig2_cross_validation.png')

Saved fig2_cross_validation.png

# Build per-organism × confidence matrix
org_conf = master.groupby(['orgId', 'final_confidence']).size().unstack(fill_value=0)
conf_order = ['high', 'medium', 'low', 'unresolved']
for c in conf_order:
    if c not in org_conf.columns:
        org_conf[c] = 0
org_conf = org_conf[conf_order]

# Sort by total resolved (descending)
org_conf['resolved'] = org_conf['high'] + org_conf['medium'] + org_conf['low']
org_conf['total'] = org_conf[conf_order].sum(axis=1)
org_conf['pct_resolved'] = 100 * org_conf['resolved'] / org_conf['total']
org_conf = org_conf.sort_values('pct_resolved', ascending=True)

fig, ax = plt.subplots(figsize=(10, 7))

orgs = org_conf.index.tolist()
y_pos = range(len(orgs))

conf_colors = {'high': '#1B5E20', 'medium': '#4CAF50', 'low': '#A5D6A7', 'unresolved': '#E0E0E0'}
left = np.zeros(len(orgs))

for conf in conf_order:
    vals = org_conf[conf].values
    ax.barh(y_pos, vals, left=left, color=conf_colors[conf], label=conf.title(),
            edgecolor='white', linewidth=0.5)
    left += vals

ax.set_yticks(y_pos)
ax.set_yticklabels(orgs)
ax.set_xlabel('Number of gapfilled reactions')
ax.set_title('Resolution by Organism')
ax.legend(loc='lower right', fontsize=9)

# Add percentage labels
for i, org in enumerate(orgs):
    pct = org_conf.loc[org, 'pct_resolved']
    total = org_conf.loc[org, 'total']
    ax.text(total + 0.3, i, f'{pct:.0f}%', va='center', fontsize=9, color='#333')

plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig3_organism_resolution.png', bbox_inches='tight')
plt.show()
print(f'Saved fig3_organism_resolution.png')

Saved fig3_organism_resolution.png

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Panel A: Identity vs coverage scatter
ax = axes[0]
if len(blast_hits) > 0:
    ax.scatter(blast_hits['pident'], blast_hits['qcovhsp'],
               c=np.log10(blast_hits['evalue'].clip(lower=1e-300)),
               cmap='RdYlGn_r', alpha=0.6, s=30, edgecolors='none')
    ax.axvline(x=30, color='red', linestyle='--', alpha=0.5, label='High threshold')
    ax.axhline(y=70, color='red', linestyle='--', alpha=0.5)
    ax.axvline(x=25, color='orange', linestyle=':', alpha=0.5, label='Medium threshold')
    ax.axhline(y=50, color='orange', linestyle=':', alpha=0.5)
    ax.set_xlabel('Percent identity')
    ax.set_ylabel('Query coverage (%)')
    ax.set_title('A. BLAST Hit Quality')
    ax.legend(fontsize=8)

# Panel B: Hits per reaction (top reactions)
ax = axes[1]
if len(blast_hits) > 0:
    rxn_hits = blast_hits.groupby('rxn_id_base').size().sort_values(ascending=False).head(15)
    rxn_hits.plot(kind='barh', ax=ax, color='#FF9800', edgecolor='white')
    ax.set_xlabel('Number of BLAST hits')
    ax.set_title('B. Top Reactions by BLAST Hits')
    ax.invert_yaxis()

plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig4_blast_quality.png', bbox_inches='tight')
plt.show()
print(f'Saved fig4_blast_quality.png')

Saved fig4_blast_quality.png

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Panel A: GapMind score_category distribution for gapfilled cases
ax = axes[0]
if len(gapmind_conc) > 0:
    cat_counts = gapmind_conc['score_category'].value_counts()
    cat_order = ['not_present', 'steps_missing_medium', 'steps_missing_low',
                 'likely_complete', 'complete']
    cat_colors = {'not_present': '#D32F2F', 'steps_missing_medium': '#FF9800',
                  'steps_missing_low': '#FFC107', 'likely_complete': '#8BC34A',
                  'complete': '#4CAF50'}
    existing = [c for c in cat_order if c in cat_counts.index]
    vals = [cat_counts[c] for c in existing]
    colors = [cat_colors.get(c, '#999') for c in existing]
    ax.barh(existing, vals, color=colors, edgecolor='white')
    ax.set_xlabel('Count')
    ax.set_title('A. GapMind Categories for\nGapfilled Pathways')

# Panel B: Concordance — GapMind agrees vs disagrees with gapfilling
ax = axes[1]
if 'concordant' in gapmind_conc.columns:
    conc_counts = gapmind_conc['concordant'].value_counts()
    labels = ['Concordant\n(gap confirmed)', 'Discordant\n(gap not confirmed)']
    vals = [conc_counts.get(True, 0), conc_counts.get(False, 0)]
    colors = ['#4CAF50', '#FF9800']
    ax.bar(labels, vals, color=colors, edgecolor='white', width=0.5)
    ax.set_ylabel('Count')
    ax.set_title('B. GapMind-Gapfill Concordance')
    for i, v in enumerate(vals):
        ax.text(i, v + 1, str(v), ha='center', fontweight='bold')
else:
    ax.text(0.5, 0.5, 'No concordance\ndata available', ha='center', va='center',
            transform=ax.transAxes, fontsize=14, color='#999')
    ax.set_title('B. GapMind-Gapfill Concordance')

plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig5_gapmind_concordance.png', bbox_inches='tight')
plt.show()
print(f'Saved fig5_gapmind_concordance.png')

Saved fig5_gapmind_concordance.png

fig, ax = plt.subplots(figsize=(8, 5))

if len(presence) > 0:
    org_cols = [c for c in presence.columns if c not in ['bakta_ec', 'ec_number', 'rxn_id_base', 'gene_cluster_id']]
    if org_cols:
        n_orgs = presence[org_cols].astype(int).sum(axis=1)
        bins = range(1, len(org_cols) + 2)
        ax.hist(n_orgs, bins=bins, color='#2196F3', edgecolor='white', alpha=0.8,
                align='left')
        ax.set_xlabel('Number of organisms with EC')
        ax.set_ylabel('Number of EC groups')
        ax.set_title('EC Conservation Across 14 Organisms')
        ax.axvline(x=len(org_cols), color='red', linestyle='--', alpha=0.5)
        ax.text(len(org_cols) - 0.5, ax.get_ylim()[1] * 0.9, 'Core',
                color='red', fontsize=9, ha='right')
else:
    ax.text(0.5, 0.5, 'No presence/absence data', ha='center', va='center',
            transform=ax.transAxes, fontsize=14, color='#999')

plt.tight_layout()
plt.savefig(f'{FIG_DIR}/fig6_conservation.png', bbox_inches='tight')
plt.show()
print(f'Saved fig6_conservation.png')

Saved fig6_conservation.png

# Table 1: Overall metrics
print('=' * 60)
print('TABLE 1: PROJECT METRICS')
print('=' * 60)
metrics = delta.set_index('metric')['value']
print(f'Total reaction-organism pairs:    {int(metrics["total_pairs"])}')
print(f'Resolved (any confidence):        {int(metrics["resolved"])} ({metrics["resolution_rate"]:.1f}%)')
print(f'  High confidence:                {int(metrics["high_conf"])}')
print(f'  Medium confidence:              {int(metrics["medium_conf"])}')
print(f'  Low confidence:                 {int(metrics["low_conf"])}')
print(f'Unresolved:                       {int(metrics["unresolved"])}')
print(f'GPR rules inserted:               {int(metrics["gpr_inserted"])}')
print()

# Table 2: Per-organism summary
print('=' * 60)
print('TABLE 2: PER-ORGANISM RESOLUTION')
print('=' * 60)
org_summary = master.groupby('orgId').agg(
    total=('final_confidence', 'count'),
    resolved=('final_confidence', lambda x: (x != 'unresolved').sum()),
    high=('final_confidence', lambda x: (x == 'high').sum()),
    medium=('final_confidence', lambda x: (x == 'medium').sum()),
    low=('final_confidence', lambda x: (x == 'low').sum()),
    evidence_streams=('evidence_streams', 'mean'),
).reset_index()
org_summary['pct'] = (100 * org_summary['resolved'] / org_summary['total']).round(1)
org_summary = org_summary.sort_values('pct', ascending=False)
print(org_summary.to_string(index=False))
print()

# Table 3: Top resolved reactions
print('=' * 60)
print('TABLE 3: TOP HIGH-CONFIDENCE ASSIGNMENTS')
print('=' * 60)
high_conf = master[master['final_confidence'] == 'high'].copy()
if len(high_conf) > 0:
    high_conf_summary = high_conf.groupby('rxn_id_base').agg(
        n_organisms=('orgId', 'nunique'),
        mean_streams=('evidence_streams', 'mean'),
    ).reset_index().sort_values('n_organisms', ascending=False)
    print(high_conf_summary.head(15).to_string(index=False))
print()

# Table 4: Cross-validation summary
print('=' * 60)
print('TABLE 4: EVIDENCE STREAM CROSS-VALIDATION')
print('=' * 60)
print(cv.to_string(index=False))

============================================================
TABLE 1: PROJECT METRICS
============================================================
Total reaction-organism pairs:    201
Resolved (any confidence):        96 (47.8%)
  High confidence:                44
  Medium confidence:              19
  Low confidence:                 33
Unresolved:                       105
GPR rules inserted:               23

============================================================
TABLE 2: PER-ORGANISM RESOLUTION
============================================================
   orgId  total  resolved  high  medium  low  evidence_streams  pct
    Koxy      7         5     2       1    2          1.142857 71.4
  Marino     12         8     4       3    1          1.000000 66.7
  azobra     21        13     4       1    8          0.857143 61.9
 HerbieS     17        10     7       2    1          1.000000 58.8
    Keio      7         4     4       0    0          1.142857 57.1
   Korea     20        10     3       4    3          0.650000 50.0
     MR1      8         4     2       1    1          0.750000 50.0
   Smeli     14         7     4       1    2          0.857143 50.0
    Dino     21        10     3       1    6          0.619048 47.6
     PV4      7         3     2       0    1          0.714286 42.9
 Cup4G11     12         5     2       1    2          0.583333 41.7
Dyella79     19         7     3       1    3          0.526316 36.8
   Phaeo     21         7     3       2    2          0.476190 33.3
  Btheta     15         3     1       1    1          0.266667 20.0

============================================================
TABLE 3: TOP HIGH-CONFIDENCE ASSIGNMENTS
============================================================
rxn_id_base  n_organisms  mean_streams
   rxn02185            9           2.0
   rxn03436            9           2.0
   rxn15947            6           2.0
   rxn15949            5           2.0
   rxn17392            3           2.0
   rxn16048            2           2.0
   rxn01539            1           2.0
   rxn00704            1           2.0
   rxn10204            1           2.0
   rxn04794            1           2.0
   rxn02312            1           2.0
   rxn01753            1           2.0
   rxn15270            1           2.0
   rxn19764            1           2.0
   rxn19766            1           2.0

============================================================
TABLE 4: EVIDENCE STREAM CROSS-VALIDATION
============================================================
configuration  resolved       pct
         full        96 47.761194
      no_nb03        86 42.786070
      no_nb04        80 39.800995
     no_blast        73 36.318408
    only_nb03        51 25.373134
    only_nb04        22 10.945274
   only_blast        70 34.825871

print('=' * 60)
print('FINAL PROJECT SUMMARY')
print('=' * 60)
print()
print('--- Study Design ---')
print(f'Organisms studied: {manifest["orgId"].nunique()}')
n_cs = gf_details['carbon_source'].nunique()
print(f'Carbon sources tested: {n_cs}')
print(f'Total gapfilled reactions: {len(gf_details)}')
print(f'Enzymatic (metabolic) reactions: {len(enzymatic)}')
print(f'Unique reaction-organism pairs: {len(master)}')
print()
print('--- Evidence Integration ---')
print(f'NB03 EC match:           {master["has_nb03_candidate"].sum()} pairs ({100*master["has_nb03_candidate"].sum()/len(master):.1f}%)')
print(f'NB04 Bakta alternatives: {master["has_nb04_bakta"].sum()} pairs ({100*master["has_nb04_bakta"].sum()/len(master):.1f}%)')
print(f'NB06 BLAST homology:     {master["has_blast_hit"].sum()} pairs ({100*master["has_blast_hit"].sum()/len(master):.1f}%)')
print(f'Multi-stream support:    {(master["evidence_streams"] >= 2).sum()} pairs')
print()
print('--- Resolution ---')
total = len(master)
resolved = (master['final_confidence'] != 'unresolved').sum()
print(f'Overall resolution rate:  {100*resolved/total:.1f}% ({resolved}/{total})')
print(f'  High confidence:        {(master["final_confidence"]=="high").sum()}')
print(f'  Medium confidence:      {(master["final_confidence"]=="medium").sum()}')
print(f'  Low confidence:         {(master["final_confidence"]=="low").sum()}')
print(f'  Unresolved:             {(master["final_confidence"]=="unresolved").sum()}')
print()
print('--- Hypothesis ---')
rate = 100*resolved/total
if rate > 30:
    print(f'H1 SUPPORTED: {rate:.1f}% > 30% threshold')
else:
    print(f'H0 NOT REJECTED: {rate:.1f}% < 30% threshold')
print()
print('--- Key Insight ---')
print('No single evidence stream achieves >35% resolution alone.')
print('Triangulation (EC + Bakta + BLAST) is essential, boosting')
print(f'resolution from 25.4% to {rate:.1f}%.')
print(f'{(master["final_confidence"]=="unresolved").sum()} pairs remain unresolved')
print('and represent genuine annotation gaps for experimental follow-up.')
print()
print('=' * 60)
print('NB08 COMPLETE')
print('=' * 60)

============================================================
FINAL PROJECT SUMMARY
============================================================

--- Study Design ---
Organisms studied: 14
Carbon sources tested: 18
Total gapfilled reactions: 219
Enzymatic (metabolic) reactions: 201
Unique reaction-organism pairs: 201

--- Evidence Integration ---
NB03 EC match:           51 pairs (25.4%)
NB04 Bakta alternatives: 22 pairs (10.9%)
NB06 BLAST homology:     70 pairs (34.8%)
Multi-stream support:    47 pairs

--- Resolution ---
Overall resolution rate:  47.8% (96/201)
  High confidence:        44
  Medium confidence:      19
  Low confidence:         33
  Unresolved:             105

--- Hypothesis ---
H1 SUPPORTED: 47.8% > 30% threshold

--- Key Insight ---
No single evidence stream achieves >35% resolution alone.
Triangulation (EC + Bakta + BLAST) is essential, boosting
resolution from 25.4% to 47.8%.
105 pairs remain unresolved
and represent genuine annotation gaps for experimental follow-up.

============================================================
NB08 COMPLETE
============================================================

08 Figures Report

NB08: Publication Figures and Summary Report¶

Figure 1: Pipeline Overview — Resolution by Evidence Stream¶

Figure 2: Evidence Stream Cross-Validation¶

Figure 3: Per-Organism Resolution Heatmap¶

Figure 4: BLAST Hit Quality Distribution¶

Figure 5: GapMind Concordance with Gapfilling¶

Figure 6: Gene Cluster Conservation Patterns¶

Summary Tables¶

Final Summary Statistics for REPORT.md¶