import pandas as pd
import numpy as np
import itertools
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import os

DATA_DIR = '../data'
FIG_DIR = '../figures'

# Load raw data
candidates_nb03 = pd.read_csv(f'{DATA_DIR}/annotation_gap_candidates.tsv', sep='\t')
blast_hits = pd.read_csv(f'{DATA_DIR}/blast_hits.tsv', sep='\t')
master = pd.read_csv(f'{DATA_DIR}/reaction_gene_candidates.tsv', sep='\t')
bakta_alts = pd.read_csv(f'{DATA_DIR}/bakta_ec_alternatives.tsv', sep='\t')
fitness_support = pd.read_csv(f'{DATA_DIR}/fitness_support.tsv', sep='\t')
cand_enhanced = pd.read_csv(f'{DATA_DIR}/candidates_enhanced.tsv', sep='\t')

total_pairs = len(master)
print(f'Total reaction-organism pairs: {total_pairs}')
print(f'NB03 candidates: {len(candidates_nb03)}')
print(f'BLAST hits: {len(blast_hits)}')
print(f'Baseline resolution: {(master["final_confidence"] != "unresolved").sum()} / {total_pairs} '
      f'({100*(master["final_confidence"] != "unresolved").sum()/total_pairs:.1f}%)')

Total reaction-organism pairs: 201
NB03 candidates: 107
BLAST hits: 154
Baseline resolution: 96 / 201 (47.8%)

# Fitness threshold sweep
fit_thresholds = [-1.0, -1.5, -2.0, -2.5, -3.0]
t_thresholds = [3.0, 4.0, 5.0]

# Re-classify each NB03 candidate under varied fitness thresholds
# Original moderate thresholds: FIT_MODERATE = -1.0, T_MODERATE = 3.0
def classify_fitness(row, fit_thresh, t_thresh):
    """Reclassify fitness_class for a candidate row."""
    if row['fitness_class'] == 'essential':
        return 'essential'
    if row['fitness_class'] == 'no_data':
        return 'no_data'
    min_fit = row.get('min_fit', np.nan)
    max_abs_t = row.get('max_abs_t', np.nan)
    if pd.isna(min_fit) or pd.isna(max_abs_t):
        return 'no_data'
    # Moderate thresholds scale proportionally
    fit_moderate = fit_thresh / 2.0
    t_moderate = t_thresh * 0.75
    if min_fit < fit_thresh and max_abs_t > t_thresh:
        return 'strong'
    if min_fit < fit_moderate and max_abs_t > t_moderate:
        return 'moderate'
    return 'non_significant'


def score_nb03_confidence(row, fitness_cls):
    """Re-score NB03 confidence given reclassified fitness."""
    has_fitness = fitness_cls in ('strong', 'moderate')
    is_essential = fitness_cls == 'essential'
    has_conservation = row.get('conservation_class') == 'core'
    gapmind_supports = (pd.notna(row.get('gapmind_score')) and
                        row.get('gapmind_score') == 0.0)
    if fitness_cls == 'strong' and (has_conservation or gapmind_supports):
        return 'high'
    if is_essential and has_conservation:
        return 'high'
    if has_fitness:
        return 'medium'
    if is_essential:
        return 'medium'
    if has_conservation and gapmind_supports:
        return 'medium'
    if has_conservation:
        return 'low'
    return 'low'


fitness_results = []
for fit_t, t_t in itertools.product(fit_thresholds, t_thresholds):
    # Reclassify each candidate
    new_classes = candidates_nb03.apply(
        lambda r: classify_fitness(r, fit_t, t_t), axis=1
    )
    new_confs = candidates_nb03.apply(
        lambda r: score_nb03_confidence(r, classify_fitness(r, fit_t, t_t)),
        axis=1
    )
    
    # Best candidate per reaction
    tmp = candidates_nb03.copy()
    tmp['new_fitness'] = new_classes
    tmp['new_confidence'] = new_confs
    
    n_strong = (new_classes == 'strong').sum()
    n_moderate = (new_classes == 'moderate').sum()
    n_high = (new_confs == 'high').sum()
    n_medium = (new_confs == 'medium').sum()
    n_low = (new_confs == 'low').sum()
    
    # Count resolved reaction-org pairs at NB03 level
    nb03_resolved = tmp[['orgId', 'rxn_id_base']].drop_duplicates().shape[0]
    nb03_high_set = set(zip(
        tmp[tmp['new_confidence'] == 'high']['orgId'],
        tmp[tmp['new_confidence'] == 'high']['rxn_id_base']
    ))
    nb03_medium_set = set(zip(
        tmp[tmp['new_confidence'] == 'medium']['orgId'],
        tmp[tmp['new_confidence'] == 'medium']['rxn_id_base']
    ))
    
    fitness_results.append({
        'fit_threshold': fit_t,
        't_threshold': t_t,
        'n_strong_fitness': n_strong,
        'n_moderate_fitness': n_moderate,
        'n_high_conf': n_high,
        'n_medium_conf': n_medium,
        'n_low_conf': n_low,
        'nb03_resolved': nb03_resolved,
        'nb03_high_pairs': len(nb03_high_set),
        'nb03_medium_pairs': len(nb03_medium_set),
    })

fitness_sens = pd.DataFrame(fitness_results)
print('Fitness threshold sensitivity (NB03-level):')
print(fitness_sens.to_string(index=False))

Fitness threshold sensitivity (NB03-level):
 fit_threshold  t_threshold  n_strong_fitness  n_moderate_fitness  n_high_conf  n_medium_conf  n_low_conf  nb03_resolved  nb03_high_pairs  nb03_medium_pairs
          -1.0          3.0                17                   0           16              7          84             51               12                  5
          -1.0          4.0                14                   3           13             10          84             51               10                  7
          -1.0          5.0                13                   1           12              9          86             51               10                  6
          -1.5          3.0                15                   2           14              9          84             51               12                  6
          -1.5          4.0                13                   4           12             11          84             51               10                  8
          -1.5          5.0                13                   1           12              9          86             51               10                  6
          -2.0          3.0                13                   4           13             10          84             51               11                  7
          -2.0          4.0                12                   5           12             11          84             51               10                  8
          -2.0          5.0                12                   2           12              9          86             51               10                  6
          -2.5          3.0                12                   3           12             10          85             51               10                  7
          -2.5          4.0                11                   4           11             11          85             51                9                  8
          -2.5          5.0                11                   2           11              9          87             51                9                  6
          -3.0          3.0                11                   4           11             11          85             51                9                  8
          -3.0          4.0                10                   5           10             12          85             51                8                  9
          -3.0          5.0                10                   3           10             10          87             51                8                  7

# BLAST threshold sweep
blast_id_thresholds = [25, 30, 35, 40]
blast_cov_thresholds = [50, 60, 70, 80]
blast_eval_high = 1e-10
blast_eval_medium = 1e-5

blast_results = []
for min_id, min_cov in itertools.product(blast_id_thresholds, blast_cov_thresholds):
    # Reclassify BLAST hits
    bh = blast_hits.copy()
    bh['new_quality'] = 'below_threshold'
    bh.loc[
        (bh['pident'] >= min_id) & (bh['qcovhsp'] >= min_cov) & (bh['evalue'] <= blast_eval_high),
        'new_quality'
    ] = 'high'
    bh.loc[
        (bh['new_quality'] == 'below_threshold') &
        (bh['pident'] >= max(min_id - 5, 20)) &
        (bh['qcovhsp'] >= max(min_cov - 20, 30)) &
        (bh['evalue'] <= blast_eval_medium),
        'new_quality'
    ] = 'medium'
    
    # Count hits that pass
    n_high = (bh['new_quality'] == 'high').sum()
    n_medium = (bh['new_quality'] == 'medium').sum()
    n_total = (bh['new_quality'] != 'below_threshold').sum()
    
    # Unique reaction-organism pairs with BLAST evidence
    passing = bh[bh['new_quality'] != 'below_threshold']
    blast_pairs = passing[['orgId', 'rxn_id_base']].drop_duplicates().shape[0]
    high_pairs = bh[bh['new_quality'] == 'high'][['orgId', 'rxn_id_base']].drop_duplicates().shape[0]
    
    blast_results.append({
        'min_identity': min_id,
        'min_coverage': min_cov,
        'n_high_hits': n_high,
        'n_medium_hits': n_medium,
        'n_total_hits': n_total,
        'blast_pairs': blast_pairs,
        'blast_high_pairs': high_pairs,
    })

blast_sens = pd.DataFrame(blast_results)
print('BLAST threshold sensitivity:')
print(blast_sens.to_string(index=False))

BLAST threshold sensitivity:
 min_identity  min_coverage  n_high_hits  n_medium_hits  n_total_hits  blast_pairs  blast_high_pairs
           25            50          153              1           154           70                70
           25            60          142             12           154           70                68
           25            70          135             19           154           70                66
           25            80          121             22           143           68                63
           30            50          137             17           154           70                65
           30            60          129             25           154           70                63
           30            70          123             31           154           70                62
           30            80          112             31           143           68                60
           35            50           96             41           137           65                56
           35            60           94             43           137           65                55
           35            70           88             49           137           65                53
           35            80           81             48           129           63                50
           40            50           80             16            96           56                46
           40            60           78             18            96           56                45
           40            70           75             21            96           56                43
           40            80           70             24            94           55                40

# Re-run full triangulation under threshold combinations
# Fixed: NB04 Bakta evidence (threshold-independent)
nb04_resolved_set = set(zip(bakta_alts['orgId'], bakta_alts['rxn_id_base']))

def compute_resolution(fit_t, t_t, blast_min_id, blast_min_cov):
    """Recompute the full pipeline resolution rate under given thresholds."""
    # Step 1: Reclassify NB03 fitness
    tmp = candidates_nb03.copy()
    tmp['new_fitness'] = tmp.apply(lambda r: classify_fitness(r, fit_t, t_t), axis=1)
    tmp['new_confidence'] = tmp.apply(
        lambda r: score_nb03_confidence(r, classify_fitness(r, fit_t, t_t)), axis=1
    )
    
    nb03_resolved = set(zip(tmp['orgId'], tmp['rxn_id_base']))
    nb03_high = set(zip(
        tmp[tmp['new_confidence'] == 'high']['orgId'],
        tmp[tmp['new_confidence'] == 'high']['rxn_id_base']
    ))
    nb03_medium = set(zip(
        tmp[tmp['new_confidence'] == 'medium']['orgId'],
        tmp[tmp['new_confidence'] == 'medium']['rxn_id_base']
    ))
    
    # Step 2: Reclassify BLAST
    bh = blast_hits.copy()
    bh['new_quality'] = 'below_threshold'
    bh.loc[
        (bh['pident'] >= blast_min_id) & (bh['qcovhsp'] >= blast_min_cov) &
        (bh['evalue'] <= blast_eval_high),
        'new_quality'
    ] = 'high'
    bh.loc[
        (bh['new_quality'] == 'below_threshold') &
        (bh['pident'] >= max(blast_min_id - 5, 20)) &
        (bh['qcovhsp'] >= max(blast_min_cov - 20, 30)) &
        (bh['evalue'] <= blast_eval_medium),
        'new_quality'
    ] = 'medium'
    
    passing = bh[bh['new_quality'] != 'below_threshold']
    blast_resolved = set(zip(passing['orgId'], passing['rxn_id_base']))
    blast_high = set(zip(
        bh[bh['new_quality'] == 'high']['orgId'],
        bh[bh['new_quality'] == 'high']['rxn_id_base']
    ))
    
    # Step 3: Triangulate (same logic as NB06 cell-10)
    n_high = 0
    n_medium = 0
    n_low = 0
    n_unresolved = 0
    
    for _, row in master.iterrows():
        key = (row['orgId'], row['rxn_id_base'])
        has_nb03 = key in nb03_resolved
        has_nb04 = key in nb04_resolved_set
        has_blast = key in blast_resolved
        
        if key in nb03_high or (has_blast and key in blast_high and (has_nb03 or has_nb04)):
            n_high += 1
        elif key in nb03_medium or (has_blast and key in blast_high) or (has_nb03 and has_nb04):
            n_medium += 1
        elif has_nb03 or has_nb04 or has_blast:
            n_low += 1
        else:
            n_unresolved += 1
    
    resolved = n_high + n_medium + n_low
    return {
        'resolved': resolved,
        'pct_resolved': 100 * resolved / total_pairs,
        'n_high': n_high,
        'n_medium': n_medium,
        'n_low': n_low,
        'n_unresolved': n_unresolved,
    }

# Sweep: fitness thresholds × BLAST thresholds
combined_results = []
for fit_t, t_t, blast_id, blast_cov in itertools.product(
    fit_thresholds, t_thresholds, blast_id_thresholds, blast_cov_thresholds
):
    res = compute_resolution(fit_t, t_t, blast_id, blast_cov)
    combined_results.append({
        'fit_threshold': fit_t,
        't_threshold': t_t,
        'blast_min_id': blast_id,
        'blast_min_cov': blast_cov,
        **res,
    })

combined_df = pd.DataFrame(combined_results)
print(f'Total threshold combinations tested: {len(combined_df)}')
print(f'Resolution rate range: {combined_df["pct_resolved"].min():.1f}% – {combined_df["pct_resolved"].max():.1f}%')
print(f'Baseline (fit=-2.0, t=4.0, id=30%, cov=70%): '
      f'{combined_df[(combined_df["fit_threshold"]==-2.0) & (combined_df["t_threshold"]==4.0) & (combined_df["blast_min_id"]==30) & (combined_df["blast_min_cov"]==70)]["pct_resolved"].values[0]:.1f}%')

Total threshold combinations tested: 240
Resolution rate range: 42.8% – 47.8%
Baseline (fit=-2.0, t=4.0, id=30%, cov=70%): 47.8%

# Show summary: resolution rate by fitness threshold (marginalizing over BLAST)
print('Resolution rate by fitness threshold (averaged over BLAST thresholds):')
fit_summary = combined_df.groupby(['fit_threshold', 't_threshold']).agg(
    mean_pct=('pct_resolved', 'mean'),
    min_pct=('pct_resolved', 'min'),
    max_pct=('pct_resolved', 'max'),
    mean_high=('n_high', 'mean'),
).reset_index()
print(fit_summary.to_string(index=False))

print()
print('Resolution rate by BLAST threshold (averaged over fitness thresholds):')
blast_summary = combined_df.groupby(['blast_min_id', 'blast_min_cov']).agg(
    mean_pct=('pct_resolved', 'mean'),
    min_pct=('pct_resolved', 'min'),
    max_pct=('pct_resolved', 'max'),
    mean_high=('n_high', 'mean'),
).reset_index()
print(blast_summary.to_string(index=False))

Resolution rate by fitness threshold (averaged over BLAST thresholds):
 fit_threshold  t_threshold  mean_pct  min_pct   max_pct  mean_high
          -3.0          3.0   46.0199 42.78607 47.761194     41.875
          -3.0          4.0   46.0199 42.78607 47.761194     41.875
          -3.0          5.0   46.0199 42.78607 47.761194     41.875
          -2.5          3.0   46.0199 42.78607 47.761194     41.875
          -2.5          4.0   46.0199 42.78607 47.761194     41.875
          -2.5          5.0   46.0199 42.78607 47.761194     41.875
          -2.0          3.0   46.0199 42.78607 47.761194     41.875
          -2.0          4.0   46.0199 42.78607 47.761194     41.875
          -2.0          5.0   46.0199 42.78607 47.761194     41.875
          -1.5          3.0   46.0199 42.78607 47.761194     41.875
          -1.5          4.0   46.0199 42.78607 47.761194     41.875
          -1.5          5.0   46.0199 42.78607 47.761194     41.875
          -1.0          3.0   46.0199 42.78607 47.761194     41.875
          -1.0          4.0   46.0199 42.78607 47.761194     41.875
          -1.0          5.0   46.0199 42.78607 47.761194     41.875

Resolution rate by BLAST threshold (averaged over fitness thresholds):
 blast_min_id  blast_min_cov  mean_pct   min_pct   max_pct  mean_high
           25             50 47.761194 47.761194 47.761194       47.0
           25             60 47.761194 47.761194 47.761194       46.0
           25             70 47.761194 47.761194 47.761194       44.0
           25             80 47.263682 47.263682 47.263682       44.0
           30             50 47.761194 47.761194 47.761194       46.0
           30             60 47.761194 47.761194 47.761194       45.0
           30             70 47.761194 47.761194 47.761194       44.0
           30             80 47.263682 47.263682 47.263682       44.0
           35             50 45.771144 45.771144 45.771144       42.0
           35             60 45.771144 45.771144 45.771144       42.0
           35             70 45.771144 45.771144 45.771144       41.0
           35             80 45.273632 45.273632 45.273632       40.0
           40             50 43.283582 43.283582 43.283582       37.0
           40             60 43.283582 43.283582 43.283582       37.0
           40             70 43.283582 43.283582 43.283582       36.0
           40             80 42.786070 42.786070 42.786070       35.0

fig, axes = plt.subplots(2, 2, figsize=(14, 11))

# Panel A: Resolution rate vs fitness threshold (lines for each t_threshold)
ax = axes[0, 0]
# Average over BLAST thresholds
for t_t in t_thresholds:
    subset = combined_df[combined_df['t_threshold'] == t_t].groupby('fit_threshold')['pct_resolved'].mean()
    ax.plot(subset.index, subset.values, marker='o', label=f'|t| > {t_t}')
ax.axhline(y=47.8, color='gray', linestyle='--', alpha=0.5, label='Baseline (47.8%)')
ax.axhline(y=30.0, color='red', linestyle=':', alpha=0.5, label='H1 threshold (30%)')
ax.set_xlabel('Fitness threshold (min_fit)')
ax.set_ylabel('Resolution rate (%)')
ax.set_title('A. Fitness threshold sensitivity')
ax.legend(fontsize=8)
ax.set_ylim(20, 60)
ax.invert_xaxis()

# Panel B: Resolution rate heatmap — BLAST identity × coverage
ax = axes[0, 1]
# At baseline fitness thresholds
baseline_fit = combined_df[
    (combined_df['fit_threshold'] == -2.0) & (combined_df['t_threshold'] == 4.0)
]
pivot_blast = baseline_fit.pivot(index='blast_min_id', columns='blast_min_cov', values='pct_resolved')
sns.heatmap(pivot_blast, annot=True, fmt='.1f', cmap='YlGnBu', ax=ax,
            vmin=30, vmax=55, cbar_kws={'label': 'Resolution %'})
ax.set_xlabel('Min coverage (%)')
ax.set_ylabel('Min identity (%)')
ax.set_title('B. BLAST threshold sensitivity\n(at baseline fitness thresholds)')
# Mark baseline cell
baseline_row = list(pivot_blast.index).index(30)
baseline_col = list(pivot_blast.columns).index(70)
ax.add_patch(plt.Rectangle((baseline_col, baseline_row), 1, 1,
                            fill=False, edgecolor='red', linewidth=2))

# Panel C: High-confidence count vs combined thresholds
ax = axes[1, 0]
# Group by fitness threshold, show high-confidence at baseline BLAST
baseline_blast = combined_df[
    (combined_df['blast_min_id'] == 30) & (combined_df['blast_min_cov'] == 70)
]
for t_t in t_thresholds:
    subset = baseline_blast[baseline_blast['t_threshold'] == t_t]
    ax.plot(subset['fit_threshold'], subset['n_high'], marker='s', label=f'|t| > {t_t}')
ax.axhline(y=44, color='gray', linestyle='--', alpha=0.5, label='Baseline (44)')
ax.set_xlabel('Fitness threshold (min_fit)')
ax.set_ylabel('High-confidence assignments')
ax.set_title('C. High-confidence count vs fitness threshold\n(at baseline BLAST thresholds)')
ax.legend(fontsize=8)
ax.invert_xaxis()

# Panel D: Overall distribution — resolution rate histogram across all combos
ax = axes[1, 1]
ax.hist(combined_df['pct_resolved'], bins=20, color='steelblue', edgecolor='white', alpha=0.8)
ax.axvline(x=47.8, color='red', linestyle='--', linewidth=2, label='Baseline (47.8%)')
ax.axvline(x=30.0, color='orange', linestyle=':', linewidth=2, label='H1 threshold (30%)')
ax.set_xlabel('Resolution rate (%)')
ax.set_ylabel('Number of threshold combinations')
ax.set_title(f'D. Resolution rate distribution\n(n={len(combined_df)} combinations)')
ax.legend(fontsize=8)

plt.tight_layout()
plt.savefig(f'{FIG_DIR}/supplement_threshold_sensitivity.png', dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved: {FIG_DIR}/supplement_threshold_sensitivity.png')

Saved: ../figures/supplement_threshold_sensitivity.png

# Key robustness metrics
above_h1 = (combined_df['pct_resolved'] >= 30.0).sum()
above_40 = (combined_df['pct_resolved'] >= 40.0).sum()
total_combos = len(combined_df)

print('=' * 60)
print('THRESHOLD SENSITIVITY SUMMARY')
print('=' * 60)
print(f'Total threshold combinations tested: {total_combos}')
print(f'Resolution rate range: {combined_df["pct_resolved"].min():.1f}% – {combined_df["pct_resolved"].max():.1f}%')
print(f'Mean resolution rate:  {combined_df["pct_resolved"].mean():.1f}%')
print(f'Std deviation:         {combined_df["pct_resolved"].std():.1f}%')
print(f'Median:                {combined_df["pct_resolved"].median():.1f}%')
print()
print(f'Combinations exceeding H1 threshold (30%): {above_h1}/{total_combos} ({100*above_h1/total_combos:.0f}%)')
print(f'Combinations exceeding 40%:                {above_40}/{total_combos} ({100*above_40/total_combos:.0f}%)')
print()
print('High-confidence assignments range: '
      f'{combined_df["n_high"].min()} – {combined_df["n_high"].max()} '
      f'(baseline: 44)')
print()

# Most sensitive parameter
# Variance decomposition: which parameter explains most variance in resolution?
from itertools import combinations as combs

params = ['fit_threshold', 't_threshold', 'blast_min_id', 'blast_min_cov']
print('Resolution variance explained by each parameter:')
total_var = combined_df['pct_resolved'].var()
for param in params:
    group_means = combined_df.groupby(param)['pct_resolved'].mean()
    between_var = group_means.var()
    frac = between_var / total_var if total_var > 0 else 0
    print(f'  {param:20s}: {frac:.1%} of variance')

print()
print('Conclusion: ', end='')
if above_h1 == total_combos:
    print('H1 is supported under ALL tested threshold combinations.')
elif above_h1 > 0.9 * total_combos:
    print(f'H1 is supported under {100*above_h1/total_combos:.0f}% of threshold combinations — robust.')
else:
    print(f'H1 support depends on threshold choice ({100*above_h1/total_combos:.0f}% of combinations).')

============================================================
THRESHOLD SENSITIVITY SUMMARY
============================================================
Total threshold combinations tested: 240
Resolution rate range: 42.8% – 47.8%
Mean resolution rate:  46.0%
Std deviation:         1.9%
Median:                46.5%

Combinations exceeding H1 threshold (30%): 240/240 (100%)
Combinations exceeding 40%:                240/240 (100%)

High-confidence assignments range: 35 – 47 (baseline: 44)

Resolution variance explained by each parameter:
  fit_threshold       : 0.0% of variance
  t_threshold         : 0.0% of variance
  blast_min_id        : 131.0% of variance
  blast_min_cov       : 1.8% of variance

Conclusion: H1 is supported under ALL tested threshold combinations.

# Save results
combined_df.to_csv(f'{DATA_DIR}/threshold_sensitivity.tsv', sep='\t', index=False)
print(f'Saved threshold_sensitivity.tsv: {len(combined_df)} rows')

# Also save compact summary
summary_rows = []
summary_rows.append({
    'analysis': 'baseline',
    'fit_threshold': -2.0, 't_threshold': 4.0,
    'blast_min_id': 30, 'blast_min_cov': 70,
    'resolved': 96, 'pct_resolved': 47.8,
    'n_high': 44, 'n_medium': 19, 'n_low': 33,
})
# Most permissive
most_permissive = combined_df.loc[combined_df['pct_resolved'].idxmax()]
summary_rows.append({
    'analysis': 'most_permissive',
    **{k: most_permissive[k] for k in ['fit_threshold', 't_threshold',
       'blast_min_id', 'blast_min_cov', 'resolved', 'pct_resolved',
       'n_high', 'n_medium', 'n_low']}
})
# Most stringent
most_stringent = combined_df.loc[combined_df['pct_resolved'].idxmin()]
summary_rows.append({
    'analysis': 'most_stringent',
    **{k: most_stringent[k] for k in ['fit_threshold', 't_threshold',
       'blast_min_id', 'blast_min_cov', 'resolved', 'pct_resolved',
       'n_high', 'n_medium', 'n_low']}
})

summary_df = pd.DataFrame(summary_rows)
print('\nKey threshold scenarios:')
print(summary_df.to_string(index=False))

print('\n' + '=' * 60)
print('Supplement complete.')
print('=' * 60)

Saved threshold_sensitivity.tsv: 240 rows

Key threshold scenarios:
       analysis  fit_threshold  t_threshold  blast_min_id  blast_min_cov  resolved  pct_resolved  n_high  n_medium  n_low
       baseline           -2.0          4.0          30.0           70.0      96.0     47.800000    44.0      19.0   33.0
most_permissive           -1.0          3.0          25.0           50.0      96.0     47.761194    47.0      24.0   25.0
 most_stringent           -1.0          3.0          40.0           80.0      86.0     42.786070    35.0       7.0   44.0

============================================================
Supplement complete.
============================================================

Supplement Threshold Sensitivity

Supplement: Threshold Sensitivity Analysis¶

1. Fitness Threshold Sensitivity¶

2. BLAST Threshold Sensitivity¶

3. Combined Pipeline Sensitivity¶

4. Visualization¶

5. Robustness Assessment¶