import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from pathlib import Path
from berdl_notebook_utils.setup_spark_session import get_spark_session

pd.set_option('display.max_colwidth', 90)
pd.set_option('display.width', 200)
sns.set_context('notebook')
sns.set_style('whitegrid')

PROJ = Path('/home/aparkin/BERIL-research-observatory/projects/caulobacter_fur_lipida_loss')
DATA_OUT = PROJ / 'data'
FIG = PROJ / 'figures'

spark = get_spark_session()
print('Spark session ready:', spark.sql('SELECT 1 AS one').collect()[0]['one'])

Spark session ready: 1

# All Caulobacter experiments
exps = spark.sql("""
    SELECT expName, expDesc, expDescLong, expGroup,
           media, aerobic, liquid, condition_1, units_1, concentration_1,
           condition_2, units_2, concentration_2,
           condition_3, units_3, concentration_3,
           condition_4, units_4, concentration_4
    FROM kescience_fitnessbrowser.experiment
    WHERE orgId = 'Caulo'
""").toPandas()
print(f'Caulobacter experiments: {len(exps)}')

# expGroup distribution
print('\nexpGroup distribution:')
print(exps['expGroup'].value_counts())

Caulobacter experiments: 198

expGroup distribution:
expGroup
stress               95
nitrogen source      46
carbon source        42
pye                  10
no stress control     3
pH                    2
Name: count, dtype: int64

# All condition tokens
all_conditions = pd.concat([
    exps['condition_1'].dropna().str.lower(),
    exps['condition_2'].dropna().str.lower(),
    exps['condition_3'].dropna().str.lower(),
    exps['condition_4'].dropna().str.lower(),
])
print(f'Total condition tokens: {len(all_conditions)} ({all_conditions.nunique()} unique)')
print('\nMost common conditions:')
print(all_conditions.value_counts().head(30))

Total condition tokens: 183 (63 unique)

Most common conditions:
sodium fluoride                                   6
sodium nitrite                                    6
l-glutamic acid monopotassium salt monohydrate    5
l-proline                                         5
l-alanine                                         5
n-acetyl-d-glucosamine                            5
sodium nitrate                                    4
chloramphenicol                                   4
tetracycline hydrochloride                        4
spectinomycin dihydrochloride pentahydrate        4
carbenicillin disodium salt                       4
nalidixic acid sodium salt                        4
nickel (ii) chloride hexahydrate                  4
cobalt chloride hexahydrate                       4
paraquat dichloride                               4
thallium(i) acetate                               4
cisplatin                                         4
sodium perchlorate monohydrate                    4
dimethyl sulfoxide                                4
d-glucose                                         3
d-maltose monohydrate                             3
d-xylose                                          3
d-raffinose pentahydrate                          3
a-cyclodextrin                                    3
d-cellobiose                                      3
m-inositol                                        3
d-salicin                                         3
beta-lactose                                      3
d-mannose                                         3
chir-090                                          3
Name: count, dtype: int64

# Build condition class flags
IRON_RE = re.compile(r'fe\b|iron|2,2.bipyridyl|ferric|ferrous|hemin|fec|chelat|dipyridyl|deferoxamine|defroxam|edta|nitrilotriacetic|ntab|ntpa|fur', re.I)
OXIDATIVE_RE = re.compile(r'h2o2|hydrogen peroxide|peroxide|paraquat|menadione|tBHP|t.butyl.hydroperoxide|cumene|n.acetyl.cysteine|nac|sodium dichromate|chromat|tellurite|oxidative', re.I)
ENVELOPE_RE = re.compile(r'vancomycin|bacitracin|polymyxin|colistin|deoxycholat|cholat|sds|sodium dodecyl|triton|edta|cetrimide|chx|chlorhexidin|imp|tetracyclin|ampicillin|kanamycin|erythromycin|chloramphenicol|spectinomycin|gentamicin|fusid|nalid|cipro|novobiocin|nisin|cerulenin|cccp|carbonyl cyanide|protonophore|membrane|envelope', re.I)
CARBON_RE  = re.compile(r'glucose|fructose|galactose|sucrose|maltose|lactose|trehalose|xylose|arabinose|ribose|mannose|cellobiose|raffinose|melibiose|sorbose|inositol|glycerol|sorbitol|mannitol|gluconate|pyruvate|acetate|succinate|fumarate|malate|citrate|lactate|formate|ethanol|methanol|d-alanine|l-arginine|carbon source|c.source', re.I)

def classify(row):
    classes = []
    s = ' '.join(str(row.get(c) or '') for c in ['expDesc','expDescLong','condition_1','condition_2','condition_3','condition_4'])
    g = (row.get('expGroup') or '').lower()
    if IRON_RE.search(s) or 'fur' in g:
        classes.append('iron')
    if OXIDATIVE_RE.search(s) or g == 'stress':
        # 'stress' expGroup is broad; refine using condition text
        if OXIDATIVE_RE.search(s):
            classes.append('oxidative')
    if ENVELOPE_RE.search(s):
        classes.append('envelope')
    if CARBON_RE.search(s) or g == 'carbon source':
        classes.append('carbon')
    return ';'.join(sorted(set(classes))) if classes else 'other'

exps['cond_classes'] = exps.apply(classify, axis=1)
print('Condition class membership (may multi-belong):')
print(exps['cond_classes'].value_counts())
print()

# Per-class single-membership counts
for cls in ['iron','oxidative','envelope','carbon']:
    n = exps['cond_classes'].str.contains(cls, regex=False, na=False).sum()
    print(f'  {cls:10s}  {n} experiments')

Condition class membership (may multi-belong):
cond_classes
other        124
carbon        48
envelope      22
oxidative      4
Name: count, dtype: int64

  iron        0 experiments
  oxidative   4 experiments
  envelope    22 experiments
  carbon      48 experiments

# Decision gate (RESEARCH_PLAN v2)
n_iron = exps['cond_classes'].str.contains('iron', regex=False, na=False).sum()
n_env  = exps['cond_classes'].str.contains('envelope', regex=False, na=False).sum()
n_ox   = exps['cond_classes'].str.contains('oxidative', regex=False, na=False).sum()
n_carbon = exps['cond_classes'].str.contains('carbon', regex=False, na=False).sum()

PREFLIGHT_PASS = (n_env >= 3) and (n_iron >= 3)
print('Preflight gate (need ≥3 envelope AND ≥3 iron-limitation experiments):')
print(f'  envelope: {n_env}  (need ≥3)')
print(f'  iron:     {n_iron}  (need ≥3)')
print(f'  oxidative: {n_ox} (informational)')
print(f'  carbon:   {n_carbon} (informational)')
print()
if PREFLIGHT_PASS:
    print('PREFLIGHT PASS — H2 testable at full resolution. Proceed.')
else:
    print('PREFLIGHT FAIL — H2 drops to EXPLORATORY status (not a plan-reframe trigger; just descope).')
    print('Will still run the analysis but treat results as exploratory.')

exps.to_csv(DATA_OUT / 'NB02_caulo_experiments.csv', index=False)
print(f'\nSaved {DATA_OUT/"NB02_caulo_experiments.csv"}')

Preflight gate (need ≥3 envelope AND ≥3 iron-limitation experiments):
  envelope: 22  (need ≥3)
  iron:     0  (need ≥3)
  oxidative: 4 (informational)
  carbon:   48 (informational)

PREFLIGHT FAIL — H2 drops to EXPLORATORY status (not a plan-reframe trigger; just descope).
Will still run the analysis but treat results as exploratory.

Saved /home/aparkin/BERIL-research-observatory/projects/caulobacter_fur_lipida_loss/data/NB02_caulo_experiments.csv

genes = spark.sql("""
    SELECT locusId, sysName, gene, desc, scaffoldId, begin, end, strand, type
    FROM kescience_fitnessbrowser.gene
    WHERE orgId = 'Caulo'
""").toPandas()
print(f'Caulobacter gene rows: {len(genes)}')
print(f'sysName populated: {genes["sysName"].notna().sum()}')
print(f'Examples:')
display(genes.head(5))

# Build sysName (CCNA_*) → locusId map
ccna_to_fb = genes.set_index('sysName')['locusId'].to_dict() if genes['sysName'].notna().any() else {}
print(f'\nCCNA→FB locusId map size: {len(ccna_to_fb)}')
print('Example mappings:', dict(list(ccna_to_fb.items())[:5]))

Caulobacter gene rows: 3943
sysName populated: 3943
Examples:

CCNA→FB locusId map size: 3943
Example mappings: {'CCNA_00001': 'CCNA_00001', 'CCNA_00002': 'CCNA_00002', 'CCNA_00003': 'CCNA_00003', 'CCNA_00004': 'CCNA_00004', 'CCNA_00005': 'CCNA_00005'}

fit_raw = spark.sql("""
    SELECT expName, locusId,
           CAST(fit AS DOUBLE) AS fit,
           CAST(t   AS DOUBLE) AS t
    FROM kescience_fitnessbrowser.fitbyexp_caulo
""").toPandas()
print(f'fitbyexp_caulo rows: {len(fit_raw)}')
print(f'Distinct experiments: {fit_raw["expName"].nunique()}')
print(f'Distinct loci: {fit_raw["locusId"].nunique()}')

# Attach CCNA sysName
fit = fit_raw.merge(genes[['locusId','sysName','gene','desc']], on='locusId', how='left')
fit['has_ccna'] = fit['sysName'].notna()
print(f'\nFitness rows with CCNA sysName: {fit["has_ccna"].sum()} / {len(fit)}')

# Attach condition class
fit = fit.merge(exps[['expName','cond_classes','condition_1','condition_2','condition_3','condition_4','expDesc']], on='expName', how='left')

fitbyexp_caulo rows: 655776
Distinct experiments: 198
Distinct loci: 3312

Fitness rows with CCNA sysName: 655776 / 655776

# Re-derive concordant_strong from NB01 output
j = pd.read_csv(DATA_OUT / 'NB01_fur_only_signature.csv')
def cat(row):
    if pd.isna(row['log2fc_ours']): return 'missing'
    same = np.sign(row['log2fc_fur']) == np.sign(row['log2fc_ours'])
    ao = abs(row['log2fc_ours'])
    if not same and ao > 0.5: return 'discordant'
    if same and ao > 1: return 'concordant_strong'
    if same and ao > 0.5: return 'concordant_weak'
    return 'buffered'
j['cat'] = j.apply(cat, axis=1)

# Path A
path_A = j[j['cat']=='concordant_strong']['locustag'].tolist()
print(f'Path A (concordant_strong): {len(path_A)} genes')

# Path B - SspB-buffered set
# Define cbb3/fix operon: CCNA_01466-01476 ish from NB01 buffered top hits
# Take all NB01 buffered with |log2fc_fur|>3 (Leaden strong, ours muted)
buffered_strong = j[(j['cat']=='buffered') & (j['log2fc_fur'].abs() > 3)]
path_B = buffered_strong['locustag'].tolist()
print(f'Path B (SspB-buffered, |Leaden logFC|>3): {len(path_B)} genes')

print('\nPath A genes:')
print(path_A)
print('\nPath B genes:')
print(path_B)

Path A (concordant_strong): 32 genes
Path B (SspB-buffered, |Leaden logFC|>3): 26 genes

Path A genes:
['CCNA_02278', 'CCNA_02452', 'CCNA_03904', 'CCNA_03372', 'CCNA_03158', 'CCNA_03155', 'CCNA_03156', 'CCNA_03157', 'CCNA_03997', 'CCNA_00028', 'CCNA_00748', 'CCNA_02277', 'CCNA_03022', 'CCNA_02031', 'CCNA_03641', 'CCNA_03642', 'CCNA_03644', 'CCNA_01089', 'CCNA_01090', 'CCNA_01443', 'CCNA_02273', 'CCNA_02274', 'CCNA_00733', 'CCNA_03159', 'CCNA_02275', 'CCNA_03023', 'CCNA_R0088', 'CCNA_00055', 'CCNA_00210', 'CCNA_02048', 'CCNA_02910', 'CCNA_03108']

Path B genes:
['CCNA_00798', 'CCNA_00799', 'CCNA_01471', 'CCNA_01472', 'CCNA_01473', 'CCNA_01475', 'CCNA_01476', 'CCNA_01477', 'CCNA_02200', 'CCNA_01466', 'CCNA_01520', 'CCNA_02603', 'CCNA_00279', 'CCNA_00854', 'CCNA_02604', 'CCNA_00800', 'CCNA_00801', 'CCNA_01467', 'CCNA_01468', 'CCNA_01470', 'CCNA_00790', 'CCNA_01847', 'CCNA_01850', 'CCNA_01851', 'CCNA_03689', 'CCNA_00802']

def score_phenotype(gene_set, fit_df, classes=('iron','envelope'), t_threshold=4.0, n_exps_required=2):
    rows = []
    for ccna in gene_set:
        sub = fit_df[fit_df['sysName']==ccna].copy()
        if len(sub) == 0:
            rows.append({'ccna':ccna,'in_fb':False, 'n_phenotype_exps':0,
                         'max_abs_t_in_class':np.nan, 'phenotype_bearing':False,
                         'best_class':None, 'best_exp':None, 'best_t':np.nan, 'best_fit':np.nan})
            continue
        # exps in relevant classes
        in_class = sub[sub['cond_classes'].fillna('').str.contains('|'.join(classes), regex=True, na=False)].copy()
        n_strong = (in_class['t'].abs() > t_threshold).sum()
        if len(in_class) > 0:
            best_idx = in_class['t'].abs().idxmax()
            best = in_class.loc[best_idx]
            best_class = best['cond_classes']; best_exp = best['expName']
            best_t = best['t']; best_fit = best['fit']
            max_abs_t = abs(best_t)
        else:
            best_class = best_exp = best_t = best_fit = None
            max_abs_t = np.nan
        rows.append({'ccna':ccna,'in_fb':True, 'n_phenotype_exps':int(n_strong),
                     'max_abs_t_in_class':max_abs_t,
                     'phenotype_bearing': n_strong >= n_exps_required,
                     'best_class':best_class, 'best_exp':best_exp,
                     'best_t':best_t, 'best_fit':best_fit})
    return pd.DataFrame(rows)

score_A = score_phenotype(path_A, fit)
score_B = score_phenotype(path_B, fit)

# Attach product/category from NB01
nb1 = j[['locustag','product','category','log2fc_fur','log2fc_ours']].rename(columns={'locustag':'ccna'})
score_A = score_A.merge(nb1, on='ccna', how='left')
score_B = score_B.merge(nb1, on='ccna', how='left')

print('=== Path A (concordant_strong, n=32) — phenotype scoring ===')
print(score_A.sort_values('max_abs_t_in_class', ascending=False)[
    ['ccna','phenotype_bearing','n_phenotype_exps','max_abs_t_in_class','best_class','best_exp','best_t','best_fit','product']
].to_string(index=False))

=== Path A (concordant_strong, n=32) — phenotype scoring ===
      ccna  phenotype_bearing  n_phenotype_exps  max_abs_t_in_class best_class  best_exp    best_t  best_fit                                                product
CCNA_03108               True                22           43.658995   envelope set4IT033 43.658995  7.950308                 TonB-dependent outer membrane receptor
CCNA_02910               True                20           28.123009   envelope set4IT014 28.123009  3.247803                                TonB-dependent receptor
CCNA_00210               True                16           27.513853   envelope set4IT014 27.513853  3.327646                                TonB-dependent receptor
CCNA_02048               True                20           15.286668   envelope set4IT014 15.286668  1.971493                                TonB-dependent receptor
CCNA_03372               True                14           12.572875   envelope set4IT014 12.572875  2.808670                 bacterioferritin-associated ferredoxin
CCNA_03155               True                19           12.232948   envelope set3IT040 12.232948  1.734477                 PepSY-associated transmembrane protein
CCNA_03023               True                15            9.813239   envelope set3IT040  9.813239  1.260033                                TonB-dependent receptor
CCNA_00028               True                21            9.661992   envelope set3IT068  9.661992  2.014148                                TonB-dependent receptor
CCNA_03157               True                13            9.205196   envelope set3IT040  9.205196  1.576617                                   hypothetical protein
CCNA_02273               True                12            9.197246   envelope set4IT014  9.197246  1.912419                                 glutathione peroxidase
CCNA_03156               True                17            8.931569   envelope set3IT040  8.931569  2.041382                           putative periplasmic protein
CCNA_03158               True                14            8.718857   envelope set4IT014  8.718857  1.565527       iron-sulfur cluster assembly/repair protein ApbE
CCNA_03159               True                 7            7.859208   envelope set3IT040  7.859208  1.162932 sulfite reductase (NADPH) flavoprotein alpha-component
CCNA_01443               True                 3            7.851593   envelope set3IT059  7.851593  2.131700                                   hypothetical protein
CCNA_03022               True                 9            7.151345   envelope set3IT040  7.151345  1.304552                  PiuB-family transmembrane transporter
CCNA_00733               True                 8            6.363273   envelope set3IT040  6.363273  0.927269                               GumN superfamily protein
CCNA_02278               True                 3            6.341158   envelope set3IT040  6.341158  1.351597                                   hypothetical protein
CCNA_01090              False                 1            6.332621   envelope set4IT014  6.332621  0.975350                                   hypothetical protein
CCNA_01089              False                 1            4.322978   envelope set3IT006  4.322978  0.552917                                   hypothetical protein
CCNA_03997              False                 0            3.348868   envelope set4IT014  3.348868  0.744805                        amelogenin/CpxP-related protein
CCNA_02274              False                 0            3.146540   envelope set3IT006 -3.146540 -0.812167                                 EF-Hand domain protein
CCNA_02277              False                 0            2.448213   envelope set4IT033  2.448213  0.337980                  TonB-dependent outer membrane channel
CCNA_02452              False                 0            2.163892   envelope set3IT059 -2.163892 -0.799400                                   hypothetical protein
CCNA_02275              False                 0            1.964429   envelope set3IT003 -1.964429 -0.390799                 ABC transporter, periplasmic component
CCNA_03904              False                 0            1.405688   envelope set4IT033  1.405688  0.651674                                   hypothetical protein
CCNA_00748              False                 0                 NaN        NaN       NaN       NaN       NaN                       ferrous iron transport protein A
CCNA_02031              False                 0                 NaN        NaN       NaN       NaN       NaN                    NADH-quinone oxidoreductase chain C
CCNA_03641              False                 0                 NaN        NaN       NaN       NaN       NaN            succinate dehydrogenase iron-sulfur protein
CCNA_03642              False                 0                 NaN        NaN       NaN       NaN       NaN           succinate dehydrogenase flavoprotein subunit
CCNA_03644              False                 0                 NaN        NaN       NaN       NaN       NaN       succinate dehydrogenase cytochrome B-556 subunit
CCNA_R0088              False                 0                 NaN        NaN       NaN       NaN       NaN                          minimal medium expressed sRNA
CCNA_00055              False                 0                 NaN        NaN       NaN       NaN       NaN                       ferric uptake regulation protein

print('\n=== Path B (SspB-buffered, |Leaden logFC|>3, n=53) — phenotype scoring ===')
display(score_B.sort_values('max_abs_t_in_class', ascending=False).head(30)[
    ['ccna','phenotype_bearing','n_phenotype_exps','max_abs_t_in_class','best_class','best_exp','best_t','best_fit','product']
])

# Apply H2 threshold
def verdict(score_df, label):
    in_fb = score_df['in_fb'].sum()
    pb = score_df['phenotype_bearing'].sum()
    frac = pb / len(score_df) if len(score_df) else 0
    print(f'\n--- H2 verdict for {label} ---')
    print(f'Set size: {len(score_df)}')
    print(f'In FB:    {in_fb}')
    print(f'Phenotype-bearing (|t|>4 in ≥2 envelope/iron experiments): {pb} ({frac:.1%})')
    pass_thr = frac >= 0.10
    print(f'Threshold (≥10%): {"PASS — H2 supported" if pass_thr else "FAIL — H2 weakly supported / null"}')
    return pass_thr

A_pass = verdict(score_A, 'Path A — concordant_strong')
B_pass = verdict(score_B, 'Path B — SspB-buffered (cbb3/fix-rich)')

score_A.to_csv(DATA_OUT / 'NB02_pathA_concordant_strong_scoring.csv', index=False)
score_B.to_csv(DATA_OUT / 'NB02_pathB_buffered_scoring.csv', index=False)

=== Path B (SspB-buffered, |Leaden logFC|>3, n=53) — phenotype scoring ===

--- H2 verdict for Path A — concordant_strong ---
Set size: 32
In FB:    25
Phenotype-bearing (|t|>4 in ≥2 envelope/iron experiments): 17 (53.1%)
Threshold (≥10%): PASS — H2 supported

--- H2 verdict for Path B — SspB-buffered (cbb3/fix-rich) ---
Set size: 26
In FB:    26
Phenotype-bearing (|t|>4 in ≥2 envelope/iron experiments): 9 (34.6%)
Threshold (≥10%): PASS — H2 supported

# Background: all Caulo genes
all_ccnas = genes['sysName'].dropna().tolist()
print(f'Background gene set: {len(all_ccnas)} Caulobacter genes')

# Vectorized scoring
fit_class = fit[fit['cond_classes'].fillna('').str.contains('iron|envelope', regex=True, na=False)].copy()
fit_class['strong_t'] = (fit_class['t'].abs() > 4) & fit_class['sysName'].notna()
bg = (fit_class[fit_class['strong_t']]
      .groupby('sysName').size()
      .reindex(all_ccnas, fill_value=0))
bg_pass = (bg >= 2).sum()
bg_frac = bg_pass / len(bg)
print(f'\nBackground phenotype-bearing fraction: {bg_pass}/{len(bg)} ({bg_frac:.2%})')

# Hypergeometric / Fisher: is Path A enriched vs background?
from scipy.stats import hypergeom
def enrich_test(set_size, set_pb, bg_size, bg_pb, name):
    # P(X >= set_pb | hypergeom(N=bg_size, K=bg_pb, n=set_size))
    p = hypergeom.sf(set_pb - 1, bg_size, bg_pb, set_size) if set_pb > 0 else 1.0
    exp = bg_pb * set_size / bg_size
    fold = (set_pb / exp) if exp else float('nan')
    print(f'{name}:  observed {set_pb}/{set_size} pb; expected {exp:.2f}; fold {fold:.2f}x; hypergeom p = {p:.2e}')
    return p, fold

print('\n=== Enrichment vs Caulobacter genome background ===')
p_A, fold_A = enrich_test(len(score_A), int(score_A['phenotype_bearing'].sum()), len(bg), bg_pass, 'Path A')
p_B, fold_B = enrich_test(len(score_B), int(score_B['phenotype_bearing'].sum()), len(bg), bg_pass, 'Path B')

Background gene set: 3943 Caulobacter genes

Background phenotype-bearing fraction: 1311/3943 (33.25%)

=== Enrichment vs Caulobacter genome background ===
Path A:  observed 17/32 pb; expected 10.64; fold 1.60x; hypergeom p = 1.56e-02
Path B:  observed 9/26 pb; expected 8.64; fold 1.04x; hypergeom p = 5.15e-01

print('=== NB02 SUMMARY ===\n')
print('PREFLIGHT (RESEARCH_PLAN v2 gate):')
print(f'  envelope-stress experiments:  {n_env}  (need ≥3)')
print(f'  iron-limitation experiments:  {n_iron}  (need ≥3)')
print(f'  Verdict: {"PASS" if PREFLIGHT_PASS else "FAIL — H2 exploratory only"}')
print()
print('H2 PRE-REGISTERED THRESHOLD (≥10% Fur-released genes phenotype-bearing):')
print(f'  Path A (concordant_strong, n={len(score_A)}): {score_A["phenotype_bearing"].sum()} pb ({score_A["phenotype_bearing"].sum()/len(score_A):.1%}) — {"PASS" if A_pass else "FAIL"}')
print(f'  Path B (SspB-buffered, n={len(score_B)}):     {score_B["phenotype_bearing"].sum()} pb ({score_B["phenotype_bearing"].sum()/len(score_B):.1%}) — {"PASS" if B_pass else "FAIL"}')
print()
print('ENRICHMENT vs background:')
print(f'  Path A vs background: fold={fold_A:.2f}x, p={p_A:.2e}')
print(f'  Path B vs background: fold={fold_B:.2f}x, p={p_B:.2e}')
print()
print('NEXT — interpret what the strongly phenotype-bearing genes are, and which Path the data favors.')

=== NB02 SUMMARY ===

PREFLIGHT (RESEARCH_PLAN v2 gate):
  envelope-stress experiments:  22  (need ≥3)
  iron-limitation experiments:  0  (need ≥3)
  Verdict: FAIL — H2 exploratory only

H2 PRE-REGISTERED THRESHOLD (≥10% Fur-released genes phenotype-bearing):
  Path A (concordant_strong, n=32): 17 pb (53.1%) — PASS
  Path B (SspB-buffered, n=26):     9 pb (34.6%) — PASS

ENRICHMENT vs background:
  Path A vs background: fold=1.60x, p=1.56e-02
  Path B vs background: fold=1.04x, p=5.15e-01

NEXT — interpret what the strongly phenotype-bearing genes are, and which Path the data favors.

02 Caulo Fitness Ranking

02 — Caulobacter RB-TnSeq fitness ranking of Fur-released genes¶

Purpose¶

Pre-registered H2 threshold (RESEARCH_PLAN v2)¶

Preflight gate¶

1. PREFLIGHT — experiment metadata and condition coverage¶

2. Pull Caulobacter gene table → CCNA locus tag map¶

3. Pull per-experiment fitness data (with CAST per pitfalls.md)¶

4. Build the two Fur-released gene sets to rank¶

Path A — concordant_strong (clean Fur signature, 32 genes)¶

Path B — SspB-buffered set (cbb3 / fix operon, 11 genes)¶

5. Phenotype scoring — apply pre-registered threshold¶

6. Compare to background — is the Fur-released set actually enriched for phenotype-bearing genes?¶

7. Save and summarize¶

	locusId	sysName	gene	desc	scaffoldId	begin	end	strand	type
0	CCNA_00001	CCNA_00001	NaN	ATP/GTP-binding protein	NC_011916	202	1107	+	1
1	CCNA_00002	CCNA_00002	NaN	septum formation protein Maf	NC_011916	1104	1703	+	1
2	CCNA_00003	CCNA_00003	NaN	shikimate 5-dehydrogenase	NC_011916	1700	2557	+	1
3	CCNA_00004	CCNA_00004	NaN	dephospho-CoA kinase	NC_011916	2559	3158	+	1
4	CCNA_00005	CCNA_00005	NaN	DNA polymerase III, epsilon chain	NC_011916	3190	3903	+	1

	ccna	phenotype_bearing	n_phenotype_exps	max_abs_t_in_class	best_class	best_exp	best_t	best_fit	product
23	CCNA_01851	True	22	28.176634	envelope	set4IT014	28.176634	3.415614	quinol cytochrome oxidase polypeptide II
24	CCNA_03689	True	22	24.254883	envelope	set4IT014	24.254883	3.333089	alanine dehydrogenase
22	CCNA_01850	True	22	20.152315	envelope	set4IT014	20.152315	2.715550	quinol cytochrome oxidase polypeptide I
21	CCNA_01847	True	19	10.540280	envelope	set3IT040	10.540280	1.662412	cytochrome c oxidase assembly protein Surf1
15	CCNA_00800	True	11	7.359159	envelope	set3IT065	-7.359159	-2.013672	cytochrome bd-type quinol oxidase, subunit 1 cydA
1	CCNA_00799	True	11	6.632003	envelope	set3IT067	-6.632003	-1.980280	ABC transporter, ATP-binding protein cydD
0	CCNA_00798	True	10	6.467226	envelope	set3IT001	-6.467226	-1.339248	ABC transporter, ATP-binding protein cydC
17	CCNA_01467	True	8	6.289661	envelope	set3IT069	-6.289661	-1.116903	cytochrome cbb3 oxidase subunit I ccoN
2	CCNA_01471	True	9	6.231087	envelope	set3IT074	-6.231087	-1.225935	polyferredoxin protein fixG
16	CCNA_00801	False	1	4.014697	envelope	set3IT003	-4.014697	-1.886460	cytochrome bd-type quinol oxidase, subunit 2 cydB
20	CCNA_00790	False	0	3.878705	envelope	set4IT014	3.878705	0.839118	hypoxia negative feedback regulator FixT
11	CCNA_02603	False	0	3.742693	envelope	set3IT077	-3.742693	-0.609956	hypothetical protein
13	CCNA_00854	False	0	3.492546	envelope	set3IT066	-3.492546	-0.557726	metallo-beta-lactamase protein
25	CCNA_00802	False	0	3.385971	envelope	set4IT014	3.385971	1.091261	cyd operon protein YbgT
4	CCNA_01473	False	0	3.332413	envelope	set3IT068	-3.332413	-0.486061	E1-E2 cation pump ATPase fixI
3	CCNA_01472	False	0	3.267374	envelope	set3IT069	-3.267374	-1.080078	cation pump-linked membrane protein fixH
18	CCNA_01468	False	0	3.158115	envelope	set3IT003	-3.158115	-0.879008	cytochrome cbb3 oxidase, cytochrome c subunit ccoO
19	CCNA_01470	False	0	2.728018	envelope	set3IT065	-2.728018	-0.678979	cytochrome cbb3 oxidase, diheme subunit, membrane-bound ccoP
12	CCNA_00279	False	0	2.705050	envelope	set3IT003	-2.705050	-0.496669	NAD(P)H-quinone reductase
8	CCNA_02200	False	0	2.558396	envelope	set4IT014	-2.558396	-1.010195	cytochrome c-family protein
5	CCNA_01475	False	0	2.318072	envelope	set3IT068	-2.318072	-0.604996	OmpW family outer membrane cation channel
14	CCNA_02604	False	0	2.276428	envelope	set3IT040	2.276428	0.655144	host cell attachment protein
9	CCNA_01466	False	0	2.259044	envelope	set3IT075	2.259044	0.963074	hypothetical protein
6	CCNA_01476	False	0	2.102777	envelope	set3IT065	-2.102777	-0.485328	CRP-family transcription regulator FtrB
10	CCNA_01520	False	0	2.100928	envelope	set3IT001	-2.100928	-1.201844	hypothetical protein
7	CCNA_01477	False	0	1.761695	envelope	set3IT067	-1.761695	-0.320099	oxygen-independent coproporphyrinogen-III oxidase hemN