import json, time
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

PROJECT_ROOT = Path("/home/aparkin/BERIL-research-observatory/projects/gene_function_ecological_agora")
DATA_DIR = PROJECT_ROOT / "data"
FIG_DIR = PROJECT_ROOT / "figures"

RANKS = ["genus", "family", "order", "class", "phylum"]
PARENT_RANK = "phylum"  # consumer null is parent-phylum dispersion; phylum rank itself has no consumer score

diagnostics = {
    "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "ranks": RANKS,
    "parent_rank": PARENT_RANK,
}

species_df = pd.read_csv(DATA_DIR / "p1a_pilot_species.tsv", sep="\t")
uniref_df = pd.read_csv(DATA_DIR / "p1a_pilot_uniref50.tsv", sep="\t")
extract_df = pd.read_parquet(DATA_DIR / "p1a_pilot_extract.parquet")
producer_lookup = pd.read_parquet(DATA_DIR / "p1a_null_producer_lookup.parquet")
consumer_lookup = pd.read_parquet(DATA_DIR / "p1a_null_consumer_lookup.parquet")
prev_bin_lookup = pd.read_csv(DATA_DIR / "p1a_uniref_prevalence_bin.tsv", sep="\t")

# Apply paralog fallback (option a)
extract_df["paralog_count"] = np.where(
    extract_df["n_uniref90_present"] >= 1,
    extract_df["n_uniref90_present"],
    extract_df["n_gene_clusters"],
).astype(int)

print(f"Species:           {species_df.shape}")
print(f"UniRef:            {uniref_df.shape}")
print(f"Extract:           {extract_df.shape}")
print(f"Producer lookup:   {producer_lookup.shape}  ranks: {producer_lookup['rank'].unique().tolist()}")
print(f"Consumer lookup:   {consumer_lookup.shape}  ranks: {consumer_lookup['rank'].unique().tolist()}")
print(f"Prev-bin lookup:   {prev_bin_lookup.shape}  ranks: {prev_bin_lookup['rank'].unique().tolist()}")

Species:           (1000, 22)
UniRef:            (1200, 15)
Extract:           (6638, 6)
Producer lookup:   (876, 9)  ranks: ['genus', 'family', 'order', 'class', 'phylum']
Consumer lookup:   (4800, 7)  ranks: ['genus', 'family', 'order', 'class']
Prev-bin lookup:   (6000, 4)  ranks: ['genus', 'family', 'order', 'class', 'phylum']

# Build species → clade label per rank (matching NB02's logic)
species_id_to_idx = {s: i for i, s in enumerate(species_df["gtdb_species_clade_id"])}
uniref_id_to_idx = {u: i for i, u in enumerate(uniref_df["uniref50_id"])}
N_SPECIES = len(species_id_to_idx)
N_UREFS = len(uniref_id_to_idx)

# Species × UniRef paralog matrix
M_par = np.zeros((N_SPECIES, N_UREFS), dtype=np.int32)
M_pres = np.zeros((N_SPECIES, N_UREFS), dtype=np.int8)
for _, row in extract_df.iterrows():
    s_id = row["gtdb_species_clade_id"]; u_id = row["uniref50_id"]
    if s_id in species_id_to_idx and u_id in uniref_id_to_idx:
        i = species_id_to_idx[s_id]; j = uniref_id_to_idx[u_id]
        M_par[i, j] = row["paralog_count"]
        M_pres[i, j] = 1

rank_clade_data = {}
for rank in RANKS:
    labels = species_df.set_index("gtdb_species_clade_id").reindex(
        species_df["gtdb_species_clade_id"]
    )[rank].fillna("unknown").tolist()
    label_arr = np.array(labels)
    unique_clades = sorted(set(label_arr) - {"unknown"})
    K = len(unique_clades)
    clade_to_idx = {c: i for i, c in enumerate(unique_clades)}

    M_clade_par = np.zeros((K, N_UREFS), dtype=np.int32)
    M_clade_pres = np.zeros((K, N_UREFS), dtype=np.int8)
    for sp_i in range(N_SPECIES):
        clade = label_arr[sp_i]
        if clade == "unknown":
            continue
        ki = clade_to_idx[clade]
        np.maximum(M_clade_par[ki, :], M_par[sp_i, :], out=M_clade_par[ki, :])
        M_clade_pres[ki, :] |= M_pres[sp_i, :]

    rank_clade_data[rank] = {
        "clade_ids": unique_clades,
        "clade_to_idx": clade_to_idx,
        "M_par": M_clade_par,
        "M_pres": M_clade_pres,
    }
    print(f"  {rank:8s}: {K:5d} clades  {int(M_clade_pres.sum()):6d} presences")

  genus   :   770 clades    5148 presences
  family  :   457 clades    3722 presences
  order   :   287 clades    2970 presences
  class   :   164 clades    2247 presences
  phylum  :   110 clades    2100 presences

score_rows = []

for rank in RANKS:
    rdata = rank_clade_data[rank]
    K = len(rdata["clade_ids"])
    if K == 0:
        continue
    M_par_r = rdata["M_par"]
    M_pres_r = rdata["M_pres"]
    clade_to_idx = rdata["clade_to_idx"]

    # Build (clade, prev_bin) → (cohort_mean, cohort_std) lookup for this rank
    rank_prod = producer_lookup[producer_lookup["rank"] == rank].copy()
    prod_lookup_dict = {
        (row["clade_id"], int(row["prevalence_bin"])): (row["cohort_mean_paralog"], row["cohort_std_paralog"], int(row["cohort_size"]))
        for _, row in rank_prod.iterrows()
    }

    # UniRef → prevalence_bin at this rank
    rank_pb = prev_bin_lookup[prev_bin_lookup["rank"] == rank].set_index("uniref50_id")["prevalence_bin"].to_dict()

    # Compute producer_z per (clade, uniref) where present and lookup exists
    for c_i, clade_id in enumerate(rdata["clade_ids"]):
        present_urefs = np.where(M_pres_r[c_i, :] == 1)[0]
        for u_j in present_urefs:
            uref_id = uniref_df["uniref50_id"].iloc[u_j]
            pb = rank_pb.get(uref_id)
            if pb is None:
                continue
            key = (clade_id, int(pb))
            if key not in prod_lookup_dict:
                continue
            mean_, std_, csize_ = prod_lookup_dict[key]
            paralog = int(M_par_r[c_i, u_j])
            producer_z = (paralog - mean_) / std_ if std_ > 0 else 0.0
            score_rows.append({
                "rank": rank,
                "clade_id": clade_id,
                "uniref50_id": uref_id,
                "paralog_count": paralog,
                "prevalence_bin": int(pb),
                "cohort_mean_paralog": float(mean_),
                "cohort_std_paralog": float(std_),
                "cohort_size": csize_,
                "producer_z": float(producer_z),
            })

scores_df = pd.DataFrame(score_rows)
print(f"Producer scores computed: {len(scores_df):,} (clade, UniRef) rows")
diagnostics["n_producer_scores"] = int(len(scores_df))
print(f"\nProducer z distribution per rank:")
print(scores_df.groupby("rank")["producer_z"].describe())

Producer scores computed: 9,201 (clade, UniRef) rows

Producer z distribution per rank:
         count          mean       std       min       25%       50%  75%  \
rank                                                                        
class   1705.0  2.708814e-17  0.829687 -0.724846 -0.418453 -0.316228  0.0   
family  1851.0  1.919348e-18  0.761222 -1.632993 -0.428174  0.000000  0.0   
genus   2061.0 -1.400572e-18  0.675508 -1.095445 -0.387174  0.000000  0.0   
order   1822.0 -1.316181e-17  0.811098 -0.866667 -0.408248 -0.250000  0.0   
phylum  1762.0  1.310592e-17  0.828235 -0.775959 -0.404536 -0.296646  0.0   

             max  
rank              
class   6.144803  
family  4.547640  
genus   3.750000  
order   5.361904  
phylum  6.450474

# Join consumer z (pre-computed in NB02) onto the producer-scored rows.
# Consumer is per-(rank, UniRef); broadcast over clades within rank.
consumer_compact = consumer_lookup[["rank", "uniref50_id", "n_clades_with", "obs_parent_dispersion", "consumer_z"]]
scores_df = scores_df.merge(consumer_compact, on=["rank", "uniref50_id"], how="left")

# Join control_class
scores_df = scores_df.merge(
    uniref_df[["uniref50_id", "control_class", "dominant_cog_category", "dominant_ipr_desc"]],
    on="uniref50_id", how="left",
)

# Filter consumer to informative subset (n_clades_with ≥ 3) for headline statistics
scores_df["consumer_z_informative"] = np.where(
    scores_df["n_clades_with"].fillna(0) >= 3, scores_df["consumer_z"], np.nan
)

print(f"Final scores table: {scores_df.shape}")
print(f"\nProducer scores per rank × class (count):")
print(scores_df.groupby(["rank", "control_class"]).size().unstack(fill_value=0))

Final scores table: (9201, 16)

Producer scores per rank × class (count):
control_class  natural_expansion  neg_ribosomal  neg_rnap_core  \
rank                                                             
class                        482            285            259   
family                       802            237            180   
genus                        975            276            237   
order                        654            258            211   
phylum                       457            315            284   

control_class  neg_trna_synth  pos_amr  pos_tcs_hk  
rank                                                
class                     222      283         174  
family                    192      316         124  
genus                     199      301          73  
order                     208      334         157  
phylum                    248      272         186

from scipy import stats

def ci95(values):
    """Return (mean, ci_low, ci_high, n) for a 95% CI of the mean. Returns NaNs if n<3."""
    arr = np.asarray(values, dtype=float)
    arr = arr[~np.isnan(arr)]
    n = len(arr)
    if n < 3:
        return (np.nan, np.nan, np.nan, n)
    mean = arr.mean()
    sem = arr.std(ddof=1) / np.sqrt(n)
    ci = sem * stats.t.ppf(0.975, n - 1)
    return (mean, mean - ci, mean + ci, n)

validation_rows = []
for (rank, cls), sub in scores_df.groupby(["rank", "control_class"]):
    pmean, pl, ph, pn = ci95(sub["producer_z"].values)
    cmean, cl, ch, cn = ci95(sub["consumer_z_informative"].values)
    validation_rows.append({
        "rank": rank, "control_class": cls,
        "n_producer": pn, "producer_mean": pmean, "producer_ci_low": pl, "producer_ci_high": ph,
        "n_consumer": cn, "consumer_mean": cmean, "consumer_ci_low": cl, "consumer_ci_high": ch,
    })
validation_df = pd.DataFrame(validation_rows)

# Apply pass/fail criteria
def negative_control_pass(row):
    if row["n_producer"] < 3 or row["n_consumer"] < 3:
        return "insufficient_n"
    producer_zero_in_ci = (row["producer_ci_low"] <= 0) and (row["producer_ci_high"] >= 0)
    consumer_clumped = row["consumer_ci_high"] <= 0
    return "PASS" if (producer_zero_in_ci and consumer_clumped) else "FAIL"

def positive_amr_pass(row):
    if row["n_consumer"] < 3:
        return "insufficient_n"
    return "PASS" if row["consumer_ci_low"] > 0 else "FAIL_OR_VERTICAL"

def positive_tcs_pass(row):
    if row["n_producer"] < 3:
        return "insufficient_n"
    return "PASS" if row["producer_ci_low"] > 0 else "FAIL"

def natural_expansion_check(row):
    if row["n_producer"] < 3:
        return "insufficient_n"
    return "PASS" if row["producer_ci_low"] > 0 else "WEAK"

verdict_rows = []
for _, row in validation_df.iterrows():
    cls = row["control_class"]
    if cls in ("neg_ribosomal", "neg_trna_synth", "neg_rnap_core"):
        v = negative_control_pass(row)
    elif cls == "pos_amr":
        v = positive_amr_pass(row)
    elif cls == "pos_tcs_hk":
        v = positive_tcs_pass(row)
    elif cls == "natural_expansion":
        v = natural_expansion_check(row)
    else:
        v = "n/a"
    verdict_rows.append(v)
validation_df["verdict"] = verdict_rows

print("Control validation per rank:")
print(validation_df.to_string(index=False, float_format="%.3f"))
diagnostics["control_validation"] = validation_df.to_dict(orient="records")

Control validation per rank:
  rank     control_class  n_producer  producer_mean  producer_ci_low  producer_ci_high  n_consumer  consumer_mean  consumer_ci_low  consumer_ci_high          verdict
 class natural_expansion         482          0.498            0.384             0.612         318         -1.566           -1.815            -1.316             PASS
 class     neg_ribosomal         285         -0.159           -0.202            -0.116         138         -0.034           -0.195             0.128             FAIL
 class     neg_rnap_core         259         -0.229           -0.256            -0.201         132         -1.076           -1.396            -0.757             FAIL
 class    neg_trna_synth         222         -0.213           -0.259            -0.166          67         -0.996           -1.403            -0.589             FAIL
 class           pos_amr         283         -0.170           -0.243            -0.098          87         -1.668           -2.154            -1.181 FAIL_OR_VERTICAL
 class        pos_tcs_hk         174         -0.231           -0.303            -0.158           3          0.273            0.273             0.273             FAIL
family natural_expansion         802          0.194            0.128             0.261         732         -4.852           -5.008            -4.696             PASS
family     neg_ribosomal         237         -0.197           -0.235            -0.159         180         -2.437           -2.896            -1.978             FAIL
family     neg_rnap_core         180         -0.191           -0.239            -0.143         128         -4.773           -5.383            -4.162             FAIL
family    neg_trna_synth         192         -0.175           -0.228            -0.123         121         -5.764           -6.195            -5.333             FAIL
family           pos_amr         316         -0.067           -0.151             0.016         185         -4.399           -4.670            -4.129 FAIL_OR_VERTICAL
family        pos_tcs_hk         124         -0.160           -0.256            -0.065          19         -5.967           -6.631            -5.302             FAIL
 genus natural_expansion         975          0.130            0.078             0.181         941         -5.069           -5.206            -4.932             PASS
 genus     neg_ribosomal         276         -0.149           -0.179            -0.119         256         -3.670           -4.057            -3.284             FAIL
 genus     neg_rnap_core         237         -0.157           -0.191            -0.123         209         -5.975           -6.452            -5.498             FAIL
 genus    neg_trna_synth         199         -0.153           -0.204            -0.101         165         -5.756           -6.066            -5.446             FAIL
 genus           pos_amr         301         -0.035           -0.120             0.050         228         -4.689           -4.921            -4.458 FAIL_OR_VERTICAL
 genus        pos_tcs_hk          73         -0.098           -0.234             0.038          23         -5.314           -5.955            -4.672             FAIL
 order natural_expansion         654          0.307            0.221             0.393         537         -4.529           -4.749            -4.308             PASS
 order     neg_ribosomal         258         -0.194           -0.226            -0.162         153         -1.591           -2.014            -1.167             FAIL
 order     neg_rnap_core         211         -0.196           -0.241            -0.150         108         -3.478           -4.109            -2.847             FAIL
 order    neg_trna_synth         208         -0.199           -0.249            -0.149          98         -4.861           -5.528            -4.194             FAIL
 order           pos_amr         334         -0.114           -0.192            -0.037         167         -4.788           -5.156            -4.420 FAIL_OR_VERTICAL
 order        pos_tcs_hk         157         -0.191           -0.272            -0.110           7         -5.622           -7.094            -4.150             FAIL
phylum natural_expansion         457          0.546            0.426             0.665           0            NaN              NaN               NaN             PASS
phylum     neg_ribosomal         315         -0.151           -0.192            -0.109           0            NaN              NaN               NaN   insufficient_n
phylum     neg_rnap_core         284         -0.238           -0.264            -0.212           0            NaN              NaN               NaN   insufficient_n
phylum    neg_trna_synth         248         -0.192           -0.239            -0.146           0            NaN              NaN               NaN   insufficient_n
phylum           pos_amr         272         -0.160           -0.239            -0.082           0            NaN              NaN               NaN   insufficient_n
phylum        pos_tcs_hk         186         -0.231           -0.299            -0.163           0            NaN              NaN               NaN             FAIL

tcs_scores = scores_df[scores_df["control_class"] == "pos_tcs_hk"].copy()

alm_rows = []
for rank in RANKS:
    sub = tcs_scores[tcs_scores["rank"] == rank]
    n_total = len(sub)
    if n_total == 0:
        continue
    n_pos_z = int((sub["producer_z"] > 0).sum())
    n_pos_z_2sig = int((sub["producer_z"] > 2).sum())
    pmean, pl, ph, _ = ci95(sub["producer_z"].values)
    # Top-5 most-expanded TCS HKs at this rank
    top5 = sub.nlargest(5, "producer_z")[["clade_id", "uniref50_id", "paralog_count", "cohort_mean_paralog", "producer_z", "dominant_ipr_desc"]]
    alm_rows.append({
        "rank": rank, "n_tcs_scored": n_total,
        "n_producer_z_positive": n_pos_z,
        "n_producer_z_above_2sigma": n_pos_z_2sig,
        "producer_z_mean": pmean, "producer_z_ci_low": pl, "producer_z_ci_high": ph,
    })
    print(f"\n  --- {rank} (n={n_total}) ---")
    print(f"  Producer z mean = {pmean:.2f} [95% CI: {pl:.2f}, {ph:.2f}]")
    print(f"  Positive z: {n_pos_z}/{n_total} ({100*n_pos_z/n_total:.1f}%)")
    print(f"  Above 2σ:   {n_pos_z_2sig}/{n_total} ({100*n_pos_z_2sig/n_total:.1f}%)")
    print(f"  Top 5 expanded TCS HKs:")
    print(top5.to_string(index=False))

alm_df = pd.DataFrame(alm_rows)
diagnostics["alm_2006_pilot_backtest"] = alm_df.to_dict(orient="records")
alm_df.to_csv(DATA_DIR / "p1a_alm_2006_pilot_backtest.tsv", sep="\t", index=False)
print(f"\nWrote p1a_alm_2006_pilot_backtest.tsv")

  --- genus (n=73) ---
  Producer z mean = -0.10 [95% CI: -0.23, 0.04]
  Positive z: 8/73 (11.0%)
  Above 2σ:   2/73 (2.7%)
  Top 5 expanded TCS HKs:
        clade_id         uniref50_id  paralog_count  cohort_mean_paralog  producer_z                                                         dominant_ipr_desc
 g__Burkholderia     UniRef50_U3GC98              2             1.142857    2.267787                                  Transcription regulator LuxR, C-terminal
 g__Butyrivibrio UniRef50_A0A1M5XY67              3             1.333333    2.041241                   Signal transduction response regulator, receiver domain
  g__Paucibacter UniRef50_A0A8B4S8C5              2             1.200000    1.788854                                         OmpR/PhoB-type DNA-binding domain
  g__Sphingobium UniRef50_A0A087N6H5              2             1.222222    1.763834                                                                PAS domain
g__Pseudomonas_E UniRef50_A0A3M3WB47              2             1.428571    0.723189 Signal transduction histidine kinase, dimerisation/phosphoacceptor domain

  --- family (n=124) ---
  Producer z mean = -0.16 [95% CI: -0.26, -0.06]
  Positive z: 8/124 (6.5%)
  Above 2σ:   2/124 (1.6%)
  Top 5 expanded TCS HKs:
             clade_id         uniref50_id  paralog_count  cohort_mean_paralog  producer_z                                                         dominant_ipr_desc
   f__Lachnospiraceae UniRef50_A0A1M5XY67              3             1.133333    3.614784                   Signal transduction response regulator, receiver domain
f__Burkholderiaceae_B UniRef50_A0A8B4S8C5              2             1.090909    3.015113                                         OmpR/PhoB-type DNA-binding domain
  f__Burkholderiaceae     UniRef50_U3GC98              2             1.294118    1.200751                                  Transcription regulator LuxR, C-terminal
 f__Sphingomonadaceae UniRef50_A0A087N6H5              2             1.368421    0.923381                                                                PAS domain
  f__Pseudomonadaceae UniRef50_A0A3M3WB47              2             1.552632    0.470898 Signal transduction histidine kinase, dimerisation/phosphoacceptor domain

  --- order (n=157) ---
  Producer z mean = -0.19 [95% CI: -0.27, -0.11]
  Positive z: 7/157 (4.5%)
  Above 2σ:   3/157 (1.9%)
  Top 5 expanded TCS HKs:
            clade_id         uniref50_id  paralog_count  cohort_mean_paralog  producer_z                                       dominant_ipr_desc
   o__Lachnospirales UniRef50_A0A1M5XY67              3             1.095238    4.364358 Signal transduction response regulator, receiver domain
  o__Burkholderiales UniRef50_A0A8B4S8C5              2             1.127660    2.199755                       OmpR/PhoB-type DNA-binding domain
  o__Burkholderiales     UniRef50_U3GC98              2             1.127660    2.199755                Transcription regulator LuxR, C-terminal
 o__Sphingomonadales UniRef50_A0A087N6H5              2             1.380952    0.925273                                              PAS domain
o__Campylobacterales UniRef50_A0A4U7BFB2              2             1.541667    0.550239 Signal transduction response regulator, receiver domain

  --- class (n=174) ---
  Producer z mean = -0.23 [95% CI: -0.30, -0.16]
  Positive z: 9/174 (5.2%)
  Above 2σ:   2/174 (1.1%)
  Top 5 expanded TCS HKs:
              clade_id         uniref50_id  paralog_count  cohort_mean_paralog  producer_z                                                         dominant_ipr_desc
         c__Clostridia UniRef50_A0A0L9Y983              4             1.217391    4.356725          Signal transduction histidine kinase-related protein, C-terminal
         c__Clostridia UniRef50_A0A1M5XY67              3             1.217391    2.791027                   Signal transduction response regulator, receiver domain
c__Gammaproteobacteria UniRef50_A0A3M3WB47              2             1.418033    0.652405 Signal transduction histidine kinase, dimerisation/phosphoacceptor domain
c__Gammaproteobacteria UniRef50_A0A4Q1QDC6              2             1.418033    0.652405                   Signal transduction response regulator, receiver domain
c__Gammaproteobacteria UniRef50_A0A8B4S8C5              2             1.418033    0.652405                                         OmpR/PhoB-type DNA-binding domain

  --- phylum (n=186) ---
  Producer z mean = -0.23 [95% CI: -0.30, -0.16]
  Positive z: 9/186 (4.8%)
  Above 2σ:   2/186 (1.1%)
  Top 5 expanded TCS HKs:
           clade_id         uniref50_id  paralog_count  cohort_mean_paralog  producer_z                                                         dominant_ipr_desc
     p__Bacillota_A UniRef50_A0A0L9Y983              4             1.217391    4.356725          Signal transduction histidine kinase-related protein, C-terminal
     p__Bacillota_A UniRef50_A0A1M5XY67              3             1.217391    2.791027                   Signal transduction response regulator, receiver domain
p__Campylobacterota UniRef50_A0A4U7BFB2              2             1.500000    0.615457                   Signal transduction response regulator, receiver domain
  p__Pseudomonadota UniRef50_A0A3M3WB47              2             1.465318    0.565810 Signal transduction histidine kinase, dimerisation/phosphoacceptor domain
  p__Pseudomonadota UniRef50_A0A087N6H5              2             1.465318    0.565810                                                                PAS domain

Wrote p1a_alm_2006_pilot_backtest.tsv

# Violin plots of producer and consumer z by control class per rank
control_order = ["neg_ribosomal", "neg_trna_synth", "neg_rnap_core", "pos_amr", "pos_tcs_hk", "natural_expansion"]
control_colors = {
    "neg_ribosomal": "#1f77b4", "neg_trna_synth": "#aec7e8", "neg_rnap_core": "#9ecae1",
    "pos_amr": "#ff7f0e", "pos_tcs_hk": "#d62728", "natural_expansion": "#2ca02c",
}

fig, axes = plt.subplots(2, len(RANKS), figsize=(4 * len(RANKS), 8), sharey="row")
for col, rank in enumerate(RANKS):
    sub = scores_df[scores_df["rank"] == rank]

    # Top: producer z
    ax = axes[0, col]
    data = [sub[sub["control_class"] == c]["producer_z"].dropna().values for c in control_order]
    pos = list(range(len(control_order)))
    parts = ax.violinplot(data, positions=pos, showmeans=True, showmedians=True, widths=0.7)
    for pc, c in zip(parts["bodies"], control_order):
        pc.set_facecolor(control_colors[c]); pc.set_alpha(0.7)
    ax.axhline(0, color="black", lw=0.6)
    ax.axhline(2, color="red", ls="--", lw=0.6); ax.axhline(-2, color="red", ls="--", lw=0.6)
    ax.set_xticks(pos); ax.set_xticklabels(control_order, rotation=45, ha="right", fontsize=7)
    ax.set_title(f"{rank} — producer z", fontsize=10)
    if col == 0:
        ax.set_ylabel("producer z (paralog-expansion signal)")

    # Bottom: consumer z (informative)
    ax = axes[1, col]
    data = [sub[sub["control_class"] == c]["consumer_z_informative"].dropna().values for c in control_order]
    if any(len(d) > 0 for d in data):
        parts = ax.violinplot([d if len(d) > 0 else [0] for d in data], positions=pos,
                              showmeans=True, showmedians=True, widths=0.7)
        for pc, c in zip(parts["bodies"], control_order):
            pc.set_facecolor(control_colors[c]); pc.set_alpha(0.7)
    ax.axhline(0, color="black", lw=0.6)
    ax.axhline(2, color="red", ls="--", lw=0.6); ax.axhline(-2, color="red", ls="--", lw=0.6)
    ax.set_xticks(pos); ax.set_xticklabels(control_order, rotation=45, ha="right", fontsize=7)
    ax.set_title(f"{rank} — consumer z (informative)", fontsize=10)
    if col == 0:
        ax.set_ylabel("consumer z (HGT acquisition signal)")

plt.tight_layout()
plt.savefig(FIG_DIR / "p1a_scores_by_class_per_rank.png", dpi=150, bbox_inches="tight")
plt.show()
print("Saved figures/p1a_scores_by_class_per_rank.png")

Saved figures/p1a_scores_by_class_per_rank.png

# Producer × Consumer scatter per rank, colored by class — the deep-rank "category-cross" view
fig, axes = plt.subplots(1, len(RANKS), figsize=(4 * len(RANKS), 4.5), sharex=True, sharey=True)
for col, rank in enumerate(RANKS):
    ax = axes[col]
    sub = scores_df[scores_df["rank"] == rank]
    for cls in control_order:
        cls_sub = sub[sub["control_class"] == cls]
        cls_sub = cls_sub.dropna(subset=["producer_z", "consumer_z_informative"])
        if len(cls_sub) == 0:
            continue
        ax.scatter(cls_sub["producer_z"], cls_sub["consumer_z_informative"],
                   s=12, alpha=0.5, color=control_colors[cls], label=f"{cls} (n={len(cls_sub)})")
    ax.axhline(0, color="black", lw=0.6); ax.axvline(0, color="black", lw=0.6)
    ax.set_title(f"{rank}", fontsize=11)
    ax.set_xlabel("producer z")
    if col == 0:
        ax.set_ylabel("consumer z (informative)")
    if col == len(RANKS) - 1:
        ax.legend(fontsize=7, loc="upper right")
plt.tight_layout()
plt.savefig(FIG_DIR / "p1a_producer_consumer_per_rank.png", dpi=150, bbox_inches="tight")
plt.show()
print("Saved figures/p1a_producer_consumer_per_rank.png")

/tmp/ipykernel_75783/257848299.py:19: UserWarning: No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
  ax.legend(fontsize=7, loc="upper right")

Saved figures/p1a_producer_consumer_per_rank.png

# Strip Spark Connect attrs (per docs/pitfalls.md learning from NB01)
scores_clean = pd.DataFrame({col: scores_df[col].to_numpy() for col in scores_df.columns})
scores_clean.to_parquet(DATA_DIR / "p1a_pilot_scores.parquet", index=False)
print(f"Wrote p1a_pilot_scores.parquet: {len(scores_clean):,} rows")

validation_df.to_csv(DATA_DIR / "p1a_control_validation.tsv", sep="\t", index=False)
print(f"Wrote p1a_control_validation.tsv: {len(validation_df):,} rows")

diagnostics["completed_utc"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
with open(DATA_DIR / "p1a_pilot_atlas_diagnostics.json", "w") as f:
    json.dump(diagnostics, f, indent=2, default=str)
print(f"Wrote p1a_pilot_atlas_diagnostics.json")

Wrote p1a_pilot_scores.parquet: 9,201 rows
Wrote p1a_control_validation.tsv: 30 rows
Wrote p1a_pilot_atlas_diagnostics.json

03 P1A Pilot Atlas

NB03 — Phase 1A Pilot Atlas + Control Validation¶

Inputs¶

Outputs¶

Methodology¶

Producer z-score per (clade, UniRef)¶

Consumer z-score¶

Control validation criteria¶

Setup¶

Stage 1 — Load pilot data + null lookups¶

Stage 2 — Aggregate paralog matrices to each rank¶

Stage 3 — Compute producer z-scores per (rank, clade, UniRef)¶

Stage 4 — Join with consumer scores + control class¶

Stage 5 — Control validation¶

Stage 6 — Alm 2006 TCS HK paralog-expansion reproduction at pilot scale¶

Stage 7 — Visualize per-rank score distributions by control class¶

Stage 8 — Materialize¶

Summary¶