import json, time
from pathlib import Path
import numpy as np
import pandas as pd
from scipy import stats

PROJECT_ROOT = Path("/home/aparkin/BERIL-research-observatory/projects/gene_function_ecological_agora")
DATA_DIR = PROJECT_ROOT / "data"

scores = pd.read_parquet(DATA_DIR / "p1b_full_scores.parquet")
diagnostics = {
    "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "n_scores": int(len(scores)),
    "purpose": "audit-trail capture of the Mann-Whitney discrimination diagnostic that revised REPORT.md Phase 1B narrative",
}
print(f"Scores loaded: {scores.shape}")

Scores loaded: (1294615, 15)

rows = []
for rank in ["genus", "family", "order", "class"]:
    for cls in ["neg_ribosomal", "neg_trna_synth", "neg_rnap_core",
                "pos_amr", "pos_tcs_hk", "pos_betalac", "pos_crispr_cas",
                "hyp_cazyme", "natural_expansion", "none"]:
        sub = scores[(scores["rank"] == rank)
                     & (scores["control_class"] == cls)
                     & (~scores["consumer_z_informative"].isna())]
        if len(sub) < 3:
            continue
        z = sub["consumer_z_informative"].values
        rows.append({
            "rank": rank, "control_class": cls, "n": len(z),
            "mean": round(float(z.mean()), 3),
            "median": round(float(np.median(z)), 3),
            "q25": round(float(np.percentile(z, 25)), 3),
            "q75": round(float(np.percentile(z, 75)), 3),
        })
summary = pd.DataFrame(rows)
summary.to_csv(DATA_DIR / "p1b_per_class_consumer_z_summary.tsv", sep="\t", index=False)
print(f"Wrote p1b_per_class_consumer_z_summary.tsv ({len(summary)} rows)")
print()
print(summary.to_string(index=False))

Wrote p1b_per_class_consumer_z_summary.tsv (40 rows)

  rank     control_class      n    mean  median     q25     q75
 genus     neg_ribosomal  76854 -13.636 -13.483 -18.614  -9.141
 genus    neg_trna_synth  58968 -12.506 -12.838 -17.315  -7.944
 genus     neg_rnap_core  26560 -12.525 -12.584 -17.310  -7.678
 genus           pos_amr  46919 -11.901 -12.216 -15.281  -8.888
 genus        pos_tcs_hk  11268 -12.224 -12.898 -17.262  -7.834
 genus       pos_betalac  14980 -12.543 -13.263 -17.981  -7.966
 genus    pos_crispr_cas  10449 -10.344 -10.442 -15.349  -5.610
 genus        hyp_cazyme  20277 -11.869 -12.261 -15.912  -7.896
 genus natural_expansion 160817 -13.976 -14.333 -17.762 -10.553
 genus              none  28167 -10.865 -11.080 -15.642  -6.114
family     neg_ribosomal  30302  -7.240  -6.993  -9.984  -3.867
family    neg_trna_synth  25242  -7.406  -6.832 -10.618  -3.795
family     neg_rnap_core  11890  -7.088  -6.603 -10.073  -3.869
family           pos_amr  20760  -6.940  -7.366  -9.187  -5.170
family        pos_tcs_hk   4496  -6.986  -6.567 -10.826  -3.389
family       pos_betalac   5945  -6.894  -6.371 -10.668  -2.937
family    pos_crispr_cas   5439  -5.859  -5.811  -8.173  -3.198
family        hyp_cazyme   8973  -6.433  -6.212  -9.319  -2.823
family natural_expansion  57448  -7.822  -7.802 -10.210  -5.328
family              none  13247  -6.207  -5.867  -9.065  -2.069
 order     neg_ribosomal  17523  -3.792  -3.346  -6.215  -1.281
 order    neg_trna_synth  14654  -4.063  -3.820  -6.628  -1.733
 order     neg_rnap_core   7236  -4.120  -4.201  -6.040  -2.225
 order           pos_amr  12772  -4.618  -4.831  -6.055  -3.541
 order        pos_tcs_hk   2628  -3.995  -3.971  -6.457  -2.093
 order       pos_betalac   3470  -4.291  -4.562  -6.580  -2.640
 order    pos_crispr_cas   3528  -2.850  -2.940  -4.373  -1.102
 order        hyp_cazyme   5467  -3.908  -3.980  -5.630  -2.344
 order natural_expansion  32421  -4.612  -4.723  -6.455  -3.101
 order              none   8134  -3.837  -3.738  -5.939  -2.071
 class     neg_ribosomal   7983  -1.714  -1.646  -2.707  -0.162
 class    neg_trna_synth   6260  -1.820  -1.477  -3.140   0.224
 class     neg_rnap_core   3264  -2.669  -2.416  -4.198  -0.359
 class           pos_amr   4738  -1.586  -1.459  -2.870   0.214
 class        pos_tcs_hk   1022  -2.307  -1.643  -4.181   0.234
 class       pos_betalac   1185  -2.212  -2.324  -3.954   0.229
 class    pos_crispr_cas   1934  -1.997  -2.138  -3.032   0.203
 class        hyp_cazyme   2166  -2.021  -1.903  -3.588   0.229
 class natural_expansion  12036  -1.835  -1.731  -3.066   0.199
 class              none   3327  -1.791  -1.470  -3.275   0.234

discrim_rows = []
for rank in ["genus", "family", "order", "class"]:
    pos_z = scores[(scores["rank"] == rank)
                   & (scores["control_class"].isin(["pos_betalac", "pos_crispr_cas", "pos_amr"]))
                   & (~scores["consumer_z_informative"].isna())]["consumer_z_informative"].values
    neg_z = scores[(scores["rank"] == rank)
                   & (scores["control_class"].isin(["neg_ribosomal", "neg_trna_synth", "neg_rnap_core"]))
                   & (~scores["consumer_z_informative"].isna())]["consumer_z_informative"].values
    if len(pos_z) < 3 or len(neg_z) < 3:
        continue
    u, p = stats.mannwhitneyu(pos_z, neg_z, alternative="greater")
    median_diff = float(np.median(pos_z) - np.median(neg_z))
    discrim_rows.append({
        "rank": rank,
        "n_positive_HGT": len(pos_z),
        "n_negative_housekeeping": len(neg_z),
        "positive_median": round(float(np.median(pos_z)), 3),
        "negative_median": round(float(np.median(neg_z)), 3),
        "median_diff": round(median_diff, 3),
        "mann_whitney_u": float(u),
        "p_one_sided_greater": float(p),
        "discriminates": p < 1e-6 and median_diff > 0,
    })
discrim = pd.DataFrame(discrim_rows)
discrim.to_csv(DATA_DIR / "p1b_discrimination_diagnostic.tsv", sep="\t", index=False)
print("Per-rank discrimination test:")
print(discrim.to_string(index=False))

Per-rank discrimination test:
  rank  n_positive_HGT  n_negative_housekeeping  positive_median  negative_median  median_diff  mann_whitney_u  p_one_sided_greater  discriminates
 genus           72348                   162382          -12.135          -13.056        0.921    6548075085.0         0.000000e+00           True
family           32144                    67434           -6.936           -6.806       -0.131    1115492980.0         3.921428e-14          False
 order           19770                    39413           -4.514           -3.688       -0.826     357124183.5         1.000000e+00          False
 class            7857                    17507           -1.654           -1.684        0.030      70046452.5         9.244936e-03          False

neg_rib = scores[(scores["rank"] == "family")
                 & (scores["control_class"] == "neg_ribosomal")
                 & (~scores["consumer_z_informative"].isna())]["consumer_z_informative"].values

pair_rows = []
for pos_cls in ["pos_amr", "pos_betalac", "pos_crispr_cas", "pos_tcs_hk", "hyp_cazyme", "natural_expansion"]:
    pos_z = scores[(scores["rank"] == "family")
                   & (scores["control_class"] == pos_cls)
                   & (~scores["consumer_z_informative"].isna())]["consumer_z_informative"].values
    if len(pos_z) < 3:
        continue
    u, p = stats.mannwhitneyu(pos_z, neg_rib, alternative="greater")
    pair_rows.append({
        "comparison": f"{pos_cls} vs neg_ribosomal",
        "rank": "family (parent=order)",
        "pos_n": len(pos_z), "pos_median": round(float(np.median(pos_z)), 3),
        "neg_n": len(neg_rib), "neg_median": round(float(np.median(neg_rib)), 3),
        "median_diff": round(float(np.median(pos_z) - np.median(neg_rib)), 3),
        "p_one_sided": float(p),
        "interpretation": "DISCRIMINATES (HGT signal real, smaller than absolute-zero criterion expected)" if (p < 1e-6 and np.median(pos_z) > np.median(neg_rib))
                          else "weak discrimination" if p < 0.05
                          else "no discrimination",
    })
pair_df = pd.DataFrame(pair_rows)
diagnostics["family_rank_pair_comparisons"] = pair_df.to_dict(orient="records")
print("Per-class vs neg_ribosomal at family rank:")
print(pair_df.to_string(index=False))

Per-class vs neg_ribosomal at family rank:
                        comparison                  rank  pos_n  pos_median  neg_n  neg_median  median_diff  p_one_sided                                                                 interpretation
          pos_amr vs neg_ribosomal family (parent=order)  20760      -7.366  30302      -6.993       -0.373 9.408929e-01                                                              no discrimination
      pos_betalac vs neg_ribosomal family (parent=order)   5945      -6.371  30302      -6.993        0.622 5.077550e-08 DISCRIMINATES (HGT signal real, smaller than absolute-zero criterion expected)
   pos_crispr_cas vs neg_ribosomal family (parent=order)   5439      -5.811  30302      -6.993        1.182 1.037815e-80 DISCRIMINATES (HGT signal real, smaller than absolute-zero criterion expected)
       pos_tcs_hk vs neg_ribosomal family (parent=order)   4496      -6.567  30302      -6.993        0.426 4.721514e-04                                                            weak discrimination
       hyp_cazyme vs neg_ribosomal family (parent=order)   8973      -6.212  30302      -6.993        0.781 1.096363e-43 DISCRIMINATES (HGT signal real, smaller than absolute-zero criterion expected)
natural_expansion vs neg_ribosomal family (parent=order)  57448      -7.802  30302      -6.993       -0.809 1.000000e+00                                                              no discrimination

# Materialize diagnostics
diagnostics["per_rank_discrimination"] = discrim.to_dict(orient="records")
diagnostics["completed_utc"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
with open(DATA_DIR / "p1b_diagnostic_response_log.json", "w") as f:
    json.dump(diagnostics, f, indent=2, default=str)
print("Wrote p1b_diagnostic_response_log.json")

Wrote p1b_diagnostic_response_log.json

08B P1B Diagnostic Response

NB08b — Phase 1B Diagnostic Response: Discrimination Test¶

Why this is in a notebook¶

Inputs¶

Outputs¶

Stage 1 — Per-(rank, class) consumer z summary¶

Stage 2 — Mann-Whitney U: positive HGT controls vs negative housekeeping per rank¶

Stage 3 — Per-class vs neg_ribosomal at family rank¶

Stage 4 — Headline interpretation¶