from __future__ import annotations

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import binomtest, beta
from statsmodels.stats.multitest import multipletests

DATA = Path("../data")
FIG = Path("../figures")

phylum_rollup = pd.read_csv(DATA / "marker_taxonomy_rollup_phylum.csv")
h1_phylum = pd.read_csv(DATA / "h1_xoxF_vs_mxaF_per_phylum.csv")
print(f"Phyla: {len(phylum_rollup)}")
print(f"H1 phylum tests: {len(h1_phylum)}")
print(h1_phylum.head().to_string())

Phyla: 142
H1 phylum tests: 29
                phylum  n_genomes  n_xoxF  n_mxaF  xoxF_fraction  binomial_p_two_sided
0    p__Pseudomonadota     117619    2988     171       0.945869          0.000000e+00
1         p__Bacillota      67072       9       4       0.692308          2.668457e-01
2    p__Actinomycetota      26949     106       0       1.000000          2.465190e-32
3      p__Bacteroidota      20615      22       0       1.000000          4.768372e-07
4  p__Campylobacterota       7523      12       0       1.000000          4.882812e-04

# Drop phyla with no MDH at all (test not informative)
testable = h1_phylum[(h1_phylum["n_xoxF"] + h1_phylum["n_mxaF"]) > 0].copy()
reject, p_bh, _, _ = multipletests(testable["binomial_p_two_sided"].values, alpha=0.05, method="fdr_bh")
testable["p_BH"] = p_bh
testable["reject_H0_at_FDR_0.05"] = reject
testable = testable.sort_values("n_genomes", ascending=False)
print(testable.to_string(index=False))

# Save
testable.to_csv(DATA / "h1_phylum_results_bh_corrected.csv", index=False)

               phylum  n_genomes  n_xoxF  n_mxaF  xoxF_fraction  binomial_p_two_sided         p_BH  reject_H0_at_FDR_0.05
    p__Pseudomonadota     117619    2988     171       0.945869          0.000000e+00 0.000000e+00                   True
         p__Bacillota      67072       9       4       0.692308          2.668457e-01 3.869263e-01                  False
    p__Actinomycetota      26949     106       0       1.000000          2.465190e-32 2.383017e-31                   True
      p__Bacteroidota      20615      22       0       1.000000          4.768372e-07 2.765656e-06                   True
  p__Campylobacterota       7523      12       0       1.000000          4.882812e-04 1.573351e-03                   True
   p__Patescibacteria       3412       2       0       1.000000          5.000000e-01 6.304348e-01                  False
 p__Verrucomicrobiota       2440      19       5       0.791667          6.610751e-03 1.474706e-02                   True
   p__Cyanobacteriota       2440       1       0       1.000000          1.000000e+00 1.000000e+00                  False
     p__Chloroflexota       1596      19       0       1.000000          3.814697e-06 1.580375e-05                   True
   p__Planctomycetota       1120       6       0       1.000000          3.125000e-02 5.664062e-02                  False
  p__Thermoplasmatota       1116       1       0       1.000000          1.000000e+00 1.000000e+00                  False
   p__Acidobacteriota       1006     285       3       0.989583          1.601193e-80 2.321730e-79                   True
    p__Halobacteriota       1002       8       1       0.888889          3.906250e-02 6.663603e-02                  False
    p__Thermoproteota        923      17       1       0.944444          1.449585e-04 5.254745e-04                   True
       p__Myxococcota        519      10       1       0.909091          1.171875e-02 2.427455e-02                   True
   p__Gemmatimonadota        386      98       1       0.989899          3.155444e-28 2.287697e-27                   True
    p__Marinisomatota        376       2       0       1.000000          5.000000e-01 6.304348e-01                  False
      p__Deinococcota        246       7       0       1.000000          1.562500e-02 3.020833e-02                   True
p__Desulfobacterota_F        231       1       0       1.000000          1.000000e+00 1.000000e+00                  False
    p__Dormibacterota        139       3       0       1.000000          2.500000e-01 3.815789e-01                  False
            p__SAR324        123       2       0       1.000000          5.000000e-01 6.304348e-01                  False
   p__Eremiobacterota        114      11       0       1.000000          9.765625e-04 2.574574e-03                   True
       p__Bacillota_E         92       1       0       1.000000          1.000000e+00 1.000000e+00                  False
 p__Methylomirabilota         80      23       7       0.766667          5.222879e-03 1.262196e-02                   True
p__Desulfobacterota_B         65      11       0       1.000000          9.765625e-04 2.574574e-03                   True
      p__Poribacteria         55       5       0       1.000000          6.250000e-02 1.006944e-01                  False
        p__Aquificota         45       1       0       1.000000          1.000000e+00 1.000000e+00                  False
     p__Myxococcota_A         36      20       0       1.000000          1.907349e-06 9.218852e-06                   True
          p__JAAXHH01         20       0       1       0.000000          1.000000e+00 1.000000e+00                  False

n_xoxF_total = int(h1_phylum["n_xoxF"].sum())
n_mxaF_total = int(h1_phylum["n_mxaF"].sum())
total_mdh_calls = n_xoxF_total + n_mxaF_total

# Clopper-Pearson 95% CI for xoxF fraction
alpha = 0.05
lo = beta.ppf(alpha / 2, n_xoxF_total, n_mxaF_total + 1)
hi = beta.ppf(1 - alpha / 2, n_xoxF_total + 1, n_mxaF_total)
xoxF_fraction = n_xoxF_total / total_mdh_calls
ratio_point = n_xoxF_total / n_mxaF_total if n_mxaF_total > 0 else float("inf")
ratio_lo = (xoxF_fraction / (1 - xoxF_fraction)) * ((1 - hi) / hi) if hi < 1 else 0
# Equivalent CI for the ratio: r = p/(1-p) ; from CI on p we derive CI on r
ratio_lo = lo / (1 - lo) if lo > 0 else 0
ratio_hi = hi / (1 - hi) if hi < 1 else float("inf")

global_summary = pd.DataFrame([{
    "n_xoxF": n_xoxF_total,
    "n_mxaF": n_mxaF_total,
    "n_total_mdh_calls": total_mdh_calls,
    "xoxF_fraction": xoxF_fraction,
    "xoxF_fraction_95CI_lo": lo,
    "xoxF_fraction_95CI_hi": hi,
    "xoxF_to_mxaF_ratio": ratio_point,
    "ratio_95CI_lo": ratio_lo,
    "ratio_95CI_hi": ratio_hi,
}])
print(global_summary.T.to_string())

global_summary.to_csv(DATA / "h1_global_summary.csv", index=False)

                                 0
n_xoxF                 3690.000000
n_mxaF                  195.000000
n_total_mdh_calls      3885.000000
xoxF_fraction             0.949807
xoxF_fraction_95CI_lo     0.942467
xoxF_fraction_95CI_hi     0.956461
xoxF_to_mxaF_ratio       18.923077
ratio_95CI_lo            16.381204
ratio_95CI_hi            21.967845

H1_THRESHOLD = 10 / 11  # 0.9091
test_h1 = binomtest(n_xoxF_total, total_mdh_calls, p=H1_THRESHOLD, alternative="greater")
print(f"Observed xoxF fraction: {xoxF_fraction:.4f}")
print(f"H1 threshold (10:1 ratio): xoxF_fraction > {H1_THRESHOLD:.4f}")
print(f"One-sided binomial p-value (greater than threshold): {test_h1.pvalue:.3e}")
print(f"H1 supported at alpha=0.05: {test_h1.pvalue < 0.05}")

Observed xoxF fraction: 0.9498
H1 threshold (10:1 ratio): xoxF_fraction > 0.9091
One-sided binomial p-value (greater than threshold): 7.584e-22
H1 supported at alpha=0.05: True

# Need genome-level data with class. Load matrix + tax via cached files (no Spark needed).
# We'll re-derive a class-level rollup from genus-level rollup (which has phylum + family) — but we need class.
# Workaround: since NB02 didn't include class in rollups, we go back to the matrix and re-aggregate by class.
# For this we need taxonomy class column — load family rollup (which has phylum+family) and use family-to-class mapping
# isn't available from the cached CSVs. Instead, regenerate using the matrix + a small Spark pull is not desired here
# (NB03 is local). Solution: read the family rollup and group by family; report top families with xoxF/mxaF presence.

family_rollup = pd.read_csv(DATA / "marker_taxonomy_rollup_family.csv")
pseudo_families = family_rollup[
    (family_rollup["phylum"] == "p__Pseudomonadota") & (family_rollup["n_xoxF"] > 0)
].sort_values("n_xoxF", ascending=False)
print(f"Pseudomonadota families with xoxF: {len(pseudo_families)}")
print(pseudo_families[["family", "n_genomes", "n_xoxF", "n_mxaF", "n_xoxJ", "n_lanM_bakta", "n_full_cassette"]].head(20).to_string(index=False))
pseudo_families.to_csv(DATA / "h1_pseudomonadota_family_breakdown.csv", index=False)

Pseudomonadota families with xoxF: 138
                family  n_genomes  n_xoxF  n_mxaF  n_xoxJ  n_lanM_bakta  n_full_cassette
   f__Pseudomonadaceae      13315     566       1       0             0                0
  f__Xanthobacteraceae        768     224      11       0             0                0
   f__Burkholderiaceae       6039     207       0       0             0                0
   f__Rhodobacteraceae       2092     204      15      27             0                2
   f__Beijerinckiaceae        508     171      55      18            48               10
       f__Rhizobiaceae       5084     161       3       0             0                0
f__Pseudohongiellaceae        194     126       0       0             0                0
 f__Burkholderiaceae_B       1589     123       2       5             0                0
  f__Sphingomonadaceae       1150      92       0       0             0                0
     f__Rhodocyclaceae        515      77       4       5             0                2
   f__Acetobacteraceae        720      72       3       0            12                0
           f__HTCC2089        220      66       0       0             0                0
         f__Halieaceae        137      64       0       0             0                0
     f__Porticoccaceae        204      53       0       0             0                0
f__Steroidobacteraceae        225      52       0       0             0                0
      f__Oleiphilaceae        228      51       0       0             0                0
    f__Azospirillaceae         90      41       2       0             0                0
   f__Alteromonadaceae        980      40       1       0             0                0
  f__Methylomonadaceae        391      33      27       0             0                0
  f__Hyphomicrobiaceae         56      33      10       6             2                3

# 95% CI per phylum via Clopper-Pearson
def cp_ci(k, n, alpha=0.05):
    if n == 0:
        return (np.nan, np.nan)
    lo_ = beta.ppf(alpha / 2, k, n - k + 1) if k > 0 else 0.0
    hi_ = beta.ppf(1 - alpha / 2, k + 1, n - k) if k < n else 1.0
    return lo_, hi_

testable = testable.assign(
    ci_lo=testable.apply(lambda r: cp_ci(r["n_xoxF"], r["n_xoxF"] + r["n_mxaF"])[0], axis=1),
    ci_hi=testable.apply(lambda r: cp_ci(r["n_xoxF"], r["n_xoxF"] + r["n_mxaF"])[1], axis=1),
).sort_values("xoxF_fraction", ascending=True)

fig, ax = plt.subplots(figsize=(9, 7))
ypos = np.arange(len(testable))
ax.errorbar(
    testable["xoxF_fraction"],
    ypos,
    xerr=[testable["xoxF_fraction"] - testable["ci_lo"], testable["ci_hi"] - testable["xoxF_fraction"]],
    fmt="o",
    color="#1f77b4",
)
ax.axvline(H1_THRESHOLD, ls="--", color="red", label=f"H1 threshold (10:1 ratio = {H1_THRESHOLD:.3f})")
ax.axvline(0.5, ls=":", color="gray", label="Equal rate (0.5)")
ax.set_yticks(ypos)
ax.set_yticklabels(
    [f"{p} (n={int(nx)+int(nm)})" for p, nx, nm in zip(testable["phylum"], testable["n_xoxF"], testable["n_mxaF"])]
)
ax.set_xlabel("xoxF fraction of (xoxF + mxaF) calls")
ax.set_title("Per-phylum xoxF dominance over mxaF (95% Clopper-Pearson CI)")
ax.set_xlim(-0.05, 1.05)
ax.legend(loc="lower right")
plt.tight_layout()
plt.savefig(FIG / "h1_phylum_forest_plot.png", dpi=150)
plt.show()

fig, ax = plt.subplots(figsize=(7, 3))
ax.barh(["BERDL pangenome\n(all phyla pooled)"], [ratio_point], color="#2ca02c", alpha=0.6)
ax.errorbar(
    [ratio_point], [0],
    xerr=[[ratio_point - ratio_lo], [ratio_hi - ratio_point]] if np.isfinite(ratio_hi) else [[ratio_point - ratio_lo], [ratio_point * 0.1]],
    fmt="o", color="black",
)
ax.axvline(10, ls="--", color="red", label="H1 threshold (10:1 ratio)")
ax.set_xlabel("xoxF : mxaF ratio")
ax.set_title(f"Global xoxF:mxaF ratio = {ratio_point:.1f} : 1\n95% CI: [{ratio_lo:.1f}, {ratio_hi:.1f}]")
ax.legend()
plt.tight_layout()
plt.savefig(FIG / "h1_global_ratio_with_CI.png", dpi=150)
plt.show()

03 H1 Formal Test

NB03 — Formal H1 testing¶

1. Load inputs¶

2. Benjamini-Hochberg correction across phyla¶

3. Global xoxF:mxaF ratio with 95% CI¶

4. H1 test against the ≥10× threshold¶

5. Per-class breakdown within Pseudomonadota¶

6. Forest plot of per-phylum xoxF fractions¶

7. Global ratio with CI bar¶

8. Summary¶