from collections import Counter, defaultdict
from itertools import combinations
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

PROJECT_ROOT = Path.cwd().parent
DATA = PROJECT_ROOT / "data"
FIGURES = PROJECT_ROOT / "figures"
FIGURES.mkdir(exist_ok=True)

use = pd.read_csv(DATA / "realized_use.csv")
bridges = pd.read_csv(DATA / "cross_tenant_bridges.csv")
atlas = pd.read_csv(DATA / "table_topic_map.csv")
print(f"Audited projects: {len(use)}; theoretical bridges: {len(bridges)}; atlas tables: {len(atlas)}")

Audited projects: 68; theoretical bridges: 536; atlas tables: 1740

n_total       = len(use)
n_cross       = int(use["cross_tenant"].sum())
n_single      = n_total - n_cross
by_n_tenants  = use["tenants_used"].str.split("|").str.len().value_counts().sort_index()
print(f"Projects audited:        {n_total}")
print(f"Cross-tenant (≥2):       {n_cross} ({100*n_cross/n_total:.0f}%)")
print(f"Single-tenant:           {n_single} ({100*n_single/n_total:.0f}%)")
print()
print("Distribution of # tenants per project:")
print(by_n_tenants.to_string())

Projects audited:        68
Cross-tenant (≥2):       45 (66%)
Single-tenant:           23 (34%)

Distribution of # tenants per project:
tenants_used
1    25
2    35
3     5
4     1
5     2

tenant_freq = Counter()
for tlist in use["tenants_used"].str.split("|"):
    for t in tlist:
        tenant_freq[t.strip()] += 1
tenant_freq_df = (
    pd.Series(tenant_freq, name="projects")
      .to_frame()
      .assign(percent=lambda d: (100 * d["projects"] / n_total).round(1))
      .sort_values("projects", ascending=False)
)
tenant_freq_df

fig, ax = plt.subplots(figsize=(8, 4))
tenant_freq_df["projects"].sort_values().plot.barh(ax=ax, color="#4c72b0")
for i, v in enumerate(tenant_freq_df["projects"].sort_values().values):
    ax.text(v + 0.5, i, f"{v}  ({100*v/n_total:.0f}%)", va="center", fontsize=8)
ax.set_xlabel(f"Projects using this tenant (of {n_total})")
ax.set_ylabel("")
ax.set_title("Realized BERDL tenant reuse across BERIL projects")
ax.set_xlim(0, tenant_freq_df['projects'].max() * 1.4)
fig.tight_layout()
fig.savefig(FIGURES / "nb03_tenant_frequency.png", dpi=140, bbox_inches="tight")
plt.show()

pair_freq = Counter()
for _, row in use.iterrows():
    tenants = sorted(set(t.strip() for t in str(row["tenants_used"]).split("|")))
    for a, b in combinations(tenants, 2):
        pair_freq[(a, b)] += 1

realized_pairs = (
    pd.DataFrame([(a, b, n) for (a, b), n in pair_freq.items()], columns=["tenant_a", "tenant_b", "projects"])
      .sort_values("projects", ascending=False)
      .reset_index(drop=True)
)
print(f"Realized cross-tenant pairs: {len(realized_pairs)}")
realized_pairs.head(20)

Realized cross-tenant pairs: 20

# Roll the NB02 bridges to tenant×tenant level. For each unordered tenant pair,
# sum shared_keys across all cells. (A pair gets counted once per cell-bridge.)
th_pairs = defaultdict(int)
th_keys  = defaultdict(set)
for _, b in bridges.iterrows():
    pair = tuple(sorted([b["tenant_a"], b["tenant_b"]]))
    th_pairs[pair] += int(b["shared_keys"])
    th_keys[pair].update(k.strip() for k in str(b["keys"]).split(","))

all_pairs = sorted(set(th_pairs) | set(tuple(sorted(p)) for p in pair_freq))
overlay = []
real_pair_lookup = {tuple(sorted((r["tenant_a"], r["tenant_b"]))): r["projects"] for _, r in realized_pairs.iterrows()}
for pair in all_pairs:
    overlay.append({
        "tenant_a":         pair[0],
        "tenant_b":         pair[1],
        "theoretical_keys": len(th_keys.get(pair, set())),
        "realized_projects": real_pair_lookup.get(pair, 0),
        "keys":             ", ".join(sorted(th_keys.get(pair, set()))[:8]),
    })
overlay_df = (
    pd.DataFrame(overlay)
      .sort_values(["realized_projects", "theoretical_keys"], ascending=False)
      .reset_index(drop=True)
)
overlay_df.head(25)

fig, ax = plt.subplots(figsize=(9, 6))
# All theoretical pairs that have at least one shared key OR have been realized.
plot_df = overlay_df[(overlay_df["theoretical_keys"] > 0) | (overlay_df["realized_projects"] > 0)].copy()
ax.scatter(plot_df["theoretical_keys"], plot_df["realized_projects"],
           s=80, alpha=0.65, edgecolor="black", linewidth=0.4)
for _, r in plot_df.iterrows():
    if r["realized_projects"] >= 3 or r["theoretical_keys"] >= 12 or (r["theoretical_keys"] >= 6 and r["realized_projects"] == 0):
        ax.annotate(f"{r['tenant_a']}↔{r['tenant_b']}",
                    (r["theoretical_keys"], r["realized_projects"]),
                    xytext=(5, 3), textcoords="offset points", fontsize=8)
ax.set_xlabel("Theoretical shared join keys (rolled-up from NB02)")
ax.set_ylabel("Realized: BERIL projects using both tenants")
ax.set_title("Realized vs. theoretical cross-tenant bridges\n(top-right = high opportunity AND high use; bottom-right = untapped)")
ax.axhline(y=0, color="red", alpha=0.3, linestyle=":", linewidth=1)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(FIGURES / "nb03_theoretical_vs_realized.png", dpi=140, bbox_inches="tight")
plt.show()

untapped = (
    overlay_df[(overlay_df["realized_projects"] == 0) & (overlay_df["theoretical_keys"] > 0)]
      .sort_values("theoretical_keys", ascending=False)
      .head(20)
      .reset_index(drop=True)
)
untapped

overlay_df.to_csv(DATA / "theoretical_vs_realized.csv", index=False)
untapped.to_csv(DATA / "untapped_bridges.csv", index=False)
tenant_freq_df.to_csv(DATA / "tenant_reuse_frequency.csv")
print("Saved:")
for f in ["theoretical_vs_realized.csv", "untapped_bridges.csv", "tenant_reuse_frequency.csv"]:
    print(f"  data/{f} — {(DATA / f).stat().st_size:,} bytes")

Saved:
  data/theoretical_vs_realized.csv — 4,481 bytes
  data/untapped_bridges.csv — 1,756 bytes
  data/tenant_reuse_frequency.csv — 151 bytes

	projects	percent
kbase	60	88.2
kescience	40	58.8
nmdc	6	8.8
enigma	5	7.4
arkinlab	5	7.4
phagefoundry	4	5.9
protect	2	2.9
user	1	1.5
globalusers	1	1.5

	tenant_a	tenant_b	projects
0	kbase	kescience	36
1	kbase	nmdc	5
2	enigma	kbase	4
3	kbase	phagefoundry	4
4	arkinlab	kbase	4
5	kescience	phagefoundry	3
6	enigma	kescience	3
7	kbase	protect	2
8	kescience	protect	2
9	phagefoundry	protect	2
10	arkinlab	kescience	2
11	kescience	nmdc	1
12	arkinlab	globalusers	1
13	arkinlab	enigma	1
14	enigma	globalusers	1
15	globalusers	kbase	1
16	globalusers	kescience	1
17	arkinlab	phagefoundry	1
18	arkinlab	protect	1
19	nmdc	phagefoundry	1

	tenant_a	tenant_b	theoretical_keys	realized_projects	keys
0	kbase	kescience	15	36	cazy, cog, compound_id, ec_number, eggnog, fea...
1	kbase	nmdc	11	5	biosample_id, cog, compound_id, ec_number, fea...
2	enigma	kbase	9	4	cazy, cog, ec_number, eggnog, feature_id, geno...
3	kbase	phagefoundry	9	4	cazy, cog, ec_number, eggnog, feature_id, geno...
4	arkinlab	kbase	5	4	ec_number, feature_id, genome_id, genus, species
5	enigma	kescience	9	3	cazy, cog, ec_number, eggnog, feature_id, geno...
6	kescience	phagefoundry	9	3	cazy, cog, ec_number, eggnog, feature_id, geno...
7	kbase	protect	12	2	cazy, cog, ec_number, eggnog, feature_id, geno...
8	kescience	protect	12	2	cazy, cog, ec_number, eggnog, feature_id, geno...
9	phagefoundry	protect	11	2	cazy, cog, ec_number, eggnog, feature_id, geno...
10	arkinlab	kescience	7	2	ec_number, feature_id, genome_id, genus, lat_l...
11	kescience	nmdc	10	1	cog, compound_id, ec_number, feature_id, genus...
12	nmdc	phagefoundry	7	1	cog, ec_number, feature_id, kegg_pathway, ncbi...
13	arkinlab	protect	6	1	ec_number, feature_id, genome_id, genus, sampl...
14	arkinlab	enigma	4	1	ec_number, feature_id, genome_id, sample_id
15	arkinlab	phagefoundry	4	1	ec_number, feature_id, genome_id, sample_id
16	arkinlab	globalusers	0	1
17	enigma	globalusers	0	1
18	globalusers	kbase	0	1
19	globalusers	kescience	0	1
20	kescience	refdata	12	0	alphafold_pdb, ec_number, feature_id, gene_clu...
21	enigma	phagefoundry	11	0	cazy, cog, ec_number, eggnog, feature_id, geno...
22	enigma	protect	11	0	cazy, cog, ec_number, eggnog, feature_id, geno...
23	kbase	refdata	11	0	biosample_id, ec_number, feature_id, gene_clus...
24	nmdc	protect	10	0	cog, ec_number, feature_id, genus, kegg_ko, ke...

	tenant_a	tenant_b	theoretical_keys	keys
0	kescience	refdata	12	alphafold_pdb, ec_number, feature_id, gene_clu...
1	enigma	phagefoundry	11	cazy, cog, ec_number, eggnog, feature_id, geno...
2	enigma	protect	11	cazy, cog, ec_number, eggnog, feature_id, geno...
3	kbase	refdata	11	biosample_id, ec_number, feature_id, gene_clus...
4	nmdc	protect	10	cog, ec_number, feature_id, genus, kegg_ko, ke...
5	nmdc	refdata	9	biosample_id, ec_number, envo, feature_id, gen...
6	arkinlab	refdata	8	ec_number, feature_id, genome_id, genus, lat_l...
7	protect	refdata	8	ec_number, feature_id, genome_id, genus, ncbi_...
8	enigma	nmdc	7	cog, ec_number, feature_id, kegg_pathway, ncbi...
9	enigma	refdata	6	ec_number, feature_id, genome_id, ncbi_taxon_i...
10	phagefoundry	refdata	6	ec_number, feature_id, genome_id, ncbi_taxon_i...
11	arkinlab	nmdc	5	ec_number, feature_id, genus, sample_id, species
12	msyscolo	refdata	5	genome_id, genus, gtdb_taxonomy, lat_lon, species
13	arkinlab	msyscolo	4	genome_id, genus, lat_lon, species
14	kbase	msyscolo	4	genome_id, genus, gtdb_taxonomy, species
15	kescience	msyscolo	4	genome_id, genus, lat_lon, species
16	netl	refdata	4	gtdb_taxonomy, lat_lon, pmid_doi, sample_id
17	arkinlab	netl	3	lat_lon, pmid_doi, sample_id
18	msyscolo	protect	3	genome_id, genus, species
19	nmdc	planetmicrobe	3	ncbi_taxon_id, pfam, sample_id

03 Realized Use Audit

NB03 — Realized-use Audit: which of the 536 potential bridges has actually been used¶

1. Project-level summary¶

2. Tenant reuse frequency — which BERDL tenants drive the most analyses¶

3. Realized cross-tenant pair frequency¶

4. Theoretical vs. realized — overlay¶

5. Untapped opportunities — high-key bridges with zero realized projects¶

6. Findings¶

Audit caveats¶