import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

try:
    spark = get_spark_session()
except NameError:
    from berdl_notebook_utils.setup_spark_session import get_spark_session
    spark = get_spark_session()

DATA_DIR = Path('../data')
FIG_DIR = Path('../figures')

# Load previous results
class_df = pd.read_csv(DATA_DIR / 'gene_tier_classification.csv')
pairs_df = pd.read_csv(DATA_DIR / 'all_metal_pairs.csv')
consensus = pd.read_csv(DATA_DIR / 'consensus_cross_resistance.csv', index_col=0)
summary = pd.read_csv(DATA_DIR / 'extraction_summary.csv')

# Load ortholog groups
ogs = pd.read_csv(Path('../../essential_genome/data/all_ortholog_groups.csv'))
ogs['locusId'] = ogs['locusId'].astype(str)

print(f'Gene classifications: {len(class_df)}')
print(f'Ortholog groups: {ogs.OG_id.nunique()}')
print('Spark ready.')

Gene classifications: 109300
Ortholog groups: 17222
Spark ready.

# Get metal-shared and general-stress genes with OG assignments
metal_genes = class_df[class_df.tier.isin(['metal_shared', 'general_stress', 'metal_specific'])].copy()
metal_genes['locusId'] = metal_genes['locusId'].astype(str)

# Merge with OGs
metal_ogs = metal_genes.merge(ogs[['orgId', 'locusId', 'OG_id']].drop_duplicates(),
                               on=['orgId', 'locusId'], how='inner')

print(f'Metal-important genes with OG assignments: {len(metal_ogs)}')

# Aggregate: for each OG, count organisms and dominant tier
og_summary = metal_ogs.groupby('OG_id').agg(
    n_organisms=('orgId', 'nunique'),
    n_genes=('locusId', 'count'),
    dominant_tier=('tier', lambda x: x.value_counts().index[0]),
    mean_metals=('n_metals_important', 'mean'),
    mean_sick_rate=('sick_rate_nonmetal', 'mean'),
    organisms=('orgId', lambda x: ','.join(sorted(x.unique()))),
).reset_index()

# Cross-resistance families: OGs with metal-shared as dominant tier in >=2 organisms
cr_families = og_summary[
    (og_summary.dominant_tier == 'metal_shared') & (og_summary.n_organisms >= 2)
].copy()

cr_families.to_csv(DATA_DIR / 'cross_resistance_gene_families.csv', index=False)

print(f'\n=== Conserved Cross-Resistance Families ===')
print(f'Total OGs with metal phenotype: {len(og_summary)}')
print(f'Cross-resistance families (shared, >=2 orgs): {len(cr_families)}')
print(f'Mean organisms per family: {cr_families.n_organisms.mean():.1f}')
print(f'Max organisms: {cr_families.n_organisms.max()}')
print()

# Also count all metal OGs by tier breadth
tier_breadth = og_summary.groupby('dominant_tier').agg(
    n_families=('OG_id', 'count'),
    conserved_2plus=('n_organisms', lambda x: (x >= 2).sum()),
    conserved_5plus=('n_organisms', lambda x: (x >= 5).sum()),
).reset_index()
print('OG families by dominant tier:')
print(tier_breadth.to_string(index=False))

Metal-important genes with OG assignments: 6781

=== Conserved Cross-Resistance Families ===
Total OGs with metal phenotype: 2830
Cross-resistance families (shared, >=2 orgs): 318
Mean organisms per family: 3.4
Max organisms: 14

OG families by dominant tier:
 dominant_tier  n_families  conserved_2plus  conserved_5plus
general_stress         456              245               89
  metal_shared         774              318               60
metal_specific        1600              611              172

# Build multi-metal scores per organism
# CRITICAL FIX: Only include organisms that have tier classification data (n_metals >= 3)
# azobra and BFirm have only 2 metals and were not classified in NB04
tier_orgs = set(class_df[class_df.tier != 'metal_neutral'].orgId.unique())
scored_summary = summary[summary.orgId.isin(tier_orgs)]
print(f'Organisms with tier data: {len(scored_summary)} (excluded {len(summary) - len(scored_summary)} with <3 metals)')

org_scores = []

for org in scored_summary.orgId.unique():
    org_class = class_df[class_df.orgId == org]
    org_metal = org_class[org_class.tier != 'metal_neutral']
    
    # Tier counts
    tier_counts = org_metal.tier.value_counts()
    n_shared = tier_counts.get('metal_shared', 0)
    n_specific = tier_counts.get('metal_specific', 0)
    n_general = tier_counts.get('general_stress', 0)
    n_total_metal = len(org_metal)
    
    # Cross-resistance family count (via OGs)
    org_ogs = metal_ogs[(metal_ogs.orgId == org) & (metal_ogs.tier == 'metal_shared')]
    n_cr_families = org_ogs.OG_id.nunique()
    
    # Mean cross-resistance r (from NB02 pairs)
    org_pairs = pairs_df[pairs_df.orgId == org]
    mean_cr = org_pairs.r.mean() if len(org_pairs) > 0 else np.nan
    
    # Number of metals tested
    n_metals = summary[summary.orgId == org].n_metals.values[0]
    
    # Shared fraction (of metal-important genes)
    shared_frac = n_shared / n_total_metal if n_total_metal > 0 else 0
    
    # Get genus/species for BacDive matching later
    org_scores.append({
        'orgId': org,
        'n_metals': n_metals,
        'n_metal_genes': n_total_metal,
        'n_shared': n_shared,
        'n_specific': n_specific,
        'n_general': n_general,
        'n_cr_families': n_cr_families,
        'shared_fraction': shared_frac,
        'mean_cross_resistance_r': mean_cr,
    })

scores_df = pd.DataFrame(org_scores)
scores_df = scores_df.sort_values('n_cr_families', ascending=False)
scores_df.to_csv(DATA_DIR / 'organism_multimetal_scores.csv', index=False)

print(f'\n=== Multi-Metal Tolerance Scores ({len(scores_df)} organisms) ===')
print(scores_df.to_string(index=False, float_format='%.3f'))

Organisms with tier data: 28 (excluded 2 with <3 metals)

=== Multi-Metal Tolerance Scores (28 organisms) ===
             orgId  n_metals  n_metal_genes  n_shared  n_specific  n_general  n_cr_families  shared_fraction  mean_cross_resistance_r
                PS         4            841       406         336         99            287            0.483                    0.658
            psRCH2         8            313       118         147         48            108            0.377                    0.388
           Pedo557         5            402       143         187         72            104            0.356                    0.690
             Ponti         4            346       141         173         32             99            0.408                    0.519
              Kang         4            221       113          58         50             94            0.511                    0.393
              SB2B         5            329        90         155         84             87            0.274                    0.432
            Marino         6            361        88         247         26             75            0.244                    0.335
              Keio         4            413        91         250         72             74            0.220                    0.301
               MR1         5            312        77         130        105             72            0.247                    0.532
   acidovorax_3H11         5            245        91         124         30             71            0.371                    0.502
             Smeli         4            321        89         170         62             70            0.277                    0.435
           Cup4G11         3            304        84         188         32             70            0.276                    0.388
               PV4         5            280        69         153         58             68            0.246                    0.489
    pseudo5_N2C3_1         4            401        68         278         55             66            0.170                    0.183
              Cola         6            190        74          92         24             65            0.389                    0.507
               DvH        13            212        69          77         66             50            0.325                    0.425
            Btheta         7            313        78         144         91             48            0.249                    0.528
      pseudo3_N2E3         3            263        48         156         59             46            0.183                    0.420
              Koxy         3            303        53         203         47             43            0.175                    0.291
              ANA3         3            260        47         175         38             41            0.181                    0.348
             Phaeo         5            133        44          54         35             37            0.331                    0.469
      pseudo1_N1B4         4            257        41         200         16             35            0.160                    0.150
      pseudo6_N2E2         4            268        37         210         21             31            0.138                    0.195
             Caulo         4            335        48         150        137             30            0.143                    0.652
pseudo13_GW456_L13         4            182        30          95         57             29            0.165                    0.318
             Korea         5            150        35          79         36             29            0.233                    0.558
              Dino         6            114        25          69         20             19            0.219                    0.363
            WCS417         3             93         9          72         12              8            0.097                    0.236

# Get FB organism taxonomy for BacDive matching
org_info = spark.sql("""
    SELECT orgId, genus, species, taxonomyId
    FROM kescience_fitnessbrowser.organism
""").toPandas()

print(f'FB organisms: {len(org_info)}')

# Get BacDive isolation data
# strain.bacdive_id is int, isolation.bacdive_id is string — CAST for join
bacdive_env = spark.sql("""
    SELECT s.species_name, s.bacdive_id, i.sample_type, i.cat1, i.cat2, i.cat3
    FROM kescience_bacdive.strain s
    JOIN kescience_bacdive.isolation i ON CAST(s.bacdive_id AS STRING) = i.bacdive_id
    WHERE i.sample_type IS NOT NULL
""").toPandas()

print(f'BacDive strains with isolation data: {len(bacdive_env)}')

# Classify environments as metal-relevant
def is_metal_env(row):
    text = ' '.join([str(row.get(c, '')) for c in ['sample_type', 'cat1', 'cat2', 'cat3']]).lower()
    return any(kw in text for kw in ['metal', 'mine', 'contamina', 'waste', 'sludge',
                                      'uranium', 'acid mine', 'tailing', 'industrial'])

bacdive_env['metal_env'] = bacdive_env.apply(is_metal_env, axis=1)
bacdive_env['soil_env'] = bacdive_env['sample_type'].fillna('').str.lower().str.contains('soil')

print(f'Strains from metal environments: {bacdive_env.metal_env.sum()}')
print(f'Strains from soil: {bacdive_env.soil_env.sum()}')

FB organisms: 48

BacDive strains with isolation data: 52968

Strains from metal environments: 3664
Strains from soil: 12570

# Match FB organisms to BacDive species
# CRITICAL FIX: Collapse to species level to avoid counting the same BacDive
# strains multiple times for different FB strains of the same species
# (e.g., 5 Pseudomonas fluorescens FB strains all share the same BacDive pool)

matched = []
for _, org_row in org_info.iterrows():
    org = org_row['orgId']
    genus = str(org_row.get('genus', ''))
    species = str(org_row.get('species', ''))
    
    if not genus or genus == 'nan':
        continue
    
    search_term = f'{genus} {species}' if species and species != 'nan' else genus
    bd_match = bacdive_env[bacdive_env.species_name.str.contains(search_term, case=False, na=False)]
    
    if len(bd_match) > 0:
        matched.append({
            'orgId': org,
            'genus': genus,
            'species': species,
            'species_key': search_term.lower().strip(),
            'n_bacdive_strains': len(bd_match),
            'n_metal_env': int(bd_match.metal_env.sum()),
            'n_soil_env': int(bd_match.soil_env.sum()),
            'pct_metal_env': bd_match.metal_env.mean(),
            'pct_soil_env': bd_match.soil_env.mean(),
        })

bd_df = pd.DataFrame(matched)

# Merge with multi-metal scores (only scored organisms)
validation_raw = scores_df.merge(bd_df, on='orgId', how='inner')
print(f'FB organisms matched to BacDive (raw): {len(validation_raw)}')

# Collapse to species level: for species with multiple FB strains,
# average the cross-resistance scores and keep one BacDive entry
species_groups = validation_raw.groupby('species_key')
dup_species = species_groups.filter(lambda x: len(x) > 1).species_key.unique()
if len(dup_species) > 0:
    print(f'Species with multiple FB strains (collapsing): {list(dup_species)}')

validation = validation_raw.groupby('species_key').agg(
    orgId=('orgId', lambda x: ','.join(x)),
    n_fb_strains=('orgId', 'count'),
    n_metals=('n_metals', 'mean'),
    n_metal_genes=('n_metal_genes', 'mean'),
    n_shared=('n_shared', 'mean'),
    shared_fraction=('shared_fraction', 'mean'),
    n_cr_families=('n_cr_families', 'mean'),
    mean_cross_resistance_r=('mean_cross_resistance_r', 'mean'),
    n_bacdive_strains=('n_bacdive_strains', 'first'),  # same for all FB strains of a species
    n_metal_env=('n_metal_env', 'first'),
    pct_metal_env=('pct_metal_env', 'first'),
).reset_index()

validation.to_csv(DATA_DIR / 'bacdive_metal_environment.csv', index=False)

print(f'Species-level validation set: {len(validation)} (collapsed from {len(validation_raw)} FB strains)')
print(f'With metal-environment strains: {(validation.n_metal_env > 0).sum()}')
print()
print(validation[['species_key', 'n_fb_strains', 'shared_fraction', 'n_cr_families',
                   'n_bacdive_strains', 'pct_metal_env']]
      .sort_values('pct_metal_env', ascending=False).to_string(index=False, float_format='%.3f'))

FB organisms matched to BacDive (raw): 24
Species with multiple FB strains (collapsing): ['pseudomonas fluorescens']
Species-level validation set: 20 (collapsed from 24 FB strains)
With metal-environment strains: 10

                 species_key  n_fb_strains  shared_fraction  n_cr_families  n_bacdive_strains  pct_metal_env
      sphingomonas koreensis             1            0.233         29.000                  1          1.000
      cupriavidus basilensis             1            0.276         70.000                 16          0.500
              acidovorax sp.             1            0.371         71.000                 32          0.281
        phaeobacter inhibens             1            0.331         37.000                  4          0.250
     pseudomonas fluorescens             5            0.163         41.400                 78          0.231
    klebsiella michiganensis             1            0.175         43.000                  5          0.200
        pseudomonas stutzeri             1            0.377        108.000                 67          0.164
      desulfovibrio vulgaris             1            0.325         50.000                 12          0.083
              pedobacter sp.             1            0.356        104.000                 35          0.057
            escherichia coli             1            0.220         74.000                595          0.012
bacteroides thetaiotaomicron             1            0.249         48.000                 12          0.000
        kangiella aquimarina             1            0.511         94.000                  1          0.000
      marinobacter adhaerens             1            0.244         75.000                  1          0.000
      dinoroseobacter shibae             1            0.219         19.000                  1          0.000
     echinicola vietnamensis             1            0.389         65.000                  1          0.000
      pontibacter actiniarum             1            0.408         99.000                  1          0.000
          pseudomonas simiae             1            0.097          8.000                  2          0.000
          shewanella loihica             1            0.246         68.000                  1          0.000
       shewanella oneidensis             1            0.247         72.000                  2          0.000
              shewanella sp.             1            0.181         41.000                 12          0.000

# H3 Tests: Do organisms with more cross-resistance genes come from metal environments?
print('=== H3: Multi-Metal Tolerance Predicts Metal Environments ===')

if len(validation) >= 5:
    # Test 1: Correlation between shared fraction and metal environment frequency
    r1, p1 = stats.spearmanr(validation.shared_fraction, validation.pct_metal_env)
    print(f'Shared fraction vs metal env: rho={r1:.3f}, p={p1:.3f}')
    
    # Test 2: Correlation between n_cr_families and metal environment
    r2, p2 = stats.spearmanr(validation.n_cr_families, validation.pct_metal_env)
    print(f'N cross-resistance families vs metal env: rho={r2:.3f}, p={p2:.3f}')
    
    # Test 3: Mean cross-resistance r vs metal environment
    valid_cr = validation.dropna(subset=['mean_cross_resistance_r'])
    if len(valid_cr) >= 5:
        r3, p3 = stats.spearmanr(valid_cr.mean_cross_resistance_r, valid_cr.pct_metal_env)
        print(f'Mean cross-resistance r vs metal env: rho={r3:.3f}, p={p3:.3f}')
    
    # Test 4: n_metals tested vs metal environment (control — more metals tested ≠ more tolerant)
    r4, p4 = stats.spearmanr(validation.n_metals, validation.pct_metal_env)
    print(f'N metals tested vs metal env (control): rho={r4:.3f}, p={p4:.3f}')
    
    # Test 5: Total metal-important genes vs metal environment
    r5, p5 = stats.spearmanr(validation.n_metal_genes, validation.pct_metal_env)
    print(f'N metal-important genes vs metal env: rho={r5:.3f}, p={p5:.3f}')
else:
    print(f'Insufficient organisms with BacDive matches: {len(validation)}')

=== H3: Multi-Metal Tolerance Predicts Metal Environments ===
Shared fraction vs metal env: rho=-0.006, p=0.981
N cross-resistance families vs metal env: rho=-0.101, p=0.671
Mean cross-resistance r vs metal env: rho=0.011, p=0.962
N metals tested vs metal env (control): rho=-0.119, p=0.618
N metal-important genes vs metal env: rho=-0.085, p=0.722

# Figure: Multi-metal validation
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

if len(validation) >= 5:
    # Panel 1: Shared fraction vs metal env
    ax = axes[0]
    ax.scatter(validation.shared_fraction, validation.pct_metal_env, 
              s=validation.n_bacdive_strains.clip(upper=200), alpha=0.6, c='steelblue')
    for _, row in validation.iterrows():
        ax.annotate(row.orgId, (row.shared_fraction, row.pct_metal_env), 
                   fontsize=5, alpha=0.7)
    ax.set_xlabel('Shared Gene Fraction', fontsize=10)
    ax.set_ylabel('% BacDive Strains from Metal Env', fontsize=10)
    ax.set_title(f'Shared Genes vs Metal Isolation\n(rho={r1:.3f}, p={p1:.3f})', fontsize=10)
    
    # Panel 2: CR families vs metal env
    ax = axes[1]
    ax.scatter(validation.n_cr_families, validation.pct_metal_env,
              s=validation.n_bacdive_strains.clip(upper=200), alpha=0.6, c='coral')
    for _, row in validation.iterrows():
        ax.annotate(row.orgId, (row.n_cr_families, row.pct_metal_env),
                   fontsize=5, alpha=0.7)
    ax.set_xlabel('N Cross-Resistance Gene Families', fontsize=10)
    ax.set_ylabel('% BacDive Strains from Metal Env', fontsize=10)
    ax.set_title(f'CR Families vs Metal Isolation\n(rho={r2:.3f}, p={p2:.3f})', fontsize=10)
    
    # Panel 3: Distribution of cross-resistance scores by environment
    ax = axes[2]
    has_metal = validation[validation.pct_metal_env > 0]
    no_metal = validation[validation.pct_metal_env == 0]
    
    data = [has_metal.shared_fraction.values, no_metal.shared_fraction.values]
    labels = [f'Metal env\n(n={len(has_metal)})', f'No metal env\n(n={len(no_metal)})']
    bp = ax.boxplot(data, labels=labels, widths=0.5, patch_artist=True)
    bp['boxes'][0].set_facecolor('#d73027')
    bp['boxes'][1].set_facecolor('#4575b4')
    
    if len(has_metal) > 0 and len(no_metal) > 0:
        u, p = stats.mannwhitneyu(has_metal.shared_fraction, no_metal.shared_fraction, 
                                  alternative='greater')
        ax.set_title(f'Shared Fraction by Environment\n(MWU p={p:.3f})', fontsize=10)
    ax.set_ylabel('Shared Gene Fraction', fontsize=10)

plt.tight_layout()
fig.savefig(FIG_DIR / 'multimetal_validation.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved to figures/multimetal_validation.png')

Saved to figures/multimetal_validation.png

# Get gene descriptions for top cross-resistance families
top_families = cr_families.nlargest(20, 'n_organisms')

# Get representative gene descriptions from FB
cr_gene_details = metal_ogs[metal_ogs.OG_id.isin(top_families.OG_id)]
target_orgs = cr_gene_details.orgId.unique().tolist()
org_list = "', '".join(target_orgs)

desc_data = spark.sql(f"""
    SELECT orgId, locusId, `desc`
    FROM kescience_fitnessbrowser.gene
    WHERE orgId IN ('{org_list}') AND type = '1'
""").toPandas()
desc_data['locusId'] = desc_data['locusId'].astype(str)

cr_annotated = cr_gene_details.merge(desc_data, on=['orgId', 'locusId'], how='left')

# Get representative description per family (most common non-hypothetical)
family_desc = []
for og_id in top_families.OG_id:
    og_genes = cr_annotated[cr_annotated.OG_id == og_id]
    descs = og_genes['desc'].dropna()
    non_hyp = descs[~descs.str.contains('hypothetical|uncharacterized', case=False, na=True)]
    rep_desc = non_hyp.mode().values[0] if len(non_hyp) > 0 else (descs.mode().values[0] if len(descs) > 0 else 'unknown')
    n_orgs = top_families[top_families.OG_id == og_id].n_organisms.values[0]
    mean_metals = og_genes.n_metals_important.mean()
    family_desc.append({
        'OG_id': og_id, 'description': rep_desc[:80],
        'n_organisms': n_orgs, 'mean_metals_important': mean_metals
    })

family_desc_df = pd.DataFrame(family_desc)
print('=== Top 20 Conserved Cross-Resistance Gene Families ===')
for _, row in family_desc_df.iterrows():
    print(f"  {row.OG_id}: {row.n_organisms} orgs, {row.mean_metals_important:.1f} metals — {row.description}")

=== Top 20 Conserved Cross-Resistance Gene Families ===
  OG01234: 14 orgs, 2.1 metals — DNA polymerase I
  OG01329: 14 orgs, 2.4 metals — Glutathione synthetase (EC 6.3.2.3)
  OG00363: 12 orgs, 2.1 metals — NAD-dependent dehydratase
  OG00425: 12 orgs, 2.3 metals — elongation factor 4
  OG00742: 12 orgs, 1.7 metals — exoribonuclease R
  OG02259: 12 orgs, 1.8 metals — trigger factor
  OG00583: 11 orgs, 1.8 metals — ATP-dependent DNA helicase UvrD/PcrA
  OG00768: 10 orgs, 2.4 metals — Putative Zn-dependent protease, contains TPR repeats
  OG00973: 10 orgs, 3.2 metals — Glutamate--cysteine ligase (EC 6.3.2.2)
  OG01002: 10 orgs, 2.4 metals — MotA/TolQ/ExbB proton channel (RefSeq)
  OG01653: 10 orgs, 2.2 metals — transcription elongation factor GreA
  OG00135: 9 orgs, 2.3 metals — two component transcriptional regulator, LuxR family
  OG00715: 9 orgs, 2.6 metals — peptide chain release factor 3
  OG00779: 9 orgs, 2.0 metals — ABC-type dipeptide transport system, periplasmic component
  OG01437: 9 orgs, 2.4 metals — ATP-dependent protease La
  OG00417: 8 orgs, 1.8 metals — ABC-type dipeptide/oligopeptide/nickel transport system, permease component
  OG00451: 8 orgs, 2.4 metals — 3-oxoacyl-ACP synthase
  OG00501: 8 orgs, 2.2 metals — aminotransferase DegT
  OG01318: 8 orgs, 2.6 metals — dimethyladenosine transferase
  OG01928: 8 orgs, 2.2 metals — phosphate ABC transporter ATP-binding protein

05 Pangenome Prediction

NB05: Pangenome-Scale Multi-Metal Prediction & BacDive Validation (H3)¶

1. Identify Conserved Cross-Resistance Gene Families¶

2. Per-Organism Multi-Metal Tolerance Scores¶

3. BacDive Validation¶

4. Cross-Resistance Family Characterization¶

Summary¶