import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

DATA_DIR = Path('../data')
MATRIX_DIR = DATA_DIR / 'gene_metal_fitness'
CORR_DIR = DATA_DIR / 'cross_resistance_matrices'
CORR_DIR.mkdir(exist_ok=True)
FIG_DIR = Path('../figures')

# Load extraction summary
summary = pd.read_csv(DATA_DIR / 'extraction_summary.csv')
print(f'Organisms available: {len(summary)}')
print(f'Organisms with >=3 metals: {(summary.n_metals >= 3).sum()}')
print(f'Organisms with >=5 metals: {(summary.n_metals >= 5).sum()}')

Organisms available: 30
Organisms with >=3 metals: 28
Organisms with >=5 metals: 13

all_pairs = []  # Collect all pairwise correlations
org_corr_matrices = {}  # Store full matrices

for _, row in summary.iterrows():
    org = row['orgId']
    infile = MATRIX_DIR / f'{org}_metal_fitness.csv'
    
    if not infile.exists():
        continue
    
    mat = pd.read_csv(infile, index_col=0)
    
    # Need at least 3 metals for a meaningful matrix
    if mat.shape[1] < 3:
        continue
    
    # Compute pairwise Pearson correlations with p-values
    metals = list(mat.columns)
    corr_matrix = mat.corr(method='pearson')
    
    # Save per-organism matrix
    corr_matrix.to_csv(CORR_DIR / f'{org}_metal_corr.csv')
    org_corr_matrices[org] = corr_matrix
    
    # Extract all pairs with p-values
    for i, m1 in enumerate(metals):
        for j, m2 in enumerate(metals):
            if i < j:
                # Get complete cases for this pair
                valid = mat[[m1, m2]].dropna()
                n_genes = len(valid)
                if n_genes < 30:  # minimum for meaningful correlation
                    continue
                r, p = stats.pearsonr(valid[m1], valid[m2])
                all_pairs.append({
                    'orgId': org, 'metal1': m1, 'metal2': m2,
                    'r': r, 'p': p, 'n_genes': n_genes,
                    'n_metals_org': len(metals),
                    'pair': f'{min(m1,m2)}-{max(m1,m2)}'
                })

pairs_df = pd.DataFrame(all_pairs)
pairs_df.to_csv(DATA_DIR / 'all_metal_pairs.csv', index=False)

print(f'Computed cross-resistance for {len(org_corr_matrices)} organisms')
print(f'Total metal pairs: {len(pairs_df)}')
print(f'Unique metal pairs: {pairs_df.pair.nunique()}')
print(f'Pairs with p < 0.001: {(pairs_df.p < 0.001).sum()} ({(pairs_df.p < 0.001).mean():.1%})')
print(f'Pairs with p < 0.05: {(pairs_df.p < 0.05).sum()} ({(pairs_df.p < 0.05).mean():.1%})')

Computed cross-resistance for 28 organisms
Total metal pairs: 317
Unique metal pairs: 85
Pairs with p < 0.001: 309 (97.5%)
Pairs with p < 0.05: 314 (99.1%)

# Aggregate by metal pair
pair_summary = pairs_df.groupby('pair').agg(
    mean_r=('r', 'mean'),
    median_r=('r', 'median'),
    std_r=('r', 'std'),
    min_r=('r', 'min'),
    max_r=('r', 'max'),
    n_orgs=('orgId', 'nunique'),
    pct_positive=('r', lambda x: (x > 0).mean()),
    pct_sig=('p', lambda x: (x < 0.05).mean()),
    mean_n_genes=('n_genes', 'mean'),
).reset_index()

# Sign consistency: what fraction of organisms agree on the direction
pair_summary['sign_consistency'] = pair_summary['pct_positive'].apply(
    lambda x: max(x, 1-x)  # e.g., 90% positive → 90% consistent
)

pair_summary = pair_summary.sort_values('n_orgs', ascending=False)

print('=== Metal Pair Summary (sorted by number of organisms) ===')
print(f'Pairs tested in >=5 organisms: {(pair_summary.n_orgs >= 5).sum()}')
print(f'Pairs tested in >=10 organisms: {(pair_summary.n_orgs >= 10).sum()}')
print()
print(pair_summary[pair_summary.n_orgs >= 3].to_string(index=False, float_format='%.3f'))

=== Metal Pair Summary (sorted by number of organisms) ===
Pairs tested in >=5 organisms: 15
Pairs tested in >=10 organisms: 10

 pair  mean_r  median_r  std_r  min_r  max_r  n_orgs  pct_positive  pct_sig  mean_n_genes  sign_consistency
Co-Ni   0.559     0.580  0.188  0.204  0.915      28         1.000    1.000      3903.571             1.000
Cu-Ni   0.429     0.421  0.219 -0.080  0.897      24         0.958    1.000      3769.958             0.958
Co-Cu   0.433     0.404  0.224 -0.017  0.902      24         0.958    0.958      3769.958             0.958
Al-Ni   0.337     0.333  0.210 -0.053  0.679      20         0.950    0.950      3892.450             0.950
Al-Co   0.302     0.281  0.197 -0.064  0.737      20         0.950    1.000      3892.450             0.950
Ni-Zn   0.513     0.487  0.218  0.127  0.974      18         1.000    1.000      3593.389             1.000
Co-Zn   0.518     0.500  0.198  0.252  0.861      18         1.000    1.000      3593.389             1.000
Al-Cu   0.340     0.329  0.227 -0.060  0.801      17         0.941    1.000      3847.765             0.941
Cu-Zn   0.477     0.483  0.217 -0.070  0.787      16         0.938    1.000      3371.688             0.938
Al-Zn   0.441     0.438  0.166  0.120  0.711      13         1.000    1.000      3588.846             1.000
Fe-Ni   0.375     0.352  0.268  0.105  0.837       7         1.000    1.000      3538.429             1.000
Cu-Fe   0.455     0.473  0.195  0.181  0.777       7         1.000    1.000      3538.429             1.000
Co-Fe   0.453     0.562  0.283  0.020  0.791       7         1.000    0.857      3538.429             1.000
Fe-Zn   0.614     0.620  0.084  0.475  0.731       6         1.000    1.000      3496.667             1.000
Al-Fe   0.381     0.293  0.313  0.048  0.720       5         1.000    1.000      3517.400             1.000
 Ni-U   0.464     0.446  0.137  0.337  0.610       3         1.000    1.000      3381.667             1.000
 Al-U   0.193     0.102  0.173  0.085  0.394       3         1.000    1.000      3381.667             1.000
 Co-U   0.422     0.434  0.179  0.238  0.595       3         1.000    1.000      3381.667             1.000
 Cu-U   0.513     0.575  0.293  0.194  0.769       3         1.000    1.000      3381.667             1.000
 U-Zn   0.356     0.248  0.306  0.120  0.702       3         1.000    1.000      3381.667             1.000

# Build consensus correlation matrix from pairs tested in >=3 organisms
well_tested = pair_summary[pair_summary.n_orgs >= 3].copy()

# Get all metals that appear in well-tested pairs
metals_in_pairs = set()
for pair in well_tested['pair']:
    m1, m2 = pair.split('-')
    metals_in_pairs.add(m1)
    metals_in_pairs.add(m2)
metals_sorted = sorted(metals_in_pairs)

# Build symmetric matrix
consensus = pd.DataFrame(np.eye(len(metals_sorted)), 
                         index=metals_sorted, columns=metals_sorted)
n_orgs_matrix = pd.DataFrame(0, index=metals_sorted, columns=metals_sorted, dtype=int)

for _, row in well_tested.iterrows():
    m1, m2 = row['pair'].split('-')
    consensus.loc[m1, m2] = row['mean_r']
    consensus.loc[m2, m1] = row['mean_r']
    n_orgs_matrix.loc[m1, m2] = row['n_orgs']
    n_orgs_matrix.loc[m2, m1] = row['n_orgs']

consensus.to_csv(DATA_DIR / 'consensus_cross_resistance.csv')
n_orgs_matrix.to_csv(DATA_DIR / 'consensus_n_organisms.csv')

print('Consensus cross-resistance matrix (mean r across organisms):')
print(consensus.round(3).to_string())
print()
print('Number of organisms per pair:')
print(n_orgs_matrix.to_string())

Consensus cross-resistance matrix (mean r across organisms):
       Al     Co     Cu     Fe     Ni      U     Zn
Al  1.000  0.302  0.340  0.381  0.337  0.193  0.441
Co  0.302  1.000  0.433  0.453  0.559  0.422  0.518
Cu  0.340  0.433  1.000  0.455  0.429  0.513  0.477
Fe  0.381  0.453  0.455  1.000  0.375  0.000  0.614
Ni  0.337  0.559  0.429  0.375  1.000  0.464  0.513
U   0.193  0.422  0.513  0.000  0.464  1.000  0.356
Zn  0.441  0.518  0.477  0.614  0.513  0.356  1.000

Number of organisms per pair:
    Al  Co  Cu  Fe  Ni  U  Zn
Al   0  20  17   5  20  3  13
Co  20   0  24   7  28  3  18
Cu  17  24   0   7  24  3  16
Fe   5   7   7   0   7  0   6
Ni  20  28  24   7   0  3  18
U    3   3   3   0   3  0   3
Zn  13  18  16   6  18  3   0

# H1 Test 1: Sign consistency
# For well-tested pairs (>=5 organisms), what fraction have >70% sign agreement?
robust_pairs = pair_summary[pair_summary.n_orgs >= 5].copy()

print('=== H1 Test: Sign Consistency ===')
print(f'Metal pairs tested in >=5 organisms: {len(robust_pairs)}')
print(f'Pairs with >70% sign consistency: {(robust_pairs.sign_consistency > 0.70).sum()} '
      f'({(robust_pairs.sign_consistency > 0.70).mean():.1%})')
print(f'Pairs with >80% sign consistency: {(robust_pairs.sign_consistency > 0.80).sum()} '
      f'({(robust_pairs.sign_consistency > 0.80).mean():.1%})')
print(f'Pairs with >90% sign consistency: {(robust_pairs.sign_consistency > 0.90).sum()} '
      f'({(robust_pairs.sign_consistency > 0.90).mean():.1%})')
print(f'Pairs with 100% positive: {(robust_pairs.pct_positive == 1.0).sum()}')
print()

# All pairs are positive (cross-resistance, not cross-sensitivity)
print(f'Mean sign consistency: {robust_pairs.sign_consistency.mean():.3f}')
print(f'Mean pct_positive: {robust_pairs.pct_positive.mean():.3f}')
print()

# Binomial test: is 100% positive significant?
n_all_positive = (robust_pairs.pct_positive == 1.0).sum()
n_total = len(robust_pairs)
binom_result = stats.binomtest(n_all_positive, n_total, 0.5)
binom_p = binom_result.pvalue
print(f'Binomial test (all pairs positive vs 50% chance): '
      f'{n_all_positive}/{n_total} = {n_all_positive/n_total:.1%}, p = {binom_p:.2e}')

=== H1 Test: Sign Consistency ===
Metal pairs tested in >=5 organisms: 15
Pairs with >70% sign consistency: 15 (100.0%)
Pairs with >80% sign consistency: 15 (100.0%)
Pairs with >90% sign consistency: 15 (100.0%)
Pairs with 100% positive: 9

Mean sign consistency: 0.980
Mean pct_positive: 0.980

Binomial test (all pairs positive vs 50% chance): 9/15 = 60.0%, p = 6.07e-01

# H1 Test 2: Rank-order conservation
# For organisms with >=5 metals, do they agree on which metal pairs are most/least correlated?
# Use Kendall's W (coefficient of concordance) across organisms

# Get organisms with >=5 metals
rich_orgs = summary[summary.n_metals >= 5]['orgId'].tolist()

# Find metal pairs present in ALL rich organisms
# The common set is likely {Al, Co, Cu, Ni, Zn} tested in 12 organisms
common_metals = None
for org in rich_orgs:
    if org in org_corr_matrices:
        metals_org = set(org_corr_matrices[org].columns)
        if common_metals is None:
            common_metals = metals_org
        else:
            common_metals = common_metals & metals_org

common_metals = sorted(common_metals) if common_metals else []
print(f'Common metals across {len(rich_orgs)} rich organisms: {common_metals}')

# Extract the common-metal correlation values for each organism
if len(common_metals) >= 3:
    # Build pairs from common metals
    common_pairs = []
    for i, m1 in enumerate(common_metals):
        for j, m2 in enumerate(common_metals):
            if i < j:
                common_pairs.append((m1, m2))
    
    # For each organism, get the correlation for each common pair
    rank_data = []
    for org in rich_orgs:
        if org not in org_corr_matrices:
            continue
        corr_mat = org_corr_matrices[org]
        org_corrs = []
        for m1, m2 in common_pairs:
            if m1 in corr_mat.columns and m2 in corr_mat.columns:
                org_corrs.append(corr_mat.loc[m1, m2])
            else:
                org_corrs.append(np.nan)
        rank_data.append(org_corrs)
    
    rank_df = pd.DataFrame(rank_data, columns=[f'{m1}-{m2}' for m1, m2 in common_pairs])
    rank_df.index = [org for org in rich_orgs if org in org_corr_matrices]
    
    # Compute pairwise Spearman correlations between organisms
    # (do they agree on the ranking of metal pairs?)
    org_agreement = rank_df.T.corr(method='spearman')
    
    # Mean inter-organism agreement
    upper_tri = org_agreement.values[np.triu_indices_from(org_agreement.values, k=1)]
    print(f'\nMean inter-organism Spearman rho (rank agreement): {upper_tri.mean():.3f}')
    print(f'Median: {np.median(upper_tri):.3f}')
    print(f'Range: [{upper_tri.min():.3f}, {upper_tri.max():.3f}]')
    print(f'Fraction positive: {(upper_tri > 0).mean():.1%}')
    
    # One-sample t-test: is mean rho > 0?
    t, p = stats.ttest_1samp(upper_tri, 0)
    print(f'One-sample t-test (rho > 0): t = {t:.2f}, p = {p:.2e}')
    
    # Show the rank matrix
    print(f'\nCorrelation values for common metals ({len(common_pairs)} pairs × {len(rank_df)} organisms):')
    print(rank_df.round(3).to_string())

Common metals across 13 rich organisms: ['Co', 'Cu', 'Ni', 'Zn']

Mean inter-organism Spearman rho (rank agreement): -0.017
Median: 0.029
Range: [-0.886, 0.943]
Fraction positive: 51.3%
One-sample t-test (rho > 0): t = -0.29, p = 7.69e-01

Correlation values for common metals (6 pairs × 13 organisms):
                 Co-Cu  Co-Ni  Co-Zn  Cu-Ni  Cu-Zn  Ni-Zn
DvH              0.659  0.323  0.502  0.341  0.355  0.358
psRCH2           0.417  0.577  0.381  0.449  0.129  0.387
Korea            0.548  0.513  0.635  0.491  0.526  0.565
Dino             0.570  0.431  0.498  0.165  0.455  0.127
Cola             0.506  0.716  0.826  0.405  0.495  0.680
Pedo557          0.790  0.866  0.711  0.622  0.472  0.784
acidovorax_3H11  0.337  0.505  0.709  0.437  0.541  0.499
SB2B             0.319  0.221  0.339  0.446  0.766  0.494
MR1              0.471  0.619  0.291  0.651  0.729  0.479
Marino           0.588  0.740  0.252  0.719  0.417  0.400
Phaeo            0.325  0.601  0.386  0.378  0.536  0.555
PV4              0.698  0.707  0.354  0.671  0.464  0.474
Btheta           0.902  0.915  0.606  0.897  0.592  0.627

# Figure 1: Panel of cross-resistance heatmaps for top organisms
top_orgs = summary.nlargest(9, 'n_metals')['orgId'].tolist()
top_orgs = [o for o in top_orgs if o in org_corr_matrices]

n_panels = min(len(top_orgs), 9)
ncols = 3
nrows = (n_panels + ncols - 1) // ncols

fig, axes = plt.subplots(nrows, ncols, figsize=(5*ncols, 4.5*nrows))
axes = axes.flatten() if n_panels > 1 else [axes]

for idx, org in enumerate(top_orgs[:n_panels]):
    ax = axes[idx]
    corr = org_corr_matrices[org]
    n_metals = len(corr)
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                vmin=-0.2, vmax=1, square=True, ax=ax,
                linewidths=0.5, linecolor='white',
                annot_kws={'size': 7 if n_metals > 6 else 9})
    ax.set_title(f'{org} ({n_metals} metals)', fontsize=10, fontweight='bold')
    ax.tick_params(labelsize=7)

# Hide unused panels
for idx in range(n_panels, len(axes)):
    axes[idx].set_visible(False)

fig.suptitle('Metal Cross-Resistance Matrices Across Organisms\n'
             '(Gene-level Pearson correlation of fitness profiles)',
             fontsize=14, y=1.02)
plt.tight_layout()
fig.savefig(FIG_DIR / 'cross_resistance_panel.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved to figures/cross_resistance_panel.png')

Saved to figures/cross_resistance_panel.png

# Figure 2: Metal pair conservation — distribution of r values across organisms
# For pairs tested in >=5 organisms, show boxplots
well_tested_pairs = pair_summary[pair_summary.n_orgs >= 5].sort_values('mean_r', ascending=False)

fig, ax = plt.subplots(figsize=(14, 6))

pair_order = well_tested_pairs['pair'].tolist()
plot_data = pairs_df[pairs_df['pair'].isin(pair_order)]

sns.boxplot(data=plot_data, x='pair', y='r', order=pair_order, ax=ax,
            color='steelblue', width=0.6, fliersize=3)
sns.stripplot(data=plot_data, x='pair', y='r', order=pair_order, ax=ax,
              color='black', size=3, alpha=0.5, jitter=0.15)

ax.axhline(0, color='red', linestyle='--', alpha=0.5, linewidth=0.8)
ax.set_xlabel('Metal Pair', fontsize=11)
ax.set_ylabel('Pearson r (gene fitness correlation)', fontsize=11)
ax.set_title('Cross-Resistance Conservation Across Organisms\n'
             f'(pairs tested in ≥5 organisms; n={len(pair_order)} pairs)',
             fontsize=12)
ax.tick_params(axis='x', rotation=45, labelsize=8)

# Add n_orgs annotation
for i, pair in enumerate(pair_order):
    n = well_tested_pairs[well_tested_pairs.pair == pair]['n_orgs'].values[0]
    ax.text(i, ax.get_ylim()[1] * 0.95, f'n={n}', ha='center', va='top', fontsize=6)

plt.tight_layout()
fig.savefig(FIG_DIR / 'metal_pair_conservation.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved to figures/metal_pair_conservation.png')

Saved to figures/metal_pair_conservation.png

# Figure 3: Hierarchical clustering of consensus matrix
# Only use metals with >=3 well-tested pairs
metals_with_data = [m for m in consensus.columns 
                    if (consensus.loc[m] != 1.0).sum() >= 2]  # at least 2 non-self entries
consensus_sub = consensus.loc[metals_with_data, metals_with_data]

# Convert correlation to distance: d = 1 - r
dist_matrix = 1 - consensus_sub.values
np.fill_diagonal(dist_matrix, 0)

# Ensure symmetry and non-negative
dist_matrix = (dist_matrix + dist_matrix.T) / 2
dist_matrix = np.clip(dist_matrix, 0, 2)

# Hierarchical clustering
condensed = squareform(dist_matrix)
Z = linkage(condensed, method='average')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7), 
                                gridspec_kw={'width_ratios': [1, 1.5]})

# Dendrogram
dend = dendrogram(Z, labels=metals_with_data, ax=ax1, leaf_rotation=0,
                  leaf_font_size=11, orientation='left', color_threshold=0.5)
ax1.set_xlabel('Distance (1 - mean r)', fontsize=11)
ax1.set_title('Metal Clustering\n(average linkage on consensus correlations)', fontsize=11)

# Reorder consensus matrix by dendrogram
order = dend['ivl'][::-1]
consensus_ordered = consensus_sub.loc[order, order]

sns.heatmap(consensus_ordered, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
            vmin=-0.2, vmax=1, square=True, ax=ax2,
            linewidths=0.5, linecolor='white', annot_kws={'size': 10})
ax2.set_title('Consensus Cross-Resistance Matrix\n(mean Pearson r across organisms)', fontsize=11)

plt.tight_layout()
fig.savefig(FIG_DIR / 'metal_clustering_dendrogram.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved to figures/metal_clustering_dendrogram.png')

Saved to figures/metal_clustering_dendrogram.png

# Figure 4: Inter-organism agreement heatmap
if len(common_metals) >= 3:
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(org_agreement, annot=True, fmt='.2f', cmap='RdBu_r', center=0,
                vmin=-1, vmax=1, square=True, ax=ax,
                linewidths=0.5, linecolor='white', annot_kws={'size': 7})
    ax.set_title(f'Inter-Organism Agreement on Metal Pair Rankings\n'
                 f'(Spearman rho on {len(common_pairs)} common metal pairs: {", ".join(common_metals)})',
                 fontsize=11)
    ax.tick_params(labelsize=8)
    plt.tight_layout()
    fig.savefig(FIG_DIR / 'organism_agreement_heatmap.png', dpi=150, bbox_inches='tight')
    plt.show()
    print('Saved to figures/organism_agreement_heatmap.png')

Saved to figures/organism_agreement_heatmap.png

# Most and least cross-resistant pairs
well_tested = pair_summary[pair_summary.n_orgs >= 5].copy()

print('=== Strongest Cross-Resistance (highest mean r) ===')
top10 = well_tested.nlargest(10, 'mean_r')
for _, row in top10.iterrows():
    print(f"  {row['pair']:>6s}: r = {row['mean_r']:.3f} ± {row['std_r']:.3f} "
          f"(n={row['n_orgs']} orgs, {row['pct_positive']:.0%} positive, "
          f"{row['pct_sig']:.0%} significant)")

print()
print('=== Weakest Cross-Resistance (lowest mean r) ===')
bottom10 = well_tested.nsmallest(10, 'mean_r')
for _, row in bottom10.iterrows():
    print(f"  {row['pair']:>6s}: r = {row['mean_r']:.3f} ± {row['std_r']:.3f} "
          f"(n={row['n_orgs']} orgs, {row['pct_positive']:.0%} positive, "
          f"{row['pct_sig']:.0%} significant)")

print()
print('=== Overall Statistics ===')
print(f'Mean r across all well-tested pairs: {well_tested.mean_r.mean():.3f}')
print(f'All pairs positive: {(well_tested.pct_positive == 1.0).all()}')
print(f'Mean sign consistency: {well_tested.sign_consistency.mean():.3f}')

=== Strongest Cross-Resistance (highest mean r) ===
   Fe-Zn: r = 0.614 ± 0.084 (n=6 orgs, 100% positive, 100% significant)
   Co-Ni: r = 0.559 ± 0.188 (n=28 orgs, 100% positive, 100% significant)
   Co-Zn: r = 0.518 ± 0.198 (n=18 orgs, 100% positive, 100% significant)
   Ni-Zn: r = 0.513 ± 0.218 (n=18 orgs, 100% positive, 100% significant)
   Cu-Zn: r = 0.477 ± 0.217 (n=16 orgs, 94% positive, 100% significant)
   Cu-Fe: r = 0.455 ± 0.195 (n=7 orgs, 100% positive, 100% significant)
   Co-Fe: r = 0.453 ± 0.283 (n=7 orgs, 100% positive, 86% significant)
   Al-Zn: r = 0.441 ± 0.166 (n=13 orgs, 100% positive, 100% significant)
   Co-Cu: r = 0.433 ± 0.224 (n=24 orgs, 96% positive, 96% significant)
   Cu-Ni: r = 0.429 ± 0.219 (n=24 orgs, 96% positive, 100% significant)

=== Weakest Cross-Resistance (lowest mean r) ===
   Al-Co: r = 0.302 ± 0.197 (n=20 orgs, 95% positive, 100% significant)
   Al-Ni: r = 0.337 ± 0.210 (n=20 orgs, 95% positive, 95% significant)
   Al-Cu: r = 0.340 ± 0.227 (n=17 orgs, 94% positive, 100% significant)
   Fe-Ni: r = 0.375 ± 0.268 (n=7 orgs, 100% positive, 100% significant)
   Al-Fe: r = 0.381 ± 0.313 (n=5 orgs, 100% positive, 100% significant)
   Cu-Ni: r = 0.429 ± 0.219 (n=24 orgs, 96% positive, 100% significant)
   Co-Cu: r = 0.433 ± 0.224 (n=24 orgs, 96% positive, 96% significant)
   Al-Zn: r = 0.441 ± 0.166 (n=13 orgs, 100% positive, 100% significant)
   Co-Fe: r = 0.453 ± 0.283 (n=7 orgs, 100% positive, 86% significant)
   Cu-Fe: r = 0.455 ± 0.195 (n=7 orgs, 100% positive, 100% significant)

=== Overall Statistics ===
Mean r across all well-tested pairs: 0.442
All pairs positive: False
Mean sign consistency: 0.980

# Define expected chemical groups
chem_groups = {
    'Co': 'divalent', 'Ni': 'divalent', 'Zn': 'divalent', 
    'Mn': 'divalent', 'Cu': 'divalent',
    'Mo': 'oxyanion', 'W': 'oxyanion', 'Se': 'oxyanion', 'Cr': 'oxyanion',
    'Fe': 'essential', 
    'Al': 'trivalent',
    'Hg': 'heavy', 'U': 'heavy', 'Cd': 'heavy',
}

# For well-tested pairs, check if within-group correlations > between-group
well_tested_with_groups = well_tested.copy()
well_tested_with_groups['m1'] = well_tested_with_groups['pair'].str.split('-').str[0]
well_tested_with_groups['m2'] = well_tested_with_groups['pair'].str.split('-').str[1]
well_tested_with_groups['g1'] = well_tested_with_groups['m1'].map(chem_groups)
well_tested_with_groups['g2'] = well_tested_with_groups['m2'].map(chem_groups)
well_tested_with_groups['same_group'] = well_tested_with_groups['g1'] == well_tested_with_groups['g2']

within = well_tested_with_groups[well_tested_with_groups.same_group]
between = well_tested_with_groups[~well_tested_with_groups.same_group]

print('=== Chemical Group Validation ===')
print(f'Within-group pairs: n={len(within)}, mean r = {within.mean_r.mean():.3f}')
print(f'Between-group pairs: n={len(between)}, mean r = {between.mean_r.mean():.3f}')

if len(within) > 0 and len(between) > 0:
    t, p = stats.mannwhitneyu(within.mean_r, between.mean_r, alternative='greater')
    effect = within.mean_r.mean() - between.mean_r.mean()
    print(f'Mann-Whitney U (within > between): p = {p:.4f}, delta = {effect:+.3f}')
    print()
    
    print('Within-group pairs:')
    for _, row in within.sort_values('mean_r', ascending=False).iterrows():
        print(f"  {row['pair']} ({row['g1']}): r = {row['mean_r']:.3f}")
    print()
    print('Highest between-group pairs (unexpected cross-resistance):')
    for _, row in between.nlargest(5, 'mean_r').iterrows():
        print(f"  {row['pair']} ({row['g1']}/{row['g2']}): r = {row['mean_r']:.3f}")

=== Chemical Group Validation ===
Within-group pairs: n=6, mean r = 0.488
Between-group pairs: n=9, mean r = 0.411
Mann-Whitney U (within > between): p = 0.0440, delta = +0.077

Within-group pairs:
  Co-Ni (divalent): r = 0.559
  Co-Zn (divalent): r = 0.518
  Ni-Zn (divalent): r = 0.513
  Cu-Zn (divalent): r = 0.477
  Co-Cu (divalent): r = 0.433
  Cu-Ni (divalent): r = 0.429

Highest between-group pairs (unexpected cross-resistance):
  Fe-Zn (essential/divalent): r = 0.614
  Cu-Fe (divalent/essential): r = 0.455
  Co-Fe (divalent/essential): r = 0.453
  Al-Zn (trivalent/divalent): r = 0.441
  Al-Fe (trivalent/essential): r = 0.381

02 Cross Resistance Matrices

NB02: Per-Organism Cross-Resistance Matrices¶

1. Compute Per-Organism Cross-Resistance Matrices¶

2. Consensus Cross-Resistance Matrix¶

3. Test H1: Is Cross-Resistance Conserved?¶

4. Visualizations¶

5. Strongest and Weakest Cross-Resistance Pairs¶

6. Chemical Grouping Validation¶

Summary¶