import pandas as pd
import numpy as np
from pathlib import Path
from scipy import stats
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Spark session for pangenome lookups
try:
    spark = get_spark_session()
except NameError:
    from berdl_notebook_utils.setup_spark_session import get_spark_session
    spark = get_spark_session()

DATA_DIR = Path('../data')
FIG_DIR = Path('../figures')

summary = pd.read_csv(DATA_DIR / 'extraction_summary.csv')
metal_exps = pd.read_csv(DATA_DIR / 'metal_experiments.csv')
org_exp_counts = pd.read_csv(DATA_DIR / 'organism_experiment_counts.csv')

print(f'Organisms: {len(summary)}')
print('Spark session ready.')

Organisms: 30
Spark session ready.

# For each organism, we need:
# 1. Number of metals where each gene is important (fit < -1)
# 2. Sick rate across non-metal experiments

# Get non-metal experiment fitness data from FB
# We'll compute sick rate = fraction of non-metal experiments where fit < -1

all_classifications = []
target_orgs = summary[summary.n_metals >= 3]['orgId'].tolist()

for idx, org in enumerate(target_orgs):
    # Load metal fitness matrix
    metal_file = DATA_DIR / 'gene_metal_fitness' / f'{org}_metal_fitness.csv'
    if not metal_file.exists():
        continue
    
    metal_mat = pd.read_csv(metal_file, index_col=0)
    metal_mat.index = metal_mat.index.astype(str)
    
    # Count metals where each gene is important (fit < -1)
    metal_important = (metal_mat < -1).sum(axis=1)
    n_metals_tested = metal_mat.notna().sum(axis=1)
    
    # Get non-metal sick rate from FB
    # Query genefitness for all non-metal experiments
    metal_exp_names = set(metal_exps[metal_exps.orgId == org]['expName'])
    
    try:
        all_fit = spark.sql(f"""
            SELECT locusId, expName, CAST(fit AS FLOAT) as fit
            FROM kescience_fitnessbrowser.genefitness
            WHERE orgId = '{org}'
        """).toPandas()
    except Exception as e:
        print(f'  [{idx+1}] {org}: Spark error — {e}')
        continue
    
    all_fit['locusId'] = all_fit['locusId'].astype(str)
    all_fit['fit'] = all_fit['fit'].astype(float)
    
    # Split into metal and non-metal
    nonmetal_fit = all_fit[~all_fit.expName.isin(metal_exp_names)]
    
    # Compute non-metal sick rate per gene
    if len(nonmetal_fit) > 0:
        nonmetal_stats = nonmetal_fit.groupby('locusId').agg(
            n_nonmetal_exps=('fit', 'count'),
            n_sick_nonmetal=('fit', lambda x: (x < -1).sum()),
        ).reset_index()
        nonmetal_stats['sick_rate_nonmetal'] = (
            nonmetal_stats['n_sick_nonmetal'] / nonmetal_stats['n_nonmetal_exps']
        )
    else:
        continue
    
    # Merge metal importance with non-metal sick rate
    gene_df = pd.DataFrame({
        'locusId': metal_mat.index,
        'n_metals_important': metal_important,
        'n_metals_tested': n_metals_tested,
    }).reset_index(drop=True)
    
    gene_df = gene_df.merge(nonmetal_stats[['locusId', 'sick_rate_nonmetal', 'n_nonmetal_exps']], 
                            on='locusId', how='left')
    gene_df['sick_rate_nonmetal'] = gene_df['sick_rate_nonmetal'].fillna(0)
    
    # Classify
    def classify_tier(row):
        if row['n_metals_important'] == 0:
            return 'metal_neutral'
        elif row['sick_rate_nonmetal'] >= 0.50:
            return 'general_stress'
        elif row['n_metals_important'] >= 2:
            return 'metal_shared'
        else:
            return 'metal_specific'
    
    gene_df['tier'] = gene_df.apply(classify_tier, axis=1)
    gene_df['orgId'] = org
    all_classifications.append(gene_df)
    
    # Report
    tier_counts = gene_df.tier.value_counts()
    metal_genes = gene_df[gene_df.tier != 'metal_neutral']
    print(f'  [{idx+1}/{len(target_orgs)}] {org}: '
          f'{len(metal_genes)} metal-important genes — '
          f'specific={tier_counts.get("metal_specific",0)}, '
          f'shared={tier_counts.get("metal_shared",0)}, '
          f'general={tier_counts.get("general_stress",0)}')

class_df = pd.concat(all_classifications, ignore_index=True)
class_df.to_csv(DATA_DIR / 'gene_tier_classification.csv', index=False)

print(f'\nTotal classified genes: {len(class_df)}')
print(f'Tier distribution:')
print(class_df.tier.value_counts().to_string())

  [1/28] DvH: 212 metal-important genes — specific=77, shared=69, general=66

  [2/28] psRCH2: 313 metal-important genes — specific=147, shared=118, general=48

  [3/28] Korea: 150 metal-important genes — specific=79, shared=35, general=36

  [4/28] Dino: 114 metal-important genes — specific=69, shared=25, general=20

  [5/28] Cola: 190 metal-important genes — specific=92, shared=74, general=24

  [6/28] Pedo557: 402 metal-important genes — specific=187, shared=143, general=72

  [7/28] acidovorax_3H11: 245 metal-important genes — specific=124, shared=91, general=30

  [8/28] SB2B: 329 metal-important genes — specific=155, shared=90, general=84

  [9/28] MR1: 312 metal-important genes — specific=130, shared=77, general=105

  [10/28] Marino: 361 metal-important genes — specific=247, shared=88, general=26

# Tier summary statistics
metal_genes = class_df[class_df.tier != 'metal_neutral'].copy()

tier_stats = metal_genes.groupby('tier').agg(
    n_genes=('locusId', 'count'),
    mean_metals_important=('n_metals_important', 'mean'),
    mean_sick_rate=('sick_rate_nonmetal', 'mean'),
    n_organisms=('orgId', 'nunique'),
).reset_index()

# Add percentage
tier_stats['pct'] = tier_stats.n_genes / tier_stats.n_genes.sum() * 100

print('=== Three-Tier Architecture ===')
print(tier_stats.to_string(index=False, float_format='%.2f'))
print()
print(f'Metal-specific: {tier_stats[tier_stats.tier=="metal_specific"].pct.values[0]:.1f}% — '
      f'important for 1 metal, rarely sick otherwise')
print(f'Metal-shared: {tier_stats[tier_stats.tier=="metal_shared"].pct.values[0]:.1f}% — '
      f'important for 2+ metals, core cross-resistance genes')
print(f'General stress: {tier_stats[tier_stats.tier=="general_stress"].pct.values[0]:.1f}% — '
      f'pleiotropic, sick under many conditions')

=== Three-Tier Architecture ===
          tier  n_genes  mean_metals_important  mean_sick_rate  n_organisms   pct
general_stress     1484                   2.96            0.66           28 18.18
  metal_shared     2306                   2.65            0.24           28 28.25
metal_specific     4372                   1.00            0.11           28 53.57

Metal-specific: 53.6% — important for 1 metal, rarely sick otherwise
Metal-shared: 28.3% — important for 2+ metals, core cross-resistance genes
General stress: 18.2% — pleiotropic, sick under many conditions

# Load ortholog groups and conservation data from essential_genome
og_file = Path('../../essential_genome/data/all_ortholog_groups.csv')
conservation_file = Path('../../essential_genome/data/family_conservation.tsv')

# Try local first, then lakehouse
if og_file.exists():
    ogs = pd.read_csv(og_file)
    print(f'Loaded ortholog groups from local: {len(ogs)} entries')
else:
    # Try lakehouse
    lakehouse_path = 's3a://cdm-lake/tenant-general-warehouse/microbialdiscoveryforge/projects/essential_genome/data/all_ortholog_groups.csv'
    ogs = spark.read.csv(lakehouse_path, header=True).toPandas()
    print(f'Loaded ortholog groups from lakehouse: {len(ogs)} entries')

if conservation_file.exists():
    conservation = pd.read_csv(conservation_file, sep='\t')
    print(f'Loaded conservation data from local: {len(conservation)} families')
else:
    lakehouse_path = 's3a://cdm-lake/tenant-general-warehouse/microbialdiscoveryforge/projects/essential_genome/data/family_conservation.tsv'
    conservation = spark.read.csv(lakehouse_path, header=True, sep='\t').toPandas()
    print(f'Loaded conservation data from lakehouse: {len(conservation)} families')

# Ensure types
ogs['locusId'] = ogs['locusId'].astype(str)
if 'pct_core' in conservation.columns:
    conservation['pct_core'] = pd.to_numeric(conservation['pct_core'], errors='coerce')

Loaded ortholog groups from local: 179237 entries
Loaded conservation data from local: 16758 families

# Merge gene classifications with ortholog groups and conservation
# Step 1: gene → OG_id
metal_genes = class_df[class_df.tier != 'metal_neutral'].copy()
metal_genes['locusId'] = metal_genes['locusId'].astype(str)

merged = metal_genes.merge(
    ogs[['orgId', 'locusId', 'OG_id']].drop_duplicates(),
    on=['orgId', 'locusId'], how='left'
)

print(f'Metal-important genes: {len(metal_genes)}')
print(f'Matched to OG: {merged.OG_id.notna().sum()} ({merged.OG_id.notna().mean():.1%})')

# Step 2: OG_id → pct_core
if 'OG_id' in conservation.columns:
    merged = merged.merge(
        conservation[['OG_id', 'pct_core']].drop_duplicates(),
        on='OG_id', how='left'
    )
    print(f'Matched to conservation: {merged.pct_core.notna().sum()} ({merged.pct_core.notna().mean():.1%})')
else:
    # Try alternative column names
    print('Conservation columns:', conservation.columns.tolist())
    # Use 'familyId' or similar
    for col in ['familyId', 'family_id', 'OG']:
        if col in conservation.columns:
            merged = merged.merge(
                conservation[[col, 'pct_core']].rename(columns={col: 'OG_id'}).drop_duplicates(),
                on='OG_id', how='left'
            )
            print(f'Matched via {col}: {merged.pct_core.notna().sum()}')
            break

Metal-important genes: 8162
Matched to OG: 6781 (83.1%)
Matched to conservation: 6769 (82.9%)

# Get functional descriptions from FB gene table
# metal_genes is defined in cell-3 (tier classification output)
metal_genes_classified = class_df[class_df.tier != 'metal_neutral'].copy()
target_orgs = metal_genes_classified.orgId.unique().tolist()
org_list = "', '".join(target_orgs)

desc_data = spark.sql(f"""
    SELECT orgId, locusId, `desc`
    FROM kescience_fitnessbrowser.gene
    WHERE orgId IN ('{org_list}')
      AND type = '1'
""").toPandas()

desc_data['locusId'] = desc_data['locusId'].astype(str)
print(f'Gene descriptions loaded: {len(desc_data)} genes')

# Merge with tier classifications
tier_desc = metal_genes_classified.merge(desc_data, on=['orgId', 'locusId'], how='left')
print(f'Metal-important genes with desc: {tier_desc["desc"].notna().sum()} ({tier_desc["desc"].notna().mean():.1%})')

# Functional keyword enrichment by tier
keywords = {
    'transporter/efflux': ['transport', 'efflux', 'permease', 'ABC', 'pump', 'porter'],
    'cell envelope': ['cell wall', 'membrane', 'lipopolysaccharide', 'peptidoglycan', 'outer membrane'],
    'DNA repair': ['DNA repair', 'recombination', 'SOS', 'RecA', 'UvrA', 'mutS'],
    'iron/metal': ['iron', 'metal', 'zinc', 'copper', 'cobalt', 'nickel', 'siderophore', 'ferr'],
    'oxidative stress': ['oxidase', 'reductase', 'thioredoxin', 'glutathione', 'catalase', 'superoxide'],
    'energy/respiration': ['cytochrome', 'NADH', 'electron transport', 'ATP synthase', 'respiratory'],
    'transcription/regulation': ['transcriptional regulator', 'response regulator', 'sensor', 'sigma'],
    'hypothetical': ['hypothetical', 'uncharacterized', 'DUF', 'unknown function'],
}

results = []
for tier in ['general_stress', 'metal_shared', 'metal_specific']:
    tier_data = tier_desc[tier_desc.tier == tier]
    total = len(tier_data)
    for cat, kws in keywords.items():
        count = tier_data['desc'].fillna('').apply(
            lambda d: any(kw.lower() in d.lower() for kw in kws)).sum()
        results.append({'tier': tier, 'category': cat, 'count': count, 
                       'total': total, 'fraction': count/total if total > 0 else 0})

func_df = pd.DataFrame(results)
func_pivot = func_df.pivot_table(index='category', columns='tier', values='fraction')

print('\nFunctional keyword enrichment by tier (fraction of genes):')
for cat in func_pivot.index:
    gen = func_pivot.loc[cat, 'general_stress'] * 100 if 'general_stress' in func_pivot.columns else 0
    sha = func_pivot.loc[cat, 'metal_shared'] * 100 if 'metal_shared' in func_pivot.columns else 0
    spe = func_pivot.loc[cat, 'metal_specific'] * 100 if 'metal_specific' in func_pivot.columns else 0
    diff = gen - spe
    print(f'  {cat:>25s}: general={gen:.1f}% shared={sha:.1f}% specific={spe:.1f}% (Δ={diff:+.1f}pp)')

Gene descriptions loaded: 131519 genes
Metal-important genes with desc: 8078 (99.0%)

Functional keyword enrichment by tier (fraction of genes):
                 DNA repair: general=0.7% shared=0.4% specific=0.7% (Δ=+0.1pp)
              cell envelope: general=3.5% shared=4.6% specific=4.4% (Δ=-0.9pp)
         energy/respiration: general=2.9% shared=2.2% specific=2.1% (Δ=+0.8pp)
               hypothetical: general=10.8% shared=21.2% specific=26.9% (Δ=-16.1pp)
                 iron/metal: general=1.8% shared=4.0% specific=3.6% (Δ=-1.9pp)
           oxidative stress: general=5.7% shared=3.9% specific=3.9% (Δ=+1.8pp)
   transcription/regulation: general=4.9% shared=6.7% specific=7.8% (Δ=-3.0pp)
         transporter/efflux: general=5.6% shared=10.4% specific=9.2% (Δ=-3.6pp)

# Figure: Functional keyword enrichment by tier
tier_order = ['general_stress', 'metal_shared', 'metal_specific']

if len(func_df) > 0:
    fig, ax = plt.subplots(figsize=(12, 6))
    
    categories = func_pivot.index.tolist()
    x = np.arange(len(categories))
    width = 0.25
    colors = {'general_stress': '#d73027', 'metal_shared': '#fc8d59', 'metal_specific': '#fee090'}
    
    for i, tier in enumerate(tier_order):
        if tier in func_pivot.columns:
            vals = [func_pivot.loc[c, tier] * 100 for c in categories]
            ax.bar(x + i*width, vals, width, label=tier.replace('_', ' ').title(),
                   color=colors.get(tier, 'gray'), edgecolor='black', linewidth=0.5)
    
    ax.set_xticks(x + width)
    ax.set_xticklabels(categories, fontsize=8, rotation=30, ha='right')
    ax.set_ylabel('% of Genes', fontsize=11)
    ax.set_title('Functional Categories by Gene Tier', fontsize=12)
    ax.legend(fontsize=9)
    
    plt.tight_layout()
    fig.savefig(FIG_DIR / 'tier_functional_enrichment.png', dpi=150, bbox_inches='tight')
    plt.show()
    print('Saved to figures/tier_functional_enrichment.png')

Saved to figures/tier_functional_enrichment.png

# Per-organism gradient
# with_conservation is from cell-7 (merged with OG + pct_core)
with_conservation = merged[merged.pct_core.notna()].copy()

per_org_gradient = []

for org in with_conservation.orgId.unique():
    org_data = with_conservation[with_conservation.orgId == org]
    tier_means = org_data.groupby('tier')['pct_core'].mean()
    
    general = tier_means.get('general_stress', np.nan)
    shared = tier_means.get('metal_shared', np.nan)
    specific = tier_means.get('metal_specific', np.nan)
    
    # Check if gradient holds: general > shared > specific
    gradient_holds = False
    if pd.notna(general) and pd.notna(shared) and pd.notna(specific):
        gradient_holds = general > shared > specific
    
    per_org_gradient.append({
        'orgId': org,
        'general_core': general,
        'shared_core': shared,
        'specific_core': specific,
        'gradient_holds': gradient_holds,
        'n_genes': len(org_data),
    })

gradient_df = pd.DataFrame(per_org_gradient)
testable = gradient_df.dropna(subset=['general_core', 'shared_core', 'specific_core'])

print(f'Organisms with all 3 tiers: {len(testable)}')
print(f'Gradient holds (general > shared > specific): '
      f'{testable.gradient_holds.sum()}/{len(testable)} '
      f'({testable.gradient_holds.mean():.1%})')
print()
print(testable.sort_values('gradient_holds', ascending=False).to_string(
    index=False, float_format='%.1f'))

Organisms with all 3 tiers: 28
Gradient holds (general > shared > specific): 10/28 (35.7%)

             orgId  general_core  shared_core  specific_core  gradient_holds  n_genes
               DvH          95.4         93.0           87.4            True      170
              SB2B          93.7         92.7           92.0            True      322
      pseudo3_N2E3          95.9         93.1           88.8            True      250
            Btheta          95.5         92.0           90.4            True      210
             Phaeo          97.5         91.3           90.1            True      118
            Marino          94.4         93.3           88.5            True      287
               MR1          93.6         93.2           90.2            True      287
      pseudo1_N1B4          95.6         88.1           86.9            True      221
   acidovorax_3H11          91.8         91.4           90.2            True      177
              Cola          94.3         94.3           88.3            True      143
           Cup4G11          90.9         85.0           87.8           False      241
              Kang          89.3         90.4           92.9           False      182
              Koxy          89.1         88.3           91.0           False      255
            WCS417          86.3         87.5           92.4           False       87
             Korea          91.8         93.4           86.1           False      120
             Ponti          92.2         89.7           92.6           False      271
              ANA3          95.2         88.3           88.9           False      231
pseudo13_GW456_L13          94.5         95.2           93.4           False      176
              Keio          92.8         94.3           91.0           False      351
           Pedo557          82.3         93.7           91.7           False      284
                PS          87.6         87.7           87.7           False      644
             Smeli          94.0         94.3           91.9           False      239
              Dino          96.2         92.8           94.2           False       81
            psRCH2          92.9         89.2           90.1           False      293
      pseudo6_N2E2          91.7         93.2           89.3           False      246
             Caulo          87.3         92.0           88.9           False      234
               PV4          89.2         90.8           92.1           False      269
    pseudo5_N2C3_1          91.7         92.3           87.4           False      380

04 Shared Vs Specific Genes

NB04: Shared vs Specific Gene Architecture (H2)¶

1. Three-Tier Gene Classification¶

2. Map Genes to Pangenome Conservation¶

3. Per-Organism H2 Consistency¶

Summary¶