import pandas as pd
import numpy as np
import os, warnings
from scipy import stats
warnings.filterwarnings('ignore')

DATA = '../data'
FIG  = '../figures'
os.makedirs(DATA, exist_ok=True)
os.makedirs(FIG, exist_ok=True)

spark = get_spark_session()
print('Spark session ready')

# Load Phase 1/2 data
sp_comp = pd.read_csv(f'{DATA}/species_compartment.csv')
refined = pd.read_csv(f'{DATA}/species_cohort_refined.csv')
novel_ogs = pd.read_csv(f'{DATA}/novel_og_annotations.csv')
host_sp = pd.read_csv(f'{DATA}/genome_host_species.csv')

print(f'Pangenome species: {len(sp_comp):,}')
print(f'Plant-associated species: {sp_comp["is_plant_associated"].sum():,}')
print(f'Refined cohort species: {len(refined):,}')
print(f'Novel OGs: {len(novel_ogs)}')

Spark session ready
Pangenome species: 26,511
Plant-associated species: 1,136
Refined cohort species: 25,660
Novel OGs: 50

CACHE_BRIDGE = f'{DATA}/mgnify_pangenome_bridge.csv'

if os.path.exists(CACHE_BRIDGE) and os.path.getsize(CACHE_BRIDGE) > 100:
    mgnify_species = pd.read_csv(CACHE_BRIDGE)
    print(f'Loaded cached bridge: {len(mgnify_species)} rows')
else:
    # Get all MGnify species with taxonomy across biomes
    mgnify_species = spark.sql("""
        SELECT s.species_id,
               s.biome_id,
               s.gtdb_genus,
               s.gtdb_species,
               COUNT(DISTINCT g.genome_id) AS n_genomes
        FROM kescience_mgnify.species s
        JOIN kescience_mgnify.genome g 
             ON s.species_id = g.species_id AND s.biome_id = g.biome_id
        WHERE s.biome_id IN ('tomato-rhizosphere', 'maize-rhizosphere',
                             'barley-rhizosphere', 'soil')
        GROUP BY s.species_id, s.biome_id, s.gtdb_genus, s.gtdb_species
    """).toPandas()
    mgnify_species.to_csv(CACHE_BRIDGE, index=False)
    print(f'Retrieved MGnify species: {len(mgnify_species)} rows')

print(f'\nMGnify species by biome:')
print(mgnify_species.groupby('biome_id').agg(
    n_species=('species_id', 'nunique'),
    n_genera=('gtdb_genus', 'nunique'),
    n_genomes=('n_genomes', 'sum')
))

# Extract pangenome genus names (strip g__ prefix for matching)
pan_genera = set(sp_comp['genus'].dropna().unique())
# MGnify genus names — check format
mg_genera = set(mgnify_species['gtdb_genus'].dropna().unique())

# Check if MGnify uses 'g__' prefix or bare names
print(f'\nMGnify genus format examples: {list(mg_genera)[:5]}')
print(f'Pangenome genus format examples: {list(pan_genera)[:5]}')

# Build genus-level bridge
# If MGnify uses bare names, add g__ prefix for matching
if not any(g.startswith('g__') for g in list(mg_genera)[:10] if g):
    mgnify_species['pan_genus'] = 'g__' + mgnify_species['gtdb_genus'].fillna('')
else:
    mgnify_species['pan_genus'] = mgnify_species['gtdb_genus']

genus_overlap = pan_genera & set(mgnify_species['pan_genus'].unique())
print(f'\nGenus-level overlap: {len(genus_overlap)} genera')

# Species-level bridge: normalize MGnify species names to match pangenome format
# Pangenome: s__Genus_species--RS_GCFxxx  MGnify: Genus species or similar
# We'll do genus-level matching since species naming conventions differ
mgnify_species['has_pangenome_match'] = mgnify_species['pan_genus'].isin(pan_genera)
matched = mgnify_species[mgnify_species['has_pangenome_match']]
print(f'MGnify species with pangenome genus match: {matched["species_id"].nunique()}')
print(f'MGnify genomes in matched genera: {matched["n_genomes"].sum()}')

Loaded cached bridge: 20473 rows

MGnify species by biome:
                    n_species  n_genera  n_genomes
biome_id                                          
barley-rhizosphere         86        52        103
maize-rhizosphere         336       212        431
soil                    19472      4229      20908
tomato-rhizosphere        579       309        666

MGnify genus format examples: ['DASWKP01', 'DATLFI01', 'DAOUEX01', 'PALSA-600', 'DASPNM01']
Pangenome genus format examples: ['g__JAACPJ01', 'g__MMGLQ5-1', 'g__MGBC157735', 'g__Lautropia', 'g__Pusillimonas_C']

Genus-level overlap: 1185 genera
MGnify species with pangenome genus match: 9603
MGnify genomes in matched genera: 10888

# Build genus × biome presence matrix for rhizosphere biomes
rhizo_biomes = ['tomato-rhizosphere', 'maize-rhizosphere', 'barley-rhizosphere']
rhizo = mgnify_species[mgnify_species['biome_id'].isin(rhizo_biomes)]

# Genus-level presence per biome
genus_biome = (rhizo.groupby(['pan_genus', 'biome_id'])
    .agg(n_species=('species_id', 'nunique'),
         n_genomes=('n_genomes', 'sum'))
    .reset_index())

# Pivot to matrix
genus_biome_matrix = genus_biome.pivot_table(
    index='pan_genus', columns='biome_id', values='n_species', fill_value=0)

# Filter to genera in pangenome
genus_biome_matrix = genus_biome_matrix[genus_biome_matrix.index.isin(pan_genera)]

print(f'Genera in rhizosphere biomes (with pangenome match): {len(genus_biome_matrix)}')

# Genera in all 3 biomes
in_all = genus_biome_matrix[(genus_biome_matrix > 0).all(axis=1)]
print(f'Genera in ALL 3 rhizosphere biomes: {len(in_all)}')

# Genera specific to one biome
for biome in rhizo_biomes:
    other_biomes = [b for b in rhizo_biomes if b != biome]
    specific = genus_biome_matrix[
        (genus_biome_matrix[biome] > 0) & 
        (genus_biome_matrix[other_biomes] == 0).all(axis=1)
    ]
    print(f'Genera unique to {biome}: {len(specific)}')

# Save
host_spec = genus_biome_matrix.copy()
host_spec['in_all_3'] = (host_spec > 0).all(axis=1)
host_spec['n_biomes'] = (host_spec[rhizo_biomes] > 0).sum(axis=1)

# Merge with pangenome cohort info
genus_cohort = (refined.groupby('genus')['cohort_refined']
    .agg(lambda x: x.value_counts().index[0]).reset_index()
    .rename(columns={'cohort_refined': 'dominant_cohort'}))

host_spec = host_spec.reset_index().merge(
    genus_cohort, left_on='pan_genus', right_on='genus', how='left')

host_spec.to_csv(f'{DATA}/mgnify_host_specificity.csv', index=False)
print(f'\nSaved: {DATA}/mgnify_host_specificity.csv')

# Show top shared genera with their cohorts
print(f'\nTop genera in all 3 rhizosphere biomes:')
shared = host_spec[host_spec['in_all_3'] == True].sort_values(
    rhizo_biomes, ascending=False).head(20)
print(shared[['pan_genus', 'dominant_cohort'] + rhizo_biomes].to_string(index=False))

Genera in rhizosphere biomes (with pangenome match): 265
Genera in ALL 3 rhizosphere biomes: 17
Genera unique to tomato-rhizosphere: 117
Genera unique to maize-rhizosphere: 54
Genera unique to barley-rhizosphere: 5

Saved: ../data/mgnify_host_specificity.csv

Top genera in all 3 rhizosphere biomes:
             pan_genus dominant_cohort  tomato-rhizosphere  maize-rhizosphere  barley-rhizosphere
      g__Pseudomonas_E     dual-nature                18.0               14.0                 7.0
       g__Streptomyces     dual-nature                18.0                7.0                 1.0
         g__Variovorax     dual-nature                 5.0                3.0                 1.0
           g__Telluria     dual-nature                 5.0                2.0                 4.0
         g__Acidovorax     dual-nature                 5.0                2.0                 1.0
   g__Stenotrophomonas     dual-nature                 4.0                5.0                 1.0
       g__Enterobacter     dual-nature                 4.0                3.0                 1.0
            g__Devosia     dual-nature                 4.0                2.0                 2.0
        g__Sphingobium     dual-nature                 4.0                2.0                 2.0
      g__Luteolibacter      pathogenic                 4.0                2.0                 1.0
g__Pseudoxanthomonas_A     dual-nature                 4.0                1.0                 2.0
       g__Arthrobacter     dual-nature                 3.0                2.0                 2.0
   g__Mucilaginibacter     dual-nature                 3.0                1.0                 3.0
      g__Agrobacterium     dual-nature                 2.0                1.0                 1.0
         g__Pedobacter     dual-nature                 1.0                1.0                 5.0
   g__Phenylobacterium     dual-nature                 1.0                1.0                 1.0
           g__Priestia     dual-nature                 1.0                1.0                 1.0

CACHE_KEGG_BIOME = f'{DATA}/mgnify_kegg_biome_profiles.csv'

if os.path.exists(CACHE_KEGG_BIOME) and os.path.getsize(CACHE_KEGG_BIOME) > 100:
    kegg_biome = pd.read_csv(CACHE_KEGG_BIOME)
    print(f'Loaded cached KEGG biome profiles: {len(kegg_biome)} rows')
else:
    # Get KEGG module profiles per biome (all modules, not just plant-specific)
    kegg_biome = spark.sql("""
        SELECT km.biome_id,
               km.kegg_module_id,
               SUM(km.gene_count) AS total_genes,
               COUNT(DISTINCT km.genome_id) AS n_genomes
        FROM kescience_mgnify.genome_kegg_module km
        WHERE km.biome_id IN ('tomato-rhizosphere', 'maize-rhizosphere',
                              'barley-rhizosphere', 'soil')
        GROUP BY km.biome_id, km.kegg_module_id
    """).toPandas()
    kegg_biome.to_csv(CACHE_KEGG_BIOME, index=False)
    print(f'Retrieved KEGG biome profiles: {len(kegg_biome)} rows')

# Normalize by number of genomes per biome
biome_genome_counts = mgnify_species.groupby('biome_id')['n_genomes'].sum()
kegg_biome = kegg_biome.merge(
    biome_genome_counts.reset_index().rename(columns={'n_genomes': 'biome_total_genomes'}),
    on='biome_id', how='left')
kegg_biome['prevalence'] = kegg_biome['n_genomes'] / kegg_biome['biome_total_genomes']

# Plant-interaction modules
plant_modules = {
    'M00175': 'Nitrogen fixation',
    'M00332': 'Type III secretion',
    'M00333': 'Type IV secretion',
    'M00048': 'IAA biosynthesis',
    'M00127': 'Thiamine biosynthesis',
    'M00116': 'Menaquinone biosynthesis',
    'M00019': 'Valine/isoleucine biosynthesis',
    'M00432': 'Leucine biosynthesis',
}

print('KEGG Module Prevalence Across Biomes (plant-interaction relevant):')
print(f'{"Module":<8} {"Name":<25} ', end='')
for b in ['tomato-rhizosphere', 'maize-rhizosphere', 'barley-rhizosphere', 'soil']:
    print(f'{b[:6]:>8}', end='')
print()

for mod_id, name in plant_modules.items():
    print(f'{mod_id:<8} {name:<25} ', end='')
    for biome in ['tomato-rhizosphere', 'maize-rhizosphere', 'barley-rhizosphere', 'soil']:
        row = kegg_biome[(kegg_biome['biome_id'] == biome) & 
                         (kegg_biome['kegg_module_id'] == mod_id)]
        if len(row) > 0:
            print(f'{row.iloc[0]["prevalence"]:>7.1%}', end='')
        else:
            print(f'{"0.0%":>8}', end='')
    print()

# Identify modules differentially present in rhizosphere vs soil
# Pivot to biome × module prevalence matrix
kegg_pivot = kegg_biome.pivot_table(
    index='kegg_module_id', columns='biome_id', values='prevalence', fill_value=0)

if 'soil' in kegg_pivot.columns:
    for biome in rhizo_biomes:
        if biome in kegg_pivot.columns:
            kegg_pivot[f'{biome}_vs_soil'] = kegg_pivot[biome] - kegg_pivot['soil']
    
    # Most enriched in rhizosphere vs soil
    for biome in rhizo_biomes:
        col = f'{biome}_vs_soil'
        if col in kegg_pivot.columns:
            top_enriched = kegg_pivot.nlargest(5, col)[[biome, 'soil', col]]
            print(f'\nTop 5 modules enriched in {biome} vs soil:')
            print(top_enriched)

Loaded cached KEGG biome profiles: 2630 rows
KEGG Module Prevalence Across Biomes (plant-interaction relevant):
Module   Name                        tomato  maize-  barley    soil
M00175   Nitrogen fixation            2.0%   2.8%    0.0%   2.5%
M00332   Type III secretion          24.0%  21.6%  22.3%  12.3%
M00333   Type IV secretion           24.8%  22.5%  34.0%  20.7%
M00048   IAA biosynthesis            82.9%  75.6%  79.6%  76.9%
M00127   Thiamine biosynthesis       76.7%  70.8%  74.8%  67.9%
M00116   Menaquinone biosynthesis    79.0%  73.3%  76.7%  73.0%
M00019   Valine/isoleucine biosynthesis   80.6%  73.8%  79.6%  75.5%
M00432   Leucine biosynthesis        71.6%  70.1%  70.9%  70.3%

Top 5 modules enriched in tomato-rhizosphere vs soil:
biome_id        tomato-rhizosphere      soil  tomato-rhizosphere_vs_soil
kegg_module_id                                                          
M00238                    0.367868  0.077721                    0.290146
M00769                    0.496997  0.219198                    0.277799
M00136                    0.490991  0.218385                    0.272606
M00615                    0.453453  0.183327                    0.270126
M00438                    0.363363  0.106084                    0.257280

Top 5 modules enriched in maize-rhizosphere vs soil:
biome_id        maize-rhizosphere      soil  maize-rhizosphere_vs_soil
kegg_module_id                                                        
M00238                   0.385151  0.077721                   0.307429
M00615                   0.426914  0.183327                   0.243587
M00344                   0.591647  0.401999                   0.189648
M00190                   0.568445  0.381433                   0.187013
M00769                   0.401392  0.219198                   0.182194

Top 5 modules enriched in barley-rhizosphere vs soil:
biome_id        barley-rhizosphere      soil  barley-rhizosphere_vs_soil
kegg_module_id                                                          
M00238                    0.543689  0.077721                    0.465968
M00769                    0.563107  0.219198                    0.343908
M00615                    0.485437  0.183327                    0.302110
M00344                    0.699029  0.401999                    0.297030
M00136                    0.514563  0.218385                    0.296178

CACHE_MOBILOME = f'{DATA}/mgnify_mobilome.csv'

if os.path.exists(CACHE_MOBILOME) and os.path.getsize(CACHE_MOBILOME) > 100:
    mobilome = pd.read_csv(CACHE_MOBILOME)
    print(f'Loaded cached mobilome data: {len(mobilome)} rows')
else:
    # Schema: element_id, genome_id, biome_id, contig_id, start_pos, end_pos,
    #         tool, mobile_element_type, taxonomy, flanking_site
    mobilome = spark.sql("""
        SELECT s.gtdb_genus,
               s.gtdb_species,
               mob.mobile_element_type,
               mob.tool,
               COUNT(*) AS n_elements,
               COUNT(DISTINCT mob.genome_id) AS n_genomes
        FROM kescience_mgnify.gene_mobilome mob
        JOIN kescience_mgnify.genome g
             ON mob.genome_id = g.genome_id AND mob.biome_id = g.biome_id
        JOIN kescience_mgnify.species s
             ON g.species_id = s.species_id AND g.biome_id = s.biome_id
        WHERE mob.biome_id = 'soil'
        GROUP BY s.gtdb_genus, s.gtdb_species, mob.mobile_element_type, mob.tool
    """).toPandas()
    mobilome.to_csv(CACHE_MOBILOME, index=False)
    print(f'Retrieved mobilome data: {len(mobilome)} rows')

print(f'\nMobile element types:')
print(mobilome.groupby('mobile_element_type').agg(
    n_entries=('n_elements', 'sum'),
    n_genera=('gtdb_genus', 'nunique')
))

print(f'\nDetection tools:')
print(mobilome.groupby('tool')['n_elements'].sum())

# Bridge to pangenome genera
mobilome['pan_genus'] = 'g__' + mobilome['gtdb_genus'].fillna('')
mob_pangenome = mobilome[mobilome['pan_genus'].isin(pan_genera)]
print(f'\nMobilome entries in pangenome genera: {len(mob_pangenome)}')
print(f'Pangenome genera with mobilome data: {mob_pangenome["pan_genus"].nunique()}')

# Compare mobilome burden: plant-associated vs non-plant genera
plant_genera = set(refined[refined['is_plant_associated'] == True]['genus'].dropna().unique())

mob_genus_summary = (mob_pangenome.groupby('pan_genus')
    .agg(total_elements=('n_elements', 'sum'),
         n_genomes=('n_genomes', 'sum'),
         n_types=('mobile_element_type', 'nunique'))
    .reset_index())
mob_genus_summary['elements_per_genome'] = (mob_genus_summary['total_elements'] / 
                                             mob_genus_summary['n_genomes'])
mob_genus_summary['is_plant'] = mob_genus_summary['pan_genus'].isin(plant_genera)

plant_mob = mob_genus_summary[mob_genus_summary['is_plant']]['elements_per_genome']
nonplant_mob = mob_genus_summary[~mob_genus_summary['is_plant']]['elements_per_genome']

if len(plant_mob) > 5 and len(nonplant_mob) > 5:
    stat, pval = stats.mannwhitneyu(plant_mob, nonplant_mob, alternative='two-sided')
    print(f'\nMobilome burden (elements/genome):')
    print(f'  Plant-associated genera: median={plant_mob.median():.1f}, n={len(plant_mob)}')
    print(f'  Non-plant genera: median={nonplant_mob.median():.1f}, n={len(nonplant_mob)}')
    print(f'  Mann-Whitney U p={pval:.2e}')

Retrieved mobilome data: 17323 rows

Mobile element types:
                                  n_entries  n_genera
mobile_element_type                                  
AICE_with_Rep_and_Tra                     7         5
AICE_without_identified_DR              194        81
CDS                                    3023       298
ICE_with_T4SS                            24        21
ICE_without_identified_DR               226       129
IME                                     123        53
IME_without_identified_DR               997       386
IS110_with_TIR                         5446       750
IS1182_with_TIR                        1763       385
IS1380_with_TIR                         867       219
IS1595_with_TIR                        1161       254
IS1634_with_TIR                         772       168
IS1_with_TIR                             16         4
IS200/IS605_without_TIR                 688       164
IS21_with_TIR                          1860       470
IS256_with_TIR                         1848       444
IS30_with_TIR                           569       221
IS3_with_TIR                           2715       609
IS481_with_TIR                         1245       391
IS4_with_TIR                           1177       285
IS4_without_TIR                          45        24
IS5_with_TIR                           2667       440
IS5_without_TIR                          70        29
IS607_with_TIR                           12         5
IS630_with_TIR                         3154       457
IS66_with_TIR                          1367       300
IS6_with_TIR                            379        74
IS6_without_TIR                          23        10
IS701_with_TIR                          801       212
IS91_with_TIR                           557       154
IS982_with_TIR                          189        75
ISAS1_with_TIR                          503       122
ISAS1_without_TIR                        31        10
ISAZO13_with_TIR                        194        70
ISH3_with_TIR                            42        13
ISH3_without_TIR                          2         1
ISKRA4_with_TIR                         151        50
ISL3_with_TIR                           859       270
ISNCY_with_TIR                         1138       280
attC_site                              2960       237
complete_integron                       423       237
direct_repeat_element                   306        68
new_with_TIR                            175        90
phage_circular                           26        19
phage_linear                            418       104
phage_plasmid                             3         3
plasmid                               13315      1071
prophage                               1136       391
terminal_inverted_repeat_element      63254      1445
viral_sequence                         2445       776

Detection tools:
tool
ICEfinder          1877
ISEScan           95740
IntegronFinder     3383
Prodigal           3023
VIRify              717
geNomad           16623
geNomad_VIRify        3
Name: n_elements, dtype: int64

Mobilome entries in pangenome genera: 8423
Pangenome genera with mobilome data: 563

Mobilome burden (elements/genome):
  Plant-associated genera: median=3.7, n=109
  Non-plant genera: median=2.8, n=454
  Mann-Whitney U p=1.49e-05

CACHE_BGC = f'{DATA}/mgnify_bgc_profiles.csv'

if os.path.exists(CACHE_BGC) and os.path.getsize(CACHE_BGC) > 100:
    bgc_profiles = pd.read_csv(CACHE_BGC)
    print(f'Loaded cached BGC profiles: {len(bgc_profiles)} rows')
else:
    # NB09 already has the summary at genus level — re-query with more detail
    bgc_profiles = spark.sql("""
        SELECT s.gtdb_genus,
               bgc.nearest_mibig_class,
               COUNT(*) AS n_bgcs,
               COUNT(DISTINCT bgc.genome_id) AS n_genomes,
               AVG(bgc.score) AS avg_score
        FROM kescience_mgnify.gene_bgc bgc
        JOIN kescience_mgnify.genome g 
             ON bgc.genome_id = g.genome_id AND bgc.biome_id = g.biome_id
        JOIN kescience_mgnify.species s 
             ON g.species_id = s.species_id AND g.biome_id = s.biome_id
        WHERE bgc.biome_id = 'soil'
        GROUP BY s.gtdb_genus, bgc.nearest_mibig_class
    """).toPandas()
    bgc_profiles.to_csv(CACHE_BGC, index=False)
    print(f'Retrieved BGC profiles: {len(bgc_profiles)} rows')

bgc_profiles['pan_genus'] = 'g__' + bgc_profiles['gtdb_genus'].fillna('')
bgc_pan = bgc_profiles[bgc_profiles['pan_genus'].isin(pan_genera)]

print(f'BGC entries in pangenome genera: {len(bgc_pan)}')
print(f'Pangenome genera with BGC data: {bgc_pan["pan_genus"].nunique()}')

# BGC class distribution
print(f'\nBGC class distribution (pangenome genera):')
print(bgc_pan.groupby('nearest_mibig_class').agg(
    total_bgcs=('n_bgcs', 'sum'),
    n_genera=('pan_genus', 'nunique')
).sort_values('total_bgcs', ascending=False))

# Compare plant vs non-plant genera
bgc_pan['is_plant'] = bgc_pan['pan_genus'].isin(plant_genera)

bgc_by_group = bgc_pan.groupby(['is_plant', 'nearest_mibig_class'])['n_bgcs'].sum().unstack(fill_value=0)
if len(bgc_by_group) > 0:
    print(f'\nBGC classes — plant-associated vs non-plant genera:')
    for group in [True, False]:
        if group in bgc_by_group.index:
            label = 'Plant' if group else 'Non-plant'
            total = bgc_by_group.loc[group].sum()
            print(f'\n  {label} genera (total BGCs: {total:,}):')
            for cls in bgc_by_group.columns:
                n = bgc_by_group.loc[group, cls]
                pct = n / total * 100 if total > 0 else 0
                print(f'    {cls}: {n:,} ({pct:.1f}%)')

# Siderophore-type BGCs in plant-associated genera (relevant for PGP)
siderophore_bgc = bgc_pan[
    bgc_pan['nearest_mibig_class'].str.contains('Siderophore|NRP', case=False, na=False) &
    bgc_pan['is_plant']
]
if len(siderophore_bgc) > 0:
    print(f'\nPlant-associated genera with NRP/siderophore BGCs: {siderophore_bgc["pan_genus"].nunique()}')
    print(siderophore_bgc.groupby('pan_genus')['n_bgcs'].sum().sort_values(ascending=False).head(10))

Retrieved BGC profiles: 8089 rows
BGC entries in pangenome genera: 2823
Pangenome genera with BGC data: 579

BGC class distribution (pangenome genera):
                             total_bgcs  n_genera
nearest_mibig_class                              
Polyketide                         3795       443
Saccharide                         3024       472
Other                              2794       422
NRP                                2421       333
Terpene                            2251       343
RiPP                               2053       351
NRP Polyketide                      714       159
NRP Saccharide                      187        92
Alkaloid                            146        66
Polyketide Saccharide                71        35
NRP Terpene                          57        33
NRP Other Polyketide                 25        19
Polyketide Terpene                   20        15
Other Saccharide                     16        13
Alkaloid Terpene                      9         5
Alkaloid Polyketide                   9         8
NRP Polyketide Saccharide             6         5
Other Polyketide Saccharide           4         4
Saccharide Terpene                    3         3
NRP RiPP                              2         1
Other Polyketide                      1         1

BGC classes — plant-associated vs non-plant genera:

  Plant genera (total BGCs: 6,484):
    Alkaloid: 54 (0.8%)
    Alkaloid Polyketide: 3 (0.0%)
    Alkaloid Terpene: 4 (0.1%)
    NRP: 902 (13.9%)
    NRP Other Polyketide: 7 (0.1%)
    NRP Polyketide: 374 (5.8%)
    NRP Polyketide Saccharide: 1 (0.0%)
    NRP RiPP: 2 (0.0%)
    NRP Saccharide: 51 (0.8%)
    NRP Terpene: 14 (0.2%)
    Other: 1,025 (15.8%)
    Other Polyketide: 1 (0.0%)
    Other Polyketide Saccharide: 2 (0.0%)
    Other Saccharide: 3 (0.0%)
    Polyketide: 1,501 (23.1%)
    Polyketide Saccharide: 33 (0.5%)
    Polyketide Terpene: 5 (0.1%)
    RiPP: 818 (12.6%)
    Saccharide: 858 (13.2%)
    Saccharide Terpene: 1 (0.0%)
    Terpene: 825 (12.7%)

  Non-plant genera (total BGCs: 11,124):
    Alkaloid: 92 (0.8%)
    Alkaloid Polyketide: 6 (0.1%)
    Alkaloid Terpene: 5 (0.0%)
    NRP: 1,519 (13.7%)
    NRP Other Polyketide: 18 (0.2%)
    NRP Polyketide: 340 (3.1%)
    NRP Polyketide Saccharide: 5 (0.0%)
    NRP RiPP: 0 (0.0%)
    NRP Saccharide: 136 (1.2%)
    NRP Terpene: 43 (0.4%)
    Other: 1,769 (15.9%)
    Other Polyketide: 0 (0.0%)
    Other Polyketide Saccharide: 2 (0.0%)
    Other Saccharide: 13 (0.1%)
    Polyketide: 2,294 (20.6%)
    Polyketide Saccharide: 38 (0.3%)
    Polyketide Terpene: 15 (0.1%)
    RiPP: 1,235 (11.1%)
    Saccharide: 2,166 (19.5%)
    Saccharide Terpene: 2 (0.0%)
    Terpene: 1,426 (12.8%)

Plant-associated genera with NRP/siderophore BGCs: 84
pan_genus
g__Mycobacterium     406
g__Pseudomonas_E     125
g__UBA5704           124
g__Streptomyces      113
g__Bradyrhizobium     53
g__Amycolatopsis      46
g__Nocardioides       39
g__Bog-105            28
g__Arthrobacter       25
g__Chitinophaga       24
Name: n_bgcs, dtype: int64

# Genus-level cross-validation
# Pangenome: plant-associated genera
pan_plant_genus = set(sp_comp[sp_comp['is_plant_associated'] == True]['genus'].dropna())

# MGnify: genera in rhizosphere biomes
mg_rhizo_genus = set(mgnify_species[
    mgnify_species['biome_id'].isin(rhizo_biomes)]['pan_genus'].dropna())

# MGnify: genera in soil biome
mg_soil_genus = set(mgnify_species[
    mgnify_species['biome_id'] == 'soil']['pan_genus'].dropna())

# Overlap analysis
both_plant = pan_plant_genus & mg_rhizo_genus
pan_only = pan_plant_genus - mg_rhizo_genus
mg_only = mg_rhizo_genus - pan_plant_genus

print(f'Genus-level classification concordance:')
print(f'  Pangenome plant-associated genera: {len(pan_plant_genus)}')
print(f'  MGnify rhizosphere genera: {len(mg_rhizo_genus)}')
print(f'  Overlap (both): {len(both_plant)}')
print(f'  Pangenome-only plant genera: {len(pan_only)}')
print(f'  MGnify-only rhizosphere genera: {len(mg_only)}')

concordance = len(both_plant) / len(pan_plant_genus | mg_rhizo_genus) if len(pan_plant_genus | mg_rhizo_genus) > 0 else 0
print(f'  Jaccard concordance: {concordance:.1%}')

# Notable discrepancies
print(f'\nTop pangenome plant genera NOT in MGnify rhizosphere:')
pan_only_refined = refined[refined['genus'].isin(pan_only)]
pan_only_counts = pan_only_refined.groupby('genus').size().sort_values(ascending=False).head(10)
for g, n in pan_only_counts.items():
    print(f'  {g}: {n} species in pangenome')

# Do pangenome plant genera appear in MGnify soil instead?
pan_plant_in_soil = pan_plant_genus & mg_soil_genus
print(f'\nPangenome plant genera found in MGnify soil (not rhizosphere): '
      f'{len(pan_plant_in_soil - mg_rhizo_genus)}')

Genus-level classification concordance:
  Pangenome plant-associated genera: 487
  MGnify rhizosphere genera: 438
  Overlap (both): 97
  Pangenome-only plant genera: 390
  MGnify-only rhizosphere genera: 341
  Jaccard concordance: 11.7%

Top pangenome plant genera NOT in MGnify rhizosphere:
  g__Streptococcus: 200 species in pangenome
  g__Corynebacterium: 117 species in pangenome
  g__Vibrio: 115 species in pangenome
  g__Bradyrhizobium: 76 species in pangenome
  g__Rhizobium: 63 species in pangenome
  g__Halomonas: 59 species in pangenome
  g__Clostridium: 50 species in pangenome
  g__Micromonospora: 44 species in pangenome
  g__Lactobacillus: 40 species in pangenome
  g__Acetobacter: 31 species in pangenome

Pangenome plant genera found in MGnify soil (not rhizosphere): 93

# Novel OGs were identified by their eggNOG_OGs IDs
# We can check if their COG categories are enriched in rhizosphere vs soil

# Load the NB09 novel OG annotations with COG categories
novel_detail = pd.read_csv(f'{DATA}/novel_og_annotations.csv')
print(f'Novel OGs: {len(novel_detail)}')

# Get COG categories for novel OGs
if 'cog_category' in novel_detail.columns:
    novel_cogs = novel_detail['cog_category'].dropna().unique()
    print(f'Novel OG COG categories: {novel_cogs}')
elif 'COG_category' in novel_detail.columns:
    novel_cogs = novel_detail['COG_category'].dropna().unique()
    print(f'Novel OG COG categories: {novel_cogs}')
else:
    print('COG category column not found in novel OG annotations')
    print('Available columns:', novel_detail.columns.tolist())
    novel_cogs = []

# Load MGnify COG profiles from NB09 cache
cog_profiles = pd.read_csv(f'{DATA}/mgnify_rhizo_vs_soil_cog.csv')
print(f'\nMGnify COG profiles: {len(cog_profiles)} biome-category pairs')

# Normalize COG proportions per biome
cog_totals = cog_profiles.groupby('biome_id')['total_genes'].sum().reset_index()
cog_totals.columns = ['biome_id', 'biome_total_genes']
cog_profiles = cog_profiles.merge(cog_totals, on='biome_id')
cog_profiles['proportion'] = cog_profiles['total_genes'] / cog_profiles['biome_total_genes']

# Compare rhizosphere vs soil proportions for novel OG COG categories
rhizo_cog = cog_profiles[cog_profiles['biome_id'].isin(rhizo_biomes)]
soil_cog = cog_profiles[cog_profiles['biome_id'] == 'soil']

# Average across rhizosphere biomes
rhizo_avg = rhizo_cog.groupby('cog_category')['proportion'].mean().reset_index()
rhizo_avg.columns = ['cog_category', 'rhizo_prop']
soil_avg = soil_cog[['cog_category', 'proportion']].rename(columns={'proportion': 'soil_prop'})

cog_compare = rhizo_avg.merge(soil_avg, on='cog_category', how='outer').fillna(0)
cog_compare['enrichment'] = cog_compare['rhizo_prop'] / cog_compare['soil_prop'].clip(lower=1e-6)
cog_compare = cog_compare.sort_values('enrichment', ascending=False)

print(f'\nCOG category enrichment (rhizosphere vs soil):')
print(cog_compare[['cog_category', 'rhizo_prop', 'soil_prop', 'enrichment']].head(15).to_string(index=False))

# Check if novel OG COG categories are enriched in rhizosphere
if len(novel_cogs) > 0:
    print(f'\nNovel OG COG categories in MGnify enrichment:')
    for cog in novel_cogs:
        row = cog_compare[cog_compare['cog_category'] == cog]
        if len(row) > 0:
            r = row.iloc[0]
            direction = 'enriched' if r['enrichment'] > 1 else 'depleted'
            print(f'  {cog}: rhizo={r["rhizo_prop"]:.4f} soil={r["soil_prop"]:.4f} '
                  f'{r["enrichment"]:.2f}x ({direction})')
        else:
            print(f'  {cog}: not found in MGnify COG data')

Novel OGs: 50
Novel OG COG categories: <ArrowStringArray>
['C', 'S', 'P', 'O', 'CH', 'F', 'G', 'Q', 'L', 'E', 'H', 'I', 'K']
Length: 13, dtype: str

MGnify COG profiles: 96 biome-category pairs

COG category enrichment (rhizosphere vs soil):
cog_category  rhizo_prop  soil_prop  enrichment
           N    0.016322   0.011287    1.446040
           W    0.000311   0.000226    1.376627
           -    0.067707   0.055409    1.221944
           K    0.076670   0.064983    1.179845
           P    0.057165   0.049116    1.163883
           U    0.023100   0.021326    1.083196
           S    0.187457   0.175793    1.066347
           G    0.061043   0.057813    1.055866
           A    0.000183   0.000178    1.029480
           T    0.051939   0.051092    1.016581
           M    0.061171   0.061924    0.987836
           H    0.036587   0.037534    0.974779
           D    0.010472   0.010816    0.968198
           F    0.022420   0.023161    0.967999
           Z    0.000475   0.000503    0.944820

Novel OG COG categories in MGnify enrichment:
  C: rhizo=0.0564 soil=0.0704 0.80x (depleted)
  S: rhizo=0.1875 soil=0.1758 1.07x (enriched)
  P: rhizo=0.0572 soil=0.0491 1.16x (enriched)
  O: rhizo=0.0311 soil=0.0346 0.90x (depleted)
  CH: not found in MGnify COG data
  F: rhizo=0.0224 soil=0.0232 0.97x (depleted)
  G: rhizo=0.0610 soil=0.0578 1.06x (enriched)
  Q: rhizo=0.0270 soil=0.0332 0.81x (depleted)
  L: rhizo=0.0385 soil=0.0482 0.80x (depleted)
  E: rhizo=0.0774 soil=0.0820 0.94x (depleted)
  H: rhizo=0.0366 soil=0.0375 0.97x (depleted)
  I: rhizo=0.0378 soil=0.0437 0.87x (depleted)
  K: rhizo=0.0767 soil=0.0650 1.18x (enriched)

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(16, 14))

# Panel A: Genus overlap Venn-like bar chart
categories = ['Pangenome\nonly', 'Both', 'MGnify\nonly']
values = [len(pan_only), len(both_plant), len(mg_only)]
colors_a = ['#4ECDC4', '#FF6B6B', '#45B7D1']
axes[0,0].bar(categories, values, color=colors_a, edgecolor='black')
axes[0,0].set_ylabel('Number of genera')
axes[0,0].set_title('A) Plant-Associated Genera: Pangenome vs MGnify')
for i, v in enumerate(values):
    axes[0,0].text(i, v + 2, str(v), ha='center', fontweight='bold')

# Panel B: Host-specificity across rhizosphere biomes
biome_genus_counts = {
    biome: genus_biome_matrix[biome][genus_biome_matrix[biome] > 0].shape[0]
    for biome in rhizo_biomes if biome in genus_biome_matrix.columns
}
if biome_genus_counts:
    biome_labels = [b.replace('-rhizosphere', '') for b in biome_genus_counts.keys()]
    axes[0,1].bar(biome_labels, list(biome_genus_counts.values()), 
                  color=['tomato', 'gold', 'wheat'], edgecolor='black')
    axes[0,1].set_ylabel('Number of genera (with pangenome match)')
    axes[0,1].set_title('B) Genera Per Rhizosphere Biome')
    if len(in_all) > 0:
        axes[0,1].axhline(y=len(in_all), color='red', linestyle='--', 
                          label=f'In all 3: {len(in_all)}')
        axes[0,1].legend()

# Panel C: BGC class distribution plant vs non-plant
if len(bgc_by_group) > 0:
    top_classes = bgc_pan.groupby('nearest_mibig_class')['n_bgcs'].sum().nlargest(6).index
    x_pos = np.arange(len(top_classes))
    w = 0.35
    for i, group in enumerate([True, False]):
        if group in bgc_by_group.index:
            vals = [bgc_by_group.loc[group].get(cls, 0) for cls in top_classes]
            label = 'Plant-associated' if group else 'Non-plant'
            color = 'forestgreen' if group else 'gray'
            axes[1,0].bar(x_pos + (i-0.5)*w, vals, w, label=label, color=color, alpha=0.7)
    axes[1,0].set_xticks(x_pos)
    axes[1,0].set_xticklabels(top_classes, rotation=45, ha='right', fontsize=8)
    axes[1,0].set_ylabel('Number of BGCs')
    axes[1,0].set_title('C) BGC Classes: Plant vs Non-Plant Genera (Soil)')
    axes[1,0].legend()

# Panel D: COG enrichment (rhizosphere vs soil)
if len(cog_compare) > 0:
    top_cog = cog_compare.head(15)
    y_pos = np.arange(len(top_cog))
    colors_d = ['forestgreen' if e > 1 else 'gray' for e in top_cog['enrichment']]
    axes[1,1].barh(y_pos, top_cog['enrichment'], color=colors_d, edgecolor='black', alpha=0.7)
    axes[1,1].set_yticks(y_pos)
    axes[1,1].set_yticklabels(top_cog['cog_category'], fontsize=8)
    axes[1,1].axvline(x=1.0, color='red', linestyle='--', alpha=0.5)
    axes[1,1].set_xlabel('Enrichment (Rhizosphere / Soil)')
    axes[1,1].set_title('D) COG Category Enrichment in Rhizosphere')
    axes[1,1].invert_yaxis()

plt.tight_layout()
plt.savefig(f'{FIG}/mgnify_integration.png', dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved: {FIG}/mgnify_integration.png')

Saved: ../figures/mgnify_integration.png

print('\n' + '='*80)
print('NB11 SUMMARY: MGnify Integration & Cross-Validation')
print('='*80)

print(f'\n1. Taxonomy Bridge:')
print(f'   MGnify species across 4 biomes: {mgnify_species["species_id"].nunique():,}')
print(f'   Genus-level overlap with pangenome: {len(genus_overlap)}')
print(f'   MGnify species with pangenome genus match: {matched["species_id"].nunique()}')

print(f'\n2. Host-Specificity (H6):')
print(f'   Genera in all 3 rhizosphere biomes: {len(in_all)}')
for biome in rhizo_biomes:
    if biome in genus_biome_matrix.columns:
        n = (genus_biome_matrix[biome] > 0).sum()
        print(f'   Genera in {biome}: {n}')

print(f'\n3. Cross-Validation:')
print(f'   Pangenome plant genera: {len(pan_plant_genus)}')
print(f'   MGnify rhizosphere genera: {len(mg_rhizo_genus)}')
print(f'   Overlap: {len(both_plant)} (Jaccard={concordance:.1%})')

print(f'\n4. Mobilome (H4 validation):')
if len(mobilome) > 0:
    print(f'   Total mobilome entries: {len(mobilome)}')
    print(f'   Pangenome genera with mobilome data: {mob_pangenome["pan_genus"].nunique()}')

print(f'\n5. BGC Profiles:')
print(f'   Pangenome genera with BGC data: {bgc_pan["pan_genus"].nunique()}')
print(f'   Total BGCs in pangenome genera: {bgc_pan["n_bgcs"].sum():,}')

print(f'\nOutputs:')
print(f'  data/mgnify_pangenome_bridge.csv')
print(f'  data/mgnify_host_specificity.csv')
print(f'  data/mgnify_mobilome.csv')
print(f'  data/mgnify_bgc_profiles.csv')
print(f'  data/mgnify_kegg_biome_profiles.csv')
print(f'  figures/mgnify_integration.png')

================================================================================
NB11 SUMMARY: MGnify Integration & Cross-Validation
================================================================================

1. Taxonomy Bridge:
   MGnify species across 4 biomes: 20,473
   Genus-level overlap with pangenome: 1185
   MGnify species with pangenome genus match: 9603

2. Host-Specificity (H6):
   Genera in all 3 rhizosphere biomes: 17
   Genera in tomato-rhizosphere: 200
   Genera in maize-rhizosphere: 131
   Genera in barley-rhizosphere: 40

3. Cross-Validation:
   Pangenome plant genera: 487
   MGnify rhizosphere genera: 438
   Overlap: 97 (Jaccard=11.7%)

4. Mobilome (H4 validation):
   Total mobilome entries: 17323
   Pangenome genera with mobilome data: 563

5. BGC Profiles:
   Pangenome genera with BGC data: 579
   Total BGCs in pangenome genera: 17,608

Outputs:
  data/mgnify_pangenome_bridge.csv
  data/mgnify_host_specificity.csv
  data/mgnify_mobilome.csv
  data/mgnify_bgc_profiles.csv
  data/mgnify_kegg_biome_profiles.csv
  figures/mgnify_integration.png

11 Mgnify Integration

NB11 — MGnify Integration & Cross-Validation¶

1. MGnify–Pangenome Taxonomy Bridge¶

2. Host-Specificity Analysis (H6)¶

3. KEGG Module Profiles Across Biomes¶

4. Mobilome Data for H4 Validation¶

5. BGC Profiles for Plant-Associated Genera¶

6. Cross-Validate Pangenome vs MGnify Classifications¶

7. Cross-Validate Novel OGs Against MGnify¶