from berdl_notebook_utils.setup_spark_session import get_spark_session
spark = get_spark_session()

species_df = spark.sql("""
    SELECT gtdb_species_clade_id, no_genomes, no_gene_clusters,
           no_core, no_aux_genome, no_singleton_gene_clusters
    FROM kbase_ke_pangenome.pangenome
    WHERE no_genomes >= 50
    ORDER BY no_genomes DESC
""")

species_pd = species_df.toPandas()
print(f"Species with ≥50 genomes: {len(species_pd)}")
print(f"\nGenome count distribution:")
print(f"  50-100:    {len(species_pd[(species_pd.no_genomes >= 50) & (species_pd.no_genomes < 100)])}")
print(f"  100-200:   {len(species_pd[(species_pd.no_genomes >= 100) & (species_pd.no_genomes < 200)])}")
print(f"  200-500:   {len(species_pd[(species_pd.no_genomes >= 200) & (species_pd.no_genomes < 500)])}")
print(f"  500-1000:  {len(species_pd[(species_pd.no_genomes >= 500) & (species_pd.no_genomes < 1000)])}")
print(f"  1000+:     {len(species_pd[species_pd.no_genomes >= 1000])}")
species_pd.head(10)

Species with ≥50 genomes: 457

Genome count distribution:
  50-100:    231
  100-200:   101
  200-500:   80
  500-1000:  19
  1000+:     26

total_clusters = species_pd['no_core'] + species_pd['no_aux_genome'] + species_pd['no_singleton_gene_clusters'].fillna(0)
species_pd['aux_fraction'] = species_pd['no_aux_genome'] / total_clusters
species_pd['core_fraction'] = species_pd['no_core'] / total_clusters

print(f"Auxiliary gene fraction (of total incl singletons):")
print(f"  Mean:   {species_pd['aux_fraction'].mean():.3f}")
print(f"  Median: {species_pd['aux_fraction'].median():.3f}")
print(f"  Min:    {species_pd['aux_fraction'].min():.3f}")
print(f"  Max:    {species_pd['aux_fraction'].max():.3f}")
print(f"\nSpecies with >20% auxiliary genes: {len(species_pd[species_pd.aux_fraction > 0.2])}")
print(f"Species with >30% auxiliary genes: {len(species_pd[species_pd.aux_fraction > 0.3])}")
print(f"\nAuxiliary cluster count (no_aux_genome):")
print(f"  Mean:   {species_pd['no_aux_genome'].mean():.0f}")
print(f"  Median: {species_pd['no_aux_genome'].median():.0f}")

Auxiliary gene fraction (of total incl singletons):
  Mean:   0.569
  Median: 0.579
  Min:    0.092
  Max:    0.709

Species with >20% auxiliary genes: 454
Species with >30% auxiliary genes: 453

Auxiliary cluster count (no_aux_genome):
  Mean:   24504
  Median: 16811

import pandas as pd

# Sample 2 species from each genome count bin
bins = [(50, 100), (100, 200), (200, 500), (500, 1000), (1000, 50000)]
sample_species = []
for lo, hi in bins:
    subset = species_pd[(species_pd.no_genomes >= lo) & (species_pd.no_genomes < hi)]
    if len(subset) >= 2:
        sample_species.extend(subset.sample(min(2, len(subset)), random_state=42)['gtdb_species_clade_id'].tolist())

print(f"Sampling {len(sample_species)} species for COG coverage check:")
for sp in sample_species:
    short = sp.split('--')[0].replace('s__', '')
    n = species_pd[species_pd.gtdb_species_clade_id == sp]['no_genomes'].values[0]
    print(f"  {short} (n={n})")

Sampling 10 species for COG coverage check:
  Limisoma_sp000437795 (n=51)
  Staphylococcus_simulans (n=78)
  Salinibacter_ruber (n=110)
  Streptococcus_pseudopneumoniae (n=127)
  Lactococcus_lactis (n=291)
  Streptococcus_equi (n=472)
  Neisseria_gonorrhoeae (n=954)
  Klebsiella_quasipneumoniae (n=801)
  Enterococcus_B_faecium (n=2533)
  Mycobacterium_abscessus (n=1825)

# Check COG annotation coverage for sampled species
cog_coverage = []
for sp in sample_species:
    result = spark.sql(f"""
        SELECT
            '{sp}' as species,
            COUNT(*) as total_clusters,
            SUM(CASE WHEN ann.COG_category IS NOT NULL AND ann.COG_category != '-' THEN 1 ELSE 0 END) as has_cog,
            SUM(CASE WHEN gc.is_auxiliary = true THEN 1 ELSE 0 END) as aux_clusters,
            SUM(CASE WHEN gc.is_auxiliary = true AND ann.COG_category IS NOT NULL AND ann.COG_category != '-' THEN 1 ELSE 0 END) as aux_with_cog
        FROM kbase_ke_pangenome.gene_cluster gc
        LEFT JOIN kbase_ke_pangenome.eggnog_mapper_annotations ann
            ON gc.gene_cluster_id = ann.query_name
        WHERE gc.gtdb_species_clade_id = '{sp}'
    """).toPandas()
    cog_coverage.append(result)
    short = sp.split('--')[0].replace('s__', '')
    print(f"  Done: {short}")

cov_df = pd.concat(cog_coverage, ignore_index=True)
cov_df['cog_pct'] = (cov_df['has_cog'] / cov_df['total_clusters'] * 100).round(1)
cov_df['aux_cog_pct'] = (cov_df['aux_with_cog'] / cov_df['aux_clusters'] * 100).round(1)
cov_df['short_name'] = cov_df['species'].apply(lambda x: x.split('--')[0].replace('s__', ''))
print("\nCOG annotation coverage:")
print(cov_df[['short_name', 'total_clusters', 'has_cog', 'cog_pct', 'aux_clusters', 'aux_with_cog', 'aux_cog_pct']].to_string(index=False))
print(f"\nMean overall COG coverage: {cov_df['cog_pct'].mean():.1f}%")
print(f"Mean auxiliary COG coverage: {cov_df['aux_cog_pct'].mean():.1f}%")

  Done: Limisoma_sp000437795

  Done: Staphylococcus_simulans

  Done: Salinibacter_ruber

  Done: Streptococcus_pseudopneumoniae

  Done: Lactococcus_lactis

  Done: Streptococcus_equi

  Done: Neisseria_gonorrhoeae

  Done: Klebsiella_quasipneumoniae

  Done: Enterococcus_B_faecium

  Done: Mycobacterium_abscessus

COG annotation coverage:
                    short_name  total_clusters  has_cog  cog_pct  aux_clusters  aux_with_cog  aux_cog_pct
          Limisoma_sp000437795           14963     6260     41.8         13323          4820         36.2
       Staphylococcus_simulans            9588     4879     50.9          7416          2867         38.7
            Salinibacter_ruber           21676     8990     41.5         19263          6951         36.1
Streptococcus_pseudopneumoniae           11450     4571     39.9          9853          3111         31.6
            Lactococcus_lactis           34803    13390     38.5         33101         11807         35.7
            Streptococcus_equi           17743     6875     38.7         15913          5286         33.2
         Neisseria_gonorrhoeae           19895     5272     26.5         18119          3794         20.9
    Klebsiella_quasipneumoniae           78755    32744     41.6         74651         28744         38.5
        Enterococcus_B_faecium           75395    25520     33.8         73424         23717         32.3
       Mycobacterium_abscessus           79402    23147     29.2         75332         19568         26.0

Mean overall COG coverage: 38.2%
Mean auxiliary COG coverage: 32.9%

# Get phylum for each target species
# gtdb_taxonomy_r214v1 has separate columns: genome_id, phylum, class, order, etc.
taxonomy_df = spark.sql("""
    SELECT p.gtdb_species_clade_id,
           FIRST(gt.phylum) as phylum
    FROM kbase_ke_pangenome.pangenome p
    JOIN kbase_ke_pangenome.genome g ON p.gtdb_species_clade_id = g.gtdb_species_clade_id
    JOIN kbase_ke_pangenome.gtdb_taxonomy_r214v1 gt ON g.genome_id = gt.genome_id
    WHERE p.no_genomes >= 50
    GROUP BY p.gtdb_species_clade_id
""").toPandas()

species_pd = species_pd.merge(taxonomy_df, on='gtdb_species_clade_id', how='left')

print(f"Phylum distribution of {len(species_pd)} target species:")
print(species_pd['phylum'].value_counts().head(15).to_string())

Phylum distribution of 457 target species:
phylum
p__Pseudomonadota       193
p__Bacillota            112
p__Bacillota_A           49
p__Bacteroidota          41
p__Actinomycetota        32
p__Campylobacterota      10
p__Spirochaetota          6
p__Chlamydiota            4
p__Verrucomicrobiota      3
p__Bacillota_C            2
p__Cyanobacteriota        2
p__Halobacteriota         1
p__Thermoproteota         1
p__Acidobacteriota        1

targets = species_pd[
    (species_pd['no_aux_genome'] >= 100) &
    (species_pd['aux_fraction'] > 0.10)
].copy()

targets['short_name'] = targets['gtdb_species_clade_id'].apply(
    lambda x: x.split('--')[0].replace('s__', '')
)

print(f"Target species after filtering: {len(targets)} (of {len(species_pd)} with ≥50 genomes)")
print(f"  Removed: {len(species_pd) - len(targets)} species with <100 aux clusters or <10% aux fraction")
print(f"\nGenome count range: {targets['no_genomes'].min()} - {targets['no_genomes'].max()}")
print(f"Auxiliary cluster range: {targets['no_aux_genome'].min():.0f} - {targets['no_aux_genome'].max():.0f}")
print(f"Phyla represented: {targets['phylum'].nunique()}")

targets.to_csv('../data/target_species.csv', index=False)
print(f"\nSaved to data/target_species.csv")
targets.head(10)

Target species after filtering: 456 (of 457 with ≥50 genomes)
  Removed: 1 species with <100 aux clusters or <10% aux fraction

Genome count range: 50 - 14526
Auxiliary cluster range: 455 - 438925
Phyla represented: 14

Saved to data/target_species.csv

print(f"\n=== NB01 Summary ===")
print(f"Total species with ≥50 genomes: {len(species_pd)}")
print(f"Target species (≥100 aux clusters, >10% aux fraction): {len(targets)}")
print(f"Phyla represented: {targets['phylum'].nunique()}")
print(f"\nReady for NB02: gene content clustering on {len(targets)} species")

=== NB01 Summary ===
Total species with ≥50 genomes: 457
Target species (≥100 aux clusters, >10% aux fraction): 456
Phyla represented: 14

Ready for NB02: gene content clustering on 456 species

	gtdb_species_clade_id	no_genomes	no_gene_clusters	no_core	no_aux_genome	no_singleton_gene_clusters
0	s__Staphylococcus_aureus--RS_GCF_001027105.1	14526	147914	2083	145831	86127.0
1	s__Klebsiella_pneumoniae--RS_GCF_000742135.1	14240	443124	4199	438925	276743.0
2	s__Salmonella_enterica--RS_GCF_000006945.2	11402	266371	3639	262732	149757.0
3	s__Streptococcus_pneumoniae--RS_GCF_001457635.1	8434	116845	1475	115370	67191.0
4	s__Mycobacterium_tuberculosis--RS_GCF_000195955.2	6903	143670	3741	139929	97549.0
5	s__Pseudomonas_aeruginosa--RS_GCF_001457615.1	6760	256093	5199	250894	120728.0
6	s__Acinetobacter_baumannii--RS_GCF_009759685.1	6647	211211	2957	208254	131031.0
7	s__Clostridioides_difficile--RS_GCF_001077535.1	2604	98935	3078	95857	55410.0
8	s__Enterococcus_B_faecium--RS_GCF_001544255.1	2533	75395	1971	73424	39204.0
9	s__Enterobacter_hormaechei_A--RS_GCF_001729745.1	2453	157037	3574	153463	95270.0

	gtdb_species_clade_id	no_genomes	no_gene_clusters	no_core	no_aux_genome	no_singleton_gene_clusters	aux_fraction	core_fraction	phylum	short_name
0	s__Staphylococcus_aureus--RS_GCF_001027105.1	14526	147914	2083	145831	86127.0	0.623100	0.008900	p__Bacillota	Staphylococcus_aureus
1	s__Klebsiella_pneumoniae--RS_GCF_000742135.1	14240	443124	4199	438925	276743.0	0.609731	0.005833	p__Pseudomonadota	Klebsiella_pneumoniae
2	s__Salmonella_enterica--RS_GCF_000006945.2	11402	266371	3639	262732	149757.0	0.631373	0.008745	p__Pseudomonadota	Salmonella_enterica
3	s__Streptococcus_pneumoniae--RS_GCF_001457635.1	8434	116845	1475	115370	67191.0	0.626888	0.008015	p__Bacillota	Streptococcus_pneumoniae
4	s__Mycobacterium_tuberculosis--RS_GCF_000195955.2	6903	143670	3741	139929	97549.0	0.580091	0.015509	p__Actinomycetota	Mycobacterium_tuberculosis
5	s__Pseudomonas_aeruginosa--RS_GCF_001457615.1	6760	256093	5199	250894	120728.0	0.665817	0.013797	p__Pseudomonadota	Pseudomonas_aeruginosa
6	s__Acinetobacter_baumannii--RS_GCF_009759685.1	6647	211211	2957	208254	131031.0	0.608499	0.008640	p__Pseudomonadota	Acinetobacter_baumannii
7	s__Clostridioides_difficile--RS_GCF_001077535.1	2604	98935	3078	95857	55410.0	0.621057	0.019942	p__Bacillota_A	Clostridioides_difficile
8	s__Enterococcus_B_faecium--RS_GCF_001544255.1	2533	75395	1971	73424	39204.0	0.640704	0.017199	p__Bacillota	Enterococcus_B_faecium
9	s__Enterobacter_hormaechei_A--RS_GCF_001729745.1	2453	157037	3574	153463	95270.0	0.608239	0.014165	p__Pseudomonadota	Enterobacter_hormaechei_A

Nb01 Species Selection

NB01: Species Selection & Data Availability¶

1. Select species with ≥50 genomes¶

2. Compute auxiliary gene fraction¶

3. Check COG annotation coverage on a sample of species¶

4. Add taxonomy for context¶

5. Filter and save target species¶