# Initialize Spark session (BERDL JupyterHub — no import needed)
spark = get_spark_session()
spark.sql('SELECT 1 AS ok').show()
print('Spark session ready.')

+---+
| ok|
+---+
|  1|
+---+

Spark session ready.

import pandas as pd
import numpy as np
import os

DATA_DIR = '../data'
os.makedirs(DATA_DIR, exist_ok=True)

# Get all carbon source experiments from Fitness Browser
fb_carbon_exps = spark.sql("""
    SELECT orgId, expName, expGroup, condition_1, concentration_1, units_1,
           condition_2, media
    FROM kescience_fitnessbrowser.experiment
    WHERE expGroup = 'carbon source'
""").toPandas()

print(f'Total carbon source experiments: {len(fb_carbon_exps)}')
print(f'Organisms with carbon source data: {fb_carbon_exps["orgId"].nunique()}')
print(f'Distinct carbon sources: {fb_carbon_exps["condition_1"].nunique()}')
fb_carbon_exps.head(10)

Total carbon source experiments: 1838
Organisms with carbon source data: 36
Distinct carbon sources: 154

# Per-organism summary: how many distinct carbon sources and experiments
# Drop rows where condition_1 is NaN (some experiments have no carbon source listed)
fb_carbon_exps = fb_carbon_exps.dropna(subset=['condition_1'])

fb_org_summary = fb_carbon_exps.groupby('orgId').agg(
    n_experiments=('expName', 'nunique'),
    n_carbon_sources=('condition_1', 'nunique'),
    carbon_sources=('condition_1', lambda x: ', '.join(sorted(x.dropna().astype(str).unique())[:10]))
).reset_index().sort_values('n_carbon_sources', ascending=False)

print(f'Organisms with >= 10 carbon sources: {(fb_org_summary["n_carbon_sources"] >= 10).sum()}')
fb_org_summary.head(30)

Organisms with >= 10 carbon sources: 29

# Get organism metadata
# organism table columns: orgId, division, genus, species, strain, taxonomyId
fb_organisms = spark.sql("""
    SELECT orgId, division, genus, species, strain, taxonomyId
    FROM kescience_fitnessbrowser.organism
""").toPandas()

# Also get total experiment counts per organism (all conditions)
fb_all_exps = spark.sql("""
    SELECT orgId, expGroup, COUNT(*) as n_exps
    FROM kescience_fitnessbrowser.experiment
    GROUP BY orgId, expGroup
""").toPandas()

print(f'Total FB organisms: {len(fb_organisms)}')
fb_organisms.head(10)

Total FB organisms: 48

# Merge organism metadata with carbon source summary
fb_merged = fb_org_summary.merge(fb_organisms, on='orgId', how='inner')
fb_merged['full_name'] = fb_merged['genus'] + ' ' + fb_merged['species'] + ' ' + fb_merged['strain'].fillna('')
fb_merged = fb_merged.sort_values('n_carbon_sources', ascending=False)

print(f'FB organisms with carbon source experiments: {len(fb_merged)}')
print(f'\nTop 20 by carbon source count:')
fb_merged[['orgId', 'full_name', 'n_carbon_sources', 'n_experiments', 'taxonomyId']].head(20)

FB organisms with carbon source experiments: 36

Top 20 by carbon source count:

# Experiment group distribution across all FB organisms
exp_group_dist = fb_all_exps.groupby('expGroup')['n_exps'].sum().sort_values(ascending=False)
print('Experiment types across all FB organisms:')
print(exp_group_dist.to_string())

Experiment types across all FB organisms:
expGroup
stress                              2854
carbon source                       1838
nitrogen source                     1093
nutrient                             269
nutrient t1                          184
respiratory growth                   145
lb                                   139
temperature                           98
pH                                    90
plant                                 85
nutrient t2                           80
motility                              66
marine broth                          65
control                               39
iron                                  36
magnetic pulldown                     34
rich t1                               30
mixed community                       26
starvation                            26
supernatant                           26
rich t2                               24
anaerobic                             23
in planta                             23
mouse                                 20
r2a control                           19
vitamin                               19
supernatant control:fungal media      18
metal limitation                      18
bg11                                  18
fermentative growth                   17
reactor                               15
rich moyls4                           14
r2a                                   12
m63_quarter_strength                  11
survival                              11
pye                                   10
cpg                                   10
xylem sap                              8
reactor_pregrowth                      7
r2a control with 0.2% methanol         6
r2a control with 0.2x vogels           6
sulfur source                          4
varel_bryant_medium_glucose            4
kb                                     4
alp rich                               3
no stress control                      3
alp richb                              1
bhis                                   1

# First check what tables BacDive has
spark.sql('SHOW TABLES IN kescience_bacdive').show(truncate=False)

+-----------------+----------------------+-----------+
|namespace        |tableName             |isTemporary|
+-----------------+----------------------+-----------+
|kescience_bacdive|strain                |false      |
|kescience_bacdive|physiology            |false      |
|kescience_bacdive|metabolite_utilization|false      |
|kescience_bacdive|sequence_info         |false      |
|kescience_bacdive|enzyme                |false      |
|kescience_bacdive|taxonomy              |false      |
|kescience_bacdive|culture_condition     |false      |
|kescience_bacdive|isolation             |false      |
+-----------------+----------------------+-----------+

# Check BacDive metabolite_utilization schema
spark.sql('DESCRIBE kescience_bacdive.metabolite_utilization').show(truncate=False)

+-------------+---------+-------+
|col_name     |data_type|comment|
+-------------+---------+-------+
|bacdive_id   |int      |NULL   |
|compound_name|string   |NULL   |
|chebi_id     |string   |NULL   |
|utilization  |string   |NULL   |
+-------------+---------+-------+

# Sample BacDive metabolite_utilization data
spark.sql("""
    SELECT * FROM kescience_bacdive.metabolite_utilization LIMIT 10
""").show(truncate=False)

+----------+------------------------------+--------+-----------+
|bacdive_id|compound_name                 |chebi_id|utilization|
+----------+------------------------------+--------+-----------+
|146940    |lactose                       |NULL    |+          |
|146940    |melibiose                     |NULL    |+          |
|146940    |sucrose                       |NULL    |+          |
|146940    |esculin                       |NULL    |+          |
|146940    |amygdalin                     |NULL    |+          |
|146940    |methyl alpha-D-glucopyranoside|NULL    |-          |
|146940    |galactitol                    |NULL    |-          |
|146940    |myo-inositol                  |NULL    |-          |
|146940    |ribitol                       |NULL    |-          |
|146940    |methyl beta-D-xylopyranoside  |NULL    |-          |
+----------+------------------------------+--------+-----------+

# Get BacDive utilization summary — how many compounds tested per genus/species
# BacDive uses NCBI taxonomy names, FB uses its own orgIds
# We'll need to match by genus+species

# First, get the distinct genera in our FB candidate list
fb_genera = fb_merged['genus'].unique().tolist()
genera_str = "', '".join(fb_genera)

print(f'FB genera to match in BacDive: {len(fb_genera)}')
print(fb_genera)

FB genera to match in BacDive: 24
['Pseudomonas', 'Bacteroides', 'Klebsiella', 'Burkholderia', 'Sinorhizobium', 'Phaeobacter', 'Escherichia', 'Herbaspirillum', 'Cupriavidus', 'Shewanella', 'Acidovorax', 'Dinoroseobacter', 'Echinicola', 'Marinobacter', 'Pedobacter', 'Azospirillum', 'Caulobacter', 'Sphingomonas', 'Dyella', 'Paraburkholderia', 'Dechlorosoma', 'Desulfovibrio', 'Magnetospirillum', 'Dickeya']

# Check BacDive strain table schema
spark.sql('DESCRIBE kescience_bacdive.strain').show(truncate=False)
# Also check a sample
spark.sql('SELECT * FROM kescience_bacdive.strain LIMIT 3').show(truncate=False)

+----------------------+---------+-------+
|col_name              |data_type|comment|
+----------------------+---------+-------+
|bacdive_id            |int      |NULL   |
|dsm_number            |string   |NULL   |
|description           |string   |NULL   |
|ncbi_taxid            |string   |NULL   |
|ncbi_taxid_match_level|string   |NULL   |
|species_name          |string   |NULL   |
|strain_designation    |string   |NULL   |
|type_strain           |string   |NULL   |
|keywords              |string   |NULL   |
|doi                   |string   |NULL   |
+----------------------+---------+-------+

+----------+----------+----------------------------------------------------------------------+----------+----------------------+----------------+----------------------+-----------+--------+----------------------------------+
|bacdive_id|dsm_number|description                                                           |ncbi_taxid|ncbi_taxid_match_level|species_name    |strain_designation    |type_strain|keywords|doi                               |
+----------+----------+----------------------------------------------------------------------+----------+----------------------+----------------+----------------------+-----------+--------+----------------------------------+
|102353    |NULL      |Arthrobacter sp. ST024865 is a bacterium of the family Micrococcaceae.|1667      |species               |Arthrobacter sp.|ST024865(HKI), 531/108|no         |Bacteria|10.13145/bacdive102353.20251217.10|
|102354    |NULL      |Arthrobacter sp. ST024866 is a bacterium of the family Micrococcaceae.|1667      |species               |Arthrobacter sp.|ST024866(HKI), 531/118|no         |Bacteria|10.13145/bacdive102354.20251217.10|
|102355    |NULL      |Arthrobacter sp. ST024873 is a bacterium of the family Micrococcaceae.|1667      |species               |Arthrobacter sp.|ST024873(HKI), 531/101|no         |Bacteria|10.13145/bacdive102355.20251217.10|
+----------+----------+----------------------------------------------------------------------+----------+----------------------+----------------+----------------------+-----------+--------+----------------------------------+

# Get BacDive strains that match our FB genera
# BacDive strain table: bacdive_id, species_name, ncbi_taxid, etc. (no genus column)
# species_name contains the full name like "Pseudomonas fluorescens"
# We'll filter by species_name starting with our genera

# Build OR condition for genus matching
genus_conditions = " OR ".join([f"species_name LIKE '{g} %'" for g in fb_genera])

bacdive_strains = spark.sql(f"""
    SELECT bacdive_id, species_name, ncbi_taxid, strain_designation, type_strain
    FROM kescience_bacdive.strain
    WHERE {genus_conditions}
""").toPandas()

# Extract genus from species_name
bacdive_strains['genus'] = bacdive_strains['species_name'].str.split().str[0]
bacdive_strains['species_epithet'] = bacdive_strains['species_name'].str.split().str[1:].str.join(' ')

print(f'BacDive strains matching FB genera: {len(bacdive_strains)}')
print(f'BacDive genera matched: {bacdive_strains["genus"].nunique()}')
bacdive_strains.groupby('genus').size().sort_values(ascending=False).head(20)

BacDive strains matching FB genera: 6437
BacDive genera matched: 23

genus
Escherichia         1899
Pseudomonas         1879
Klebsiella           456
Burkholderia         428
Sphingomonas         373
Shewanella           219
Bacteroides          178
Cupriavidus          159
Pedobacter           149
Paraburkholderia     139
Acidovorax           135
Marinobacter          97
Desulfovibrio         82
Azospirillum          55
Herbaspirillum        50
Caulobacter           37
Dyella                36
Magnetospirillum      20
Dickeya               17
Phaeobacter           14
dtype: int64

# Get metabolite utilization data for matched strains
# First check the metabolite_utilization join key
if len(bacdive_strains) > 0:
    matched_ids = bacdive_strains['bacdive_id'].tolist()
    
    # Register as temp view for efficient join
    spark.createDataFrame(
        pd.DataFrame({'bacdive_id': [str(x) for x in matched_ids]})
    ).createOrReplaceTempView('matched_strains')
    
    # metabolite_utilization likely joins on bacdive_id or strain_id — check schema first
    mu_schema = spark.sql('DESCRIBE kescience_bacdive.metabolite_utilization').toPandas()
    print('metabolite_utilization columns:')
    print(mu_schema['col_name'].tolist())
    
    # Find the join key (bacdive_id or strain_id)
    mu_cols = set(mu_schema['col_name'])
    if 'bacdive_id' in mu_cols:
        join_col = 'bacdive_id'
    elif 'strain_id' in mu_cols:
        join_col = 'strain_id'
    else:
        join_col = None
        print('WARNING: No obvious join key found')
    
    if join_col:
        bacdive_util = spark.sql(f"""
            SELECT mu.*
            FROM kescience_bacdive.metabolite_utilization mu
            JOIN matched_strains ms ON CAST(mu.{join_col} AS STRING) = ms.bacdive_id
        """).toPandas()
        
        print(f'\nBacDive utilization records for matched strains: {len(bacdive_util)}')
        if len(bacdive_util) > 0:
            print(f'Distinct compounds: {bacdive_util["compound"].nunique() if "compound" in bacdive_util.columns else "N/A"}')
            # Show utilization value distribution
            util_col = 'utilization' if 'utilization' in bacdive_util.columns else None
            if util_col:
                print(f'\nUtilization values:')
                print(bacdive_util[util_col].value_counts())
    else:
        bacdive_util = pd.DataFrame()
else:
    print('No BacDive strains matched — will proceed with FB data only')
    bacdive_util = pd.DataFrame()

metabolite_utilization columns:
['bacdive_id', 'compound_name', 'chebi_id', 'utilization']

BacDive utilization records for matched strains: 89787
Distinct compounds: N/A

Utilization values:
utilization
-           44244
+           35949
produced     9390
+/-           203
Name: count, dtype: int64

# Merge BacDive utilization with strain metadata to get genus/species
if len(bacdive_util) > 0:
    # Determine the join key used in metabolite_utilization
    bd_join_col = 'bacdive_id' if 'bacdive_id' in bacdive_util.columns else 'strain_id'
    
    bacdive_util = bacdive_util.merge(
        bacdive_strains[['bacdive_id', 'genus', 'species_epithet']].rename(
            columns={'bacdive_id': bd_join_col}
        ),
        on=bd_join_col, how='left'
    )
    
    # Filter to explicit +/- tests (exclude 'produced' and '+/-')
    util_col = 'utilization' if 'utilization' in bacdive_util.columns else None
    
    if util_col:
        # Summarize: per genus, how many compounds have +/- data
        bacdive_genus_summary = bacdive_util.groupby('genus').agg(
            n_compounds=('compound', 'nunique') if 'compound' in bacdive_util.columns else ('bacdive_id', 'count'),
            n_positive=(util_col, lambda x: (x == '+').sum()),
            n_negative=(util_col, lambda x: (x == '-').sum()),
            n_records=(util_col, 'count')
        ).sort_values('n_compounds', ascending=False)
        
        print('BacDive compound coverage per genus (matched to FB):')
        print(bacdive_genus_summary.head(20))
    else:
        print('No utilization column found — checking available columns:')
        print(bacdive_util.columns.tolist())
        bacdive_genus_summary = pd.DataFrame()
else:
    bacdive_genus_summary = pd.DataFrame()

BacDive compound coverage per genus (matched to FB):
                  n_compounds  n_positive  n_negative  n_records
genus                                                           
Escherichia             32719       11579       17083      32719
Pseudomonas             20273        8235        9857      20273
Klebsiella               7894        4941        2242       7894
Sphingomonas             6091        1857        3750       6091
Burkholderia             4725        2437        1828       4725
Pedobacter               2324         903        1252       2324
Paraburkholderia         2286        1194         897       2286
Bacteroides              1979         941         809       1979
Cupriavidus              1967         419        1364       1967
Shewanella               1906         701         957       1906
Marinobacter             1542         471         957       1542
Acidovorax               1539         495         890       1538
Dyella                   1044         359         628       1044
Herbaspirillum            926         325         546        926
Azospirillum              793         390         334        793
Echinicola                574         218         340        574
Caulobacter               390          72         275        390
Dickeya                   377         241          95        377
Phaeobacter               237         123          95        237
Desulfovibrio              99           9           8         99

# Try to load existing FB-pangenome link table
link_path = '../../conservation_vs_fitness/data/fb_pangenome_link.tsv'
if os.path.exists(link_path):
    fb_pg_link = pd.read_csv(link_path, sep='\t')
    print(f'Loaded existing FB-pangenome link table: {len(fb_pg_link)} links')
    print(f'Organisms covered: {fb_pg_link["orgId"].nunique() if "orgId" in fb_pg_link.columns else "unknown"}')
    fb_pg_link.head()
else:
    print(f'Link table not found at {link_path}')
    print('Will match via taxonomy instead')
    fb_pg_link = None

Link table not found at ../../conservation_vs_fitness/data/fb_pangenome_link.tsv
Will match via taxonomy instead

# Match FB organisms to pangenome species clades via taxonomy
# Strategy: use genus + species to find matching GTDB species clades

# Get FB organism genus+species for matching
fb_for_match = fb_merged[['orgId', 'genus', 'species', 'strain', 'taxonomyId', 'full_name']].copy()

# Query GTDB taxonomy for matching species
# GTDB species names use format s__Genus_species
gtdb_species = spark.sql("""
    SELECT DISTINCT gtdb_species_clade_id, GTDB_species
    FROM kbase_ke_pangenome.gtdb_species_clade
""").toPandas()

print(f'Total GTDB species clades: {len(gtdb_species)}')

# Build a lookup: genus_species -> gtdb_species_clade_id
# GTDB format: s__Genus_species (possibly with suffix like _A, _B)
gtdb_species['genus_species'] = gtdb_species['GTDB_species'].str.replace('s__', '').str.replace('_', ' ')

# Match FB organisms
matches = []
for _, row in fb_for_match.iterrows():
    fb_name = f"{row['genus']} {row['species']}"
    # Try exact match first
    exact = gtdb_species[gtdb_species['genus_species'] == fb_name]
    if len(exact) > 0:
        for _, m in exact.iterrows():
            matches.append({
                'orgId': row['orgId'],
                'fb_name': fb_name,
                'gtdb_species_clade_id': m['gtdb_species_clade_id'],
                'GTDB_species': m['GTDB_species'],
                'match_type': 'exact'
            })
    else:
        # Try genus-level match with GTDB suffixes (_A, _B, etc.)
        partial = gtdb_species[gtdb_species['genus_species'].str.startswith(fb_name)]
        if len(partial) > 0:
            for _, m in partial.iterrows():
                matches.append({
                    'orgId': row['orgId'],
                    'fb_name': fb_name,
                    'gtdb_species_clade_id': m['gtdb_species_clade_id'],
                    'GTDB_species': m['GTDB_species'],
                    'match_type': 'partial'
                })
        else:
            matches.append({
                'orgId': row['orgId'],
                'fb_name': fb_name,
                'gtdb_species_clade_id': None,
                'GTDB_species': None,
                'match_type': 'no_match'
            })

fb_gtdb_matches = pd.DataFrame(matches)
print(f'\nMatch results:')
print(fb_gtdb_matches['match_type'].value_counts())
print(f'\nUnmatched organisms:')
print(fb_gtdb_matches[fb_gtdb_matches['match_type'] == 'no_match'][['orgId', 'fb_name']])

Total GTDB species clades: 27690

Match results:
match_type
no_match    18
exact       16
partial      2
Name: count, dtype: int64

Unmatched organisms:
                 orgId                      fb_name
0               WCS417           Pseudomonas simiae
2               Putida           Pseudomonas putida
3         pseudo3_N2E3      Pseudomonas fluorescens
4       pseudo5_N2C3_1      Pseudomonas fluorescens
6                BFirm    Burkholderia phytofirmans
7         pseudo1_N1B4      Pseudomonas fluorescens
8         pseudo6_N2E2      Pseudomonas fluorescens
11              psRCH2         Pseudomonas stutzeri
12  pseudo13_GW456_L13      Pseudomonas fluorescens
16                SB2B       Shewanella amazonensis
17                ANA3               Shewanella sp.
18     acidovorax_3H11               Acidovorax sp.
20                Cola      Echinicola vietnamensis
22             Pedo557               Pedobacter sp.
25               Caulo       Caulobacter crescentus
30                  PS         Dechlorosoma suillum
31                Miya       Desulfovibrio vulgaris
32             Magneto  Magnetospirillum magneticum

# For matched species, get pangenome stats (genome count, gene clusters)
matched_clades = fb_gtdb_matches[fb_gtdb_matches['gtdb_species_clade_id'].notna()]['gtdb_species_clade_id'].unique()

if len(matched_clades) > 0:
    clade_list = "', '".join(matched_clades)
    pangenome_stats = spark.sql(f"""
        SELECT gtdb_species_clade_id, 
               CAST(no_genomes AS INT) as no_genomes,
               CAST(no_gene_clusters AS INT) as no_gene_clusters,
               CAST(no_core AS INT) as no_core,
               CAST(no_aux_genome AS INT) as no_aux
        FROM kbase_ke_pangenome.pangenome
        WHERE gtdb_species_clade_id IN ('{clade_list}')
    """).toPandas()
    
    print(f'Pangenome stats for {len(pangenome_stats)} matched clades:')
    pangenome_stats.head(20)
else:
    print('No matched clades found')
    pangenome_stats = pd.DataFrame()

Pangenome stats for 17 matched clades:

# Get a representative genome_id for each matched species clade
if len(matched_clades) > 0:
    rep_genomes = spark.sql(f"""
        SELECT gtdb_species_clade_id, genome_id
        FROM kbase_ke_pangenome.genome
        WHERE gtdb_species_clade_id IN ('{clade_list}')
    """).toPandas()
    
    # Take one representative per clade
    rep_genomes_dedup = rep_genomes.groupby('gtdb_species_clade_id').first().reset_index()
    rep_genome_ids = rep_genomes_dedup['genome_id'].tolist()
    
    # GapMind genome IDs lack RS_/GB_ prefix
    gapmind_ids = [g.replace('RS_', '').replace('GB_', '') for g in rep_genome_ids]
    gapmind_ids_str = "', '".join(gapmind_ids)
    
    gapmind_summary = spark.sql(f"""
        SELECT genome_id, metabolic_category, 
               COUNT(DISTINCT pathway) as n_pathways,
               SUM(CASE WHEN score_category = 'complete' THEN 1 ELSE 0 END) as n_complete
        FROM kbase_ke_pangenome.gapmind_pathways
        WHERE genome_id IN ('{gapmind_ids_str}')
        GROUP BY genome_id, metabolic_category
    """).toPandas()
    
    print(f'GapMind data for {gapmind_summary["genome_id"].nunique()} genomes:')
    gapmind_summary.head(20)
else:
    gapmind_summary = pd.DataFrame()

GapMind data for 17 genomes:

# Build the selection table from ALL FB organisms with carbon source data
selection = fb_merged.copy()

# Add pangenome match info (take first match per orgId)
first_match = fb_gtdb_matches.drop_duplicates('orgId')
selection = selection.merge(
    first_match[['orgId', 'gtdb_species_clade_id', 'GTDB_species', 'match_type']],
    on='orgId', how='left'
)

# Add pangenome stats where available
if len(pangenome_stats) > 0:
    selection = selection.merge(
        pangenome_stats[['gtdb_species_clade_id', 'no_genomes', 'no_gene_clusters']],
        on='gtdb_species_clade_id', how='left'
    )

# Add BacDive flag (supplementary — does NOT gate selection)
if len(bacdive_util) > 0:
    bacdive_genera = set(bacdive_genus_summary.index)
    selection['has_bacdive'] = selection['genus'].isin(bacdive_genera)
else:
    selection['has_bacdive'] = False

selection['has_pangenome'] = selection['gtdb_species_clade_id'].notna()

# --- Selection: >= 10 carbon sources, NO pangenome or BacDive requirement ---
eligible = selection[selection['n_carbon_sources'] >= 10].copy()

# For taxonomic diversity, handle Pseudomonas redundancy:
# Multiple P. fluorescens strains — keep top 2, drop extras
pseudo_fluor = eligible[eligible['full_name'].str.contains('fluorescens')]
if len(pseudo_fluor) > 2:
    drop_ids = pseudo_fluor.sort_values('n_carbon_sources', ascending=False).iloc[2:]['orgId'].tolist()
    eligible = eligible[~eligible['orgId'].isin(drop_ids)]
    print(f'Dropped {len(drop_ids)} redundant P. fluorescens strains: {drop_ids}')

# Score: primarily by carbon source count
eligible['score'] = eligible['n_carbon_sources']
eligible = eligible.sort_values('score', ascending=False)

print(f'\nEligible organisms (>= 10 C-sources): {len(eligible)}')
print(f'  With pangenome match: {eligible["has_pangenome"].sum()}')
print(f'  With BacDive data:    {eligible["has_bacdive"].sum()}')
eligible[['orgId', 'full_name', 'n_carbon_sources', 'n_experiments', 
          'has_pangenome', 'has_bacdive', 'gtdb_species_clade_id', 'score']].head(30)

Dropped 3 redundant P. fluorescens strains: ['pseudo1_N1B4', 'pseudo6_N2E2', 'pseudo13_GW456_L13']

Eligible organisms (>= 10 C-sources): 26
  With pangenome match: 14
  With BacDive data:    26

# Select top ~20 (or all eligible if fewer)
TARGET_N = 25  # take up to 25 to ensure good coverage
selected = eligible.head(TARGET_N).copy()

print(f'Selected {len(selected)} genomes')
print(f'  With pangenome match: {selected["has_pangenome"].sum()}')
print(f'  With BacDive data:    {selected["has_bacdive"].sum()}')
print(f'  Mean carbon sources:  {selected["n_carbon_sources"].mean():.1f}')
print(f'  Distinct genera:      {selected["genus"].nunique()}')
print(f'\nTaxonomic distribution:')
print(selected['genus'].value_counts().to_string())
print(f'\nSelected organisms:')
selected[['orgId', 'full_name', 'n_carbon_sources', 'has_pangenome', 'has_bacdive', 'gtdb_species_clade_id']]

Selected 25 genomes
  With pangenome match: 13
  With BacDive data:    25
  Mean carbon sources:  28.0
  Distinct genera:      18

Taxonomic distribution:
genus
Pseudomonas        5
Shewanella         4
Bacteroides        1
Klebsiella         1
Burkholderia       1
Sinorhizobium      1
Phaeobacter        1
Escherichia        1
Herbaspirillum     1
Cupriavidus        1
Acidovorax         1
Dinoroseobacter    1
Echinicola         1
Marinobacter       1
Pedobacter         1
Azospirillum       1
Caulobacter        1
Sphingomonas       1

Selected organisms:

# Get carbon source experiments for selected organisms only
selected_orgIds = selected['orgId'].tolist()
selected_exps = fb_carbon_exps[fb_carbon_exps['orgId'].isin(selected_orgIds)].copy()

# Build carbon source panel
carbon_panel = selected_exps.groupby('condition_1').agg(
    n_organisms=('orgId', 'nunique'),
    n_experiments=('expName', 'nunique'),
    organisms=('orgId', lambda x: ', '.join(sorted(x.unique())))
).reset_index().sort_values('n_organisms', ascending=False)

carbon_panel.rename(columns={'condition_1': 'carbon_source'}, inplace=True)

print(f'Total carbon sources across selected organisms: {len(carbon_panel)}')
print(f'Carbon sources tested in >= 5 organisms: {(carbon_panel["n_organisms"] >= 5).sum()}')
print(f'Carbon sources tested in >= 10 organisms: {(carbon_panel["n_organisms"] >= 10).sum()}')
carbon_panel.head(30)

Total carbon sources across selected organisms: 152
Carbon sources tested in >= 5 organisms: 53
Carbon sources tested in >= 10 organisms: 26

# Create coverage matrix: organisms × carbon sources (top 30)
top_carbons = carbon_panel.head(30)['carbon_source'].tolist()
coverage_matrix = pd.crosstab(
    selected_exps[selected_exps['condition_1'].isin(top_carbons)]['orgId'],
    selected_exps[selected_exps['condition_1'].isin(top_carbons)]['condition_1']
).clip(upper=1)  # binary presence

print(f'Coverage matrix: {coverage_matrix.shape[0]} organisms × {coverage_matrix.shape[1]} carbon sources')
print(f'Mean coverage per organism: {coverage_matrix.sum(axis=1).mean():.1f} carbon sources')
print(f'Mean coverage per carbon source: {coverage_matrix.sum(axis=0).mean():.1f} organisms')
coverage_matrix

Coverage matrix: 25 organisms × 30 carbon sources
Mean coverage per organism: 15.7 carbon sources
Mean coverage per carbon source: 13.1 organisms

# Get gene counts and fitness data stats for selected organisms
selected_org_str = "', '".join(selected_orgIds)

fb_gene_counts = spark.sql(f"""
    SELECT orgId, 
           COUNT(*) as n_genes,
           SUM(CASE WHEN type = '1' THEN 1 ELSE 0 END) as n_protein_coding
    FROM kescience_fitnessbrowser.gene
    WHERE orgId IN ('{selected_org_str}')
    GROUP BY orgId
""").toPandas()

fb_fitness_counts = spark.sql(f"""
    SELECT orgId,
           COUNT(DISTINCT locusId) as n_genes_with_fitness,
           COUNT(DISTINCT expName) as n_fitness_experiments
    FROM kescience_fitnessbrowser.genefitness
    WHERE orgId IN ('{selected_org_str}')
    GROUP BY orgId
""").toPandas()

gene_stats = fb_gene_counts.merge(fb_fitness_counts, on='orgId', how='left')
gene_stats['pct_with_fitness'] = (
    gene_stats['n_genes_with_fitness'] / gene_stats['n_protein_coding'] * 100
).round(1)

print(f'Gene and fitness data summary for {len(gene_stats)} selected organisms:')
print(f'  Mean genes per organism:    {gene_stats["n_genes"].mean():.0f}')
print(f'  Mean % with fitness data:   {gene_stats["pct_with_fitness"].mean():.1f}%')
gene_stats.sort_values('n_genes', ascending=False)

Gene and fitness data summary for 25 selected organisms:
  Mean genes per organism:    5114
  Mean % with fitness data:   83.5%

# Genome manifest
manifest = selected.merge(gene_stats, on='orgId', how='left')
manifest_cols = [
    'orgId', 'full_name', 'genus', 'species', 'strain', 'taxonomyId',
    'n_carbon_sources', 'n_experiments', 'has_pangenome', 'has_bacdive',
    'gtdb_species_clade_id', 'GTDB_species',
    'n_genes', 'n_protein_coding', 'n_genes_with_fitness', 'pct_with_fitness'
]
# Only include columns that exist
manifest_cols = [c for c in manifest_cols if c in manifest.columns]
manifest = manifest[manifest_cols]

manifest.to_csv(f'{DATA_DIR}/genome_manifest.tsv', sep='\t', index=False)
print(f'Saved genome manifest: {len(manifest)} genomes')

# Carbon source panel
carbon_panel.to_csv(f'{DATA_DIR}/carbon_source_panel.tsv', sep='\t', index=False)
print(f'Saved carbon source panel: {len(carbon_panel)} carbon sources')

# Full carbon experiments for selected organisms
selected_exps.to_csv(f'{DATA_DIR}/fb_carbon_experiments.tsv', sep='\t', index=False)
print(f'Saved FB carbon experiments: {len(selected_exps)} experiments')

# BacDive utilization (supplementary — save what we have)
if len(bacdive_util) > 0:
    selected_genera = selected['genus'].unique()
    bacdive_selected = bacdive_util[bacdive_util['genus'].isin(selected_genera)]
    bacdive_selected.to_csv(f'{DATA_DIR}/bacdive_utilization.tsv', sep='\t', index=False)
    print(f'Saved BacDive utilization: {len(bacdive_selected)} records (supplementary)')

print('\nAll NB01 outputs saved to data/')

Saved genome manifest: 25 genomes
Saved carbon source panel: 152 carbon sources
Saved FB carbon experiments: 1510 experiments
Saved BacDive utilization: 85933 records (supplementary)

All NB01 outputs saved to data/

# Final summary
print('=' * 60)
print('NB01 SUMMARY: Genome and Carbon Source Selection')
print('=' * 60)
print(f'Selected genomes:           {len(selected)}')
print(f'  With pangenome match:     {selected["has_pangenome"].sum()}')
print(f'  With BacDive data:        {selected["has_bacdive"].sum()}')
print(f'  Without pangenome:        {(~selected["has_pangenome"]).sum()}')
print(f'Distinct genera:            {selected["genus"].nunique()}')
print(f'Total carbon sources:       {len(carbon_panel)}')
print(f'Carbon sources (>= 5 orgs): {(carbon_panel["n_organisms"] >= 5).sum()}')
print(f'Mean C-sources per org:     {selected["n_carbon_sources"].mean():.1f}')
print(f'Mean experiments per org:   {selected["n_experiments"].mean():.1f}')
print('=' * 60)
print('\nSelection based on Fitness Browser RB-TnSeq data richness.')
print('BacDive phenotypes and pangenome context are supplementary.')

============================================================
NB01 SUMMARY: Genome and Carbon Source Selection
============================================================
Selected genomes:           25
  With pangenome match:     13
  With BacDive data:        25
  Without pangenome:        12
Distinct genera:            18
Total carbon sources:       152
Carbon sources (>= 5 orgs): 53
Mean C-sources per org:     28.0
Mean experiments per org:   60.4
============================================================

Selection based on Fitness Browser RB-TnSeq data richness.
BacDive phenotypes and pangenome context are supplementary.

	orgId	division	genus	species	strain	taxonomyId
0	acidovorax_3H11	Betaproteobacteria	Acidovorax	sp.	GW101-3H11	12916
1	ANA3	Gammaproteobacteria	Shewanella	sp.	ANA-3	94122
2	azobra	Alphaproteobacteria	Azospirillum	brasilense	Sp245	1064539
3	BFirm	Betaproteobacteria	Burkholderia	phytofirmans	PsJN	398527
4	Btheta	Bacteroidetes	Bacteroides	thetaiotaomicron	VPI-5482	226186
5	Burk376	Betaproteobacteria	Paraburkholderia	bryophila	376MFSha3.1	1169143
6	Caulo	Alphaproteobacteria	Caulobacter	crescentus	NA1000	565050
7	Cola	Bacteroidetes	Echinicola	vietnamensis	KMM 6221, DSM 17526	926556
8	Cup4G11	Betaproteobacteria	Cupriavidus	basilensis	FW507-4G11	68895
9	Dda3937	Gammaproteobacteria	Dickeya	dadantii	3937	198628

condition_1	D-Fructose	D-Galactose	D-Gluconic Acid sodium salt	D-Glucosamine Hydrochloride	D-Glucose	D-Maltose monohydrate	D-Mannose	D-Ribose	D-Trehalose dihydrate	D-Xylose	...	Sodium D,L-Lactate	Sodium D-Lactate	Sodium Fumarate dibasic	Sodium L-Lactate	Sodium pyruvate	Sodium succinate dibasic hexahydrate	Tween 20	a-Ketoglutaric acid disodium salt hydrate	casamino acids	m-Inositol
orgId
ANA3	0	0	0	1	0	1	0	0	0	0	...	1	0	1	1	0	1	1	0	1	0
BFirm	1	1	1	1	1	0	1	1	0	1	...	1	1	0	1	1	1	1	0	0	1
Btheta	1	1	0	1	1	1	1	1	1	1	...	0	0	0	0	0	0	0	0	0	0
Caulo	0	0	0	0	1	1	1	0	0	1	...	0	0	0	0	0	0	0	0	0	1
Cola	1	1	0	1	1	1	1	0	1	1	...	0	0	0	0	0	0	0	0	0	0
Cup4G11	1	0	1	0	0	0	0	0	0	0	...	1	1	0	1	1	1	0	1	0	0
Dino	0	0	0	0	1	1	0	0	1	0	...	1	1	1	1	1	1	0	1	1	0
HerbieS	0	1	1	0	1	0	1	1	0	1	...	1	1	0	1	1	1	0	1	0	1
Keio	1	1	1	1	1	1	1	1	1	1	...	1	1	0	0	1	1	0	1	1	0
Korea	1	0	0	0	1	1	0	0	0	1	...	0	0	0	0	0	0	0	0	1	0
Koxy	1	0	1	1	1	1	0	1	1	1	...	1	1	0	1	1	1	0	0	0	1
MR1	0	0	0	0	0	0	0	0	0	0	...	1	0	0	1	1	0	1	0	1	0
Marino	0	0	0	0	0	0	0	0	1	0	...	1	1	1	1	1	1	1	1	1	0
PV4	0	0	0	0	0	1	0	0	0	0	...	1	0	1	1	1	1	0	1	1	0
Pedo557	0	1	0	1	1	1	1	0	1	0	...	0	0	0	0	0	0	0	0	0	0
Phaeo	1	0	0	0	1	0	1	0	1	1	...	1	1	1	1	1	1	0	1	1	1
Putida	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	1	1	1	0	0
SB2B	0	0	0	0	1	1	1	0	0	0	...	1	0	0	1	1	0	1	1	1	0
Smeli	1	1	0	1	1	1	1	1	1	1	...	1	0	0	1	1	1	0	0	0	1
WCS417	1	1	1	1	1	0	1	1	1	1	...	1	1	1	1	1	1	0	1	0	1
acidovorax_3H11	1	1	1	0	1	0	1	1	0	0	...	1	1	0	1	1	1	1	1	1	0
azobra	1	0	1	0	0	0	0	0	0	0	...	1	1	0	1	1	1	0	1	0	0
psRCH2	1	0	1	0	1	1	0	0	0	0	...	1	1	1	1	1	1	1	1	1	0
pseudo3_N2E3	1	0	1	1	1	0	1	1	1	0	...	1	1	1	1	1	1	1	1	1	1
pseudo5_N2C3_1	1	1	1	1	1	0	1	1	1	1	...	1	1	1	1	1	1	1	1	1	1

	orgId	n_genes	n_protein_coding	n_genes_with_fitness	n_fitness_experiments	pct_with_fitness
19	Cup4G11	7449	7358	6384	106	86.8
5	BFirm	7263	7182	5428	113	75.6
6	azobra	6447	5488	4833	91	88.1
18	Smeli	6281	6217	5133	90	82.6
20	pseudo5_N2C3_1	6082	6000	5193	184	86.6
4	pseudo3_N2E3	5854	5766	5028	211	87.2
11	Putida	5661	5563	4778	300	85.9
0	Koxy	5586	5309	4608	208	86.8
15	WCS417	5580	5506	4419	201	80.3
10	Pedo557	5089	4964	4423	177	89.1
14	acidovorax_3H11	5019	4964	3935	140	79.3
13	Btheta	4902	4816	4055	519	84.2
1	HerbieS	4779	4715	3879	96	82.3
17	MR1	4745	4467	3782	176	84.7
21	Cola	4684	4625	3954	202	85.5
2	Keio	4610	4146	3789	168	91.4
23	Dino	4562	4192	3187	186	76.0
12	ANA3	4552	4360	3668	107	84.1
24	Marino	4470	4410	3650	255	82.8
7	psRCH2	4336	4265	3349	350	78.5
22	Korea	4245	4149	3393	162	81.8
8	PV4	3993	3859	3009	160	78.0
3	Phaeo	3944	3875	3099	274	80.0
9	Caulo	3943	3886	3312	198	85.2
16	SB2B	3785	3645	3121	190	85.6

01 Genome Carbon Selection

NB01: Genome and Carbon Source Selection¶

Outputs¶

1. Fitness Browser: Carbon Source Experiments¶

2. Fitness Browser: Organism Metadata¶

3. Experiment Group Distribution¶

4. BacDive: Growth Phenotype Data¶

5. Map FB Organisms to Pangenome¶

6. GapMind Pathway Coverage¶

7. Select Top ~20 Genomes¶

8. Carbon Source Panel¶

9. Extract Fitness Data Summary for Selected Organisms¶

10. Save Outputs¶

	orgId	expName	expGroup	condition_1	concentration_1	units_1	condition_2	media
0	acidovorax_3H11	set1IT061	carbon source	D-Glucose	20	mM	NaN	RCH2_defined_noCarbon
1	acidovorax_3H11	set1IT062	carbon source	D-Fructose	20	mM	NaN	RCH2_defined_noCarbon
2	acidovorax_3H11	set1IT064	carbon source	L-Arabinose	20	mM	NaN	RCH2_defined_noCarbon
3	acidovorax_3H11	set1IT065	carbon source	D-Galactose	20	mM	NaN	RCH2_defined_noCarbon
4	acidovorax_3H11	set1IT066	carbon source	D-Ribose	20	mM	NaN	RCH2_defined_noCarbon
5	acidovorax_3H11	set1IT067	carbon source	a-Ketoglutaric acid disodium salt hydrate	20	mM	NaN	RCH2_defined_noCarbon
6	acidovorax_3H11	set1IT068	carbon source	D-Gluconic Acid sodium salt	20	mM	NaN	RCH2_defined_noCarbon
7	acidovorax_3H11	set1IT070	carbon source	Sodium D-Lactate	20	mM	NaN	RCH2_defined_noCarbon
8	acidovorax_3H11	set1IT071	carbon source	Sodium L-Lactate	20	mM	NaN	RCH2_defined_noCarbon
9	acidovorax_3H11	set1IT072	carbon source	Sodium D,L-Lactate	20	mM	NaN	RCH2_defined_noCarbon

	orgId	n_experiments	n_carbon_sources	carbon_sources
27	WCS417	116	51	2'-Deoxyinosine, 2-Deoxy-D-Ribose, 2-Deoxy-D-r...
2	Btheta	112	47	(+)-Arabinogalactan, 1,4-B-D-Galactobiose, 6-O...
24	Putida	128	47	1,2-Propanediol, 1,3-Butandiol, 1,4-Butanediol...
33	pseudo3_N2E3	82	46	Carnitine Hydrochloride, Cytidine, D-Alanine, ...
34	pseudo5_N2C3_1	85	45	Carnitine Hydrochloride, Cytidine, D-Alanine, ...
15	Koxy	84	41	2'-Deoxycytidine, 2'-Deoxyinosine, 2-Deoxy-D-r...
1	BFirm	49	40	2-Deoxy-D-Ribose, 2-Deoxy-D-ribonic acid lithi...
32	pseudo1_N1B4	60	39	Carnitine Hydrochloride, Cytidine, D-Alanine, ...
35	pseudo6_N2E2	77	36	4-Hydroxybenzoic Acid, Carnitine Hydrochloride...
26	Smeli	33	33	Beta-Lactose, D-Arabinose, D-Cellobiose, D-Fru...
23	Phaeo	99	32	D-Cellobiose, D-Fructose, D-Glucose, D-Mannose...
30	psRCH2	66	30	D-Alanine, D-Fructose, D-Gluconic Acid sodium ...
31	pseudo13_GW456_L13	49	28	D-Alanine, D-Fructose, D-Galactose, D-Gluconic...
13	Keio	68	28	D-Fructose, D-Galactose, D-Galacturonic Acid m...
12	HerbieS	52	26	D-Galactose, D-Galacturonic Acid monohydrate, ...
6	Cup4G11	42	24	4-Hydroxybenzoic Acid, D-Fructose, D-Gluconic ...
25	SB2B	64	23	Adenosine, Cytidine, D-Cellobiose, D-Glucose, ...
0	ANA3	47	23	Cytidine, D-Glucosamine Hydrochloride, D-Malto...
28	acidovorax_3H11	41	21	D-Fructose, D-Galactose, D-Gluconic Acid sodiu...
10	Dino	43	20	D-Glucose, D-Maltose monohydrate, D-Trehalose ...
5	Cola	57	19	Beta-Lactose, D-Cellobiose, D-Fructose, D-Gala...
18	Marino	54	19	D-Alanine, D-Trehalose dihydrate, Ethanol, Gly...
22	Pedo557	35	17	Agro_defined_trehalose, Beta-Lactose, D-Cellob...
16	MR1	31	16	Adenosine, Cytidine, Gelatin, Gly-DL-Asp, Gly-...
29	azobra	29	15	D-Fructose, D-Gluconic Acid sodium salt, Ethan...
4	Caulo	42	14	Beta-Lactose, D-Cellobiose, D-Glucose, D-Malto...
21	PV4	24	13	D-Maltose monohydrate, Gelatin, L-Glutamine, L...
14	Korea	27	11	D-Cellobiose, D-Fructose, D-Glucose, D-Maltose...
11	Dyella79	19	10	D-Cellobiose, D-Fructose, D-Galactose, D-Gluco...
3	Burk376	30	7	2'-Deoxyinosine, 2-Deoxy-D-Ribose, 2-Deoxy-D-r...

	orgId	full_name	n_carbon_sources	n_experiments	taxonomyId
0	WCS417	Pseudomonas simiae WCS417	51	116	321846
1	Btheta	Bacteroides thetaiotaomicron VPI-5482	47	112	226186
2	Putida	Pseudomonas putida KT2440	47	128	160488
3	pseudo3_N2E3	Pseudomonas fluorescens FW300-N2E3	46	82	294
4	pseudo5_N2C3_1	Pseudomonas fluorescens FW300-N2C3	45	85	294
5	Koxy	Klebsiella michiganensis M5al	41	84	290337
6	BFirm	Burkholderia phytofirmans PsJN	40	49	398527
7	pseudo1_N1B4	Pseudomonas fluorescens FW300-N1B4	39	60	294
8	pseudo6_N2E2	Pseudomonas fluorescens FW300-N2E2	36	77	294
9	Smeli	Sinorhizobium meliloti 1021	33	33	266834
10	Phaeo	Phaeobacter inhibens BS107	32	99	391619
11	psRCH2	Pseudomonas stutzeri RCH2	30	66	644801
12	pseudo13_GW456_L13	Pseudomonas fluorescens GW456-L13	28	49	294
13	Keio	Escherichia coli BW25113	28	68	511145
14	HerbieS	Herbaspirillum seropedicae SmR1	26	52	757424
15	Cup4G11	Cupriavidus basilensis FW507-4G11	24	42	68895
16	SB2B	Shewanella amazonensis SB2B	23	64	326297
17	ANA3	Shewanella sp. ANA-3	23	47	94122
18	acidovorax_3H11	Acidovorax sp. GW101-3H11	21	41	12916
19	Dino	Dinoroseobacter shibae DFL-12	20	43	398580

	orgId	full_name	n_carbon_sources	n_experiments	has_pangenome	has_bacdive	gtdb_species_clade_id	score
0	WCS417	Pseudomonas simiae WCS417	51	116	False	True	NaN	51
1	Btheta	Bacteroides thetaiotaomicron VPI-5482	47	112	True	True	s__Bacteroides_thetaiotaomicron--RS_GCF_000011...	47
2	Putida	Pseudomonas putida KT2440	47	128	False	True	NaN	47
3	pseudo3_N2E3	Pseudomonas fluorescens FW300-N2E3	46	82	False	True	NaN	46
4	pseudo5_N2C3_1	Pseudomonas fluorescens FW300-N2C3	45	85	False	True	NaN	45
5	Koxy	Klebsiella michiganensis M5al	41	84	True	True	s__Klebsiella_michiganensis--RS_GCF_002925905.1	41
6	BFirm	Burkholderia phytofirmans PsJN	40	49	False	True	NaN	40
9	Smeli	Sinorhizobium meliloti 1021	33	33	True	True	s__Sinorhizobium_meliloti--RS_GCF_017876815.1	33
10	Phaeo	Phaeobacter inhibens BS107	32	99	True	True	s__Phaeobacter_inhibens--RS_GCF_000473105.1	32
11	psRCH2	Pseudomonas stutzeri RCH2	30	66	False	True	NaN	30
13	Keio	Escherichia coli BW25113	28	68	True	True	s__Escherichia_coli_E--RS_GCF_011881725.1	28
14	HerbieS	Herbaspirillum seropedicae SmR1	26	52	True	True	s__Herbaspirillum_seropedicae--RS_GCF_001040945.1	26
15	Cup4G11	Cupriavidus basilensis FW507-4G11	24	42	True	True	s__Cupriavidus_basilensis--RS_GCF_008801925.2	24
16	SB2B	Shewanella amazonensis SB2B	23	64	False	True	NaN	23
17	ANA3	Shewanella sp. ANA-3	23	47	False	True	NaN	23
18	acidovorax_3H11	Acidovorax sp. GW101-3H11	21	41	False	True	NaN	21
19	Dino	Dinoroseobacter shibae DFL-12	20	43	True	True	s__Dinoroseobacter_shibae--RS_GCF_000018145.1	20
20	Cola	Echinicola vietnamensis KMM 6221, DSM 17526	19	57	False	True	NaN	19
21	Marino	Marinobacter adhaerens HP15	19	54	True	True	s__Marinobacter_adhaerens--RS_GCF_000166295.1	19
22	Pedo557	Pedobacter sp. GW460-11-11-14-LB5	17	35	False	True	NaN	17
23	MR1	Shewanella oneidensis MR-1	16	31	True	True	s__Shewanella_oneidensis--RS_GCF_000146165.2	16
24	azobra	Azospirillum brasilense Sp245	15	29	True	True	s__Azospirillum_brasilense--RS_GCF_001315015.1	15
25	Caulo	Caulobacter crescentus NA1000	14	42	False	True	NaN	14
26	PV4	Shewanella loihica PV-4	13	24	True	True	s__Shewanella_loihica--RS_GCF_000016065.1	13
27	Korea	Sphingomonas koreensis DSMZ 15582	11	27	True	True	s__Sphingomonas_koreensis--RS_GCF_002797435.1	11
28	Dyella79	Dyella japonica UNC79MFTsu3.2	10	19	True	True	s__Dyella_japonica_C--RS_GCF_000725385.1	10

	carbon_source	n_organisms	n_experiments	organisms
121	Sodium D,L-Lactate	19	46	ANA3, BFirm, Cup4G11, Dino, HerbieS, Keio, Kox...
40	D-Glucose	19	97	BFirm, Btheta, Caulo, Cola, Dino, HerbieS, Kei...
124	Sodium L-Lactate	18	39	ANA3, BFirm, Cup4G11, Dino, HerbieS, Koxy, MR1...
128	Sodium pyruvate	18	34	BFirm, Cup4G11, Dino, HerbieS, Keio, Koxy, MR1...
129	Sodium succinate dibasic hexahydrate	18	34	ANA3, BFirm, Cup4G11, Dino, HerbieS, Keio, Kox...
81	L-Glutamine	17	30	BFirm, Cup4G11, Dino, HerbieS, Korea, Koxy, MR...
86	L-Malic acid disodium salt monohydrate	17	33	ANA3, BFirm, Cup4G11, Dino, HerbieS, Keio, Kox...
35	D-Fructose	15	27	BFirm, Btheta, Cola, Cup4G11, Keio, Korea, Kox...
143	a-Ketoglutaric acid disodium salt hydrate	15	30	Cup4G11, Dino, HerbieS, Keio, Marino, PV4, Pha...
122	Sodium D-Lactate	14	27	BFirm, Cup4G11, Dino, HerbieS, Keio, Koxy, Mar...
47	D-Mannose	14	25	BFirm, Btheta, Caulo, Cola, HerbieS, Keio, Ped...
110	N-Acetyl-D-Glucosamine	14	29	ANA3, BFirm, Btheta, Caulo, Keio, Korea, MR1, ...
64	Glycerol	13	22	Dino, HerbieS, Keio, Koxy, Marino, Phaeo, Smel...
45	D-Maltose monohydrate	13	28	ANA3, Btheta, Caulo, Cola, Dino, Keio, Korea, ...
146	casamino acids	13	28	ANA3, Dino, Keio, Korea, MR1, Marino, PV4, Pha...
54	D-Trehalose dihydrate	12	30	Btheta, Cola, Dino, Keio, Koxy, Marino, Pedo55...
116	Potassium acetate	12	30	ANA3, Keio, Koxy, Marino, PV4, Phaeo, Putida, ...
55	D-Xylose	12	22	BFirm, Btheta, Caulo, Cola, HerbieS, Keio, Kor...
38	D-Gluconic Acid sodium salt	11	19	BFirm, Cup4G11, HerbieS, Keio, Koxy, WCS417, a...
39	D-Glucosamine Hydrochloride	11	17	ANA3, BFirm, Btheta, Cola, Keio, Koxy, Pedo557...
89	L-Proline	11	23	BFirm, Caulo, Cup4G11, HerbieS, Marino, Phaeo,...
49	D-Ribose	10	14	BFirm, Btheta, HerbieS, Keio, Koxy, Smeli, WCS...
136	Tween 20	10	19	ANA3, BFirm, MR1, Marino, Putida, SB2B, acidov...
74	L-Arabinose	10	16	ANA3, BFirm, Btheta, Cola, HerbieS, Pedo557, S...
80	L-Glutamic acid monopotassium salt monohydrate	10	24	Caulo, Dino, Korea, Marino, Phaeo, Putida, SB2...
36	D-Galactose	10	18	BFirm, Btheta, Cola, HerbieS, Keio, Pedo557, S...
76	L-Asparagine	9	17	BFirm, Cup4G11, HerbieS, Phaeo, WCS417, acidov...
123	Sodium Fumarate dibasic	9	20	ANA3, Dino, Marino, PV4, Phaeo, WCS417, psRCH2...
148	m-Inositol	9	18	BFirm, Caulo, HerbieS, Koxy, Phaeo, Smeli, WCS...
73	L-Alanine	9	16	ANA3, BFirm, Caulo, Cup4G11, Koxy, WCS417, psR...