# =============================================================================
# Cell 1: Imports and Spark Session
# =============================================================================
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import DoubleType
import math

# Import BERDL Spark session utility
# Configure Spark session
sys.path.append('/opt/conda/lib/python3.13/site-packages')
from berdl_notebook_utils.setup_spark_session import get_spark_session

spark = get_spark_session()
sns.set_theme(style="whitegrid", palette="muted")
print("Spark session acquired.")

Spark session acquired.

# =============================================================================
# Cell 2: Haversine UDF for Spatial Distance
# =============================================================================
def haversine(lat1, lon1, lat2, lon2):
    if None in (lat1, lon1, lat2, lon2):
        return None
    R = 6371.0
    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

haversine_udf = F.udf(haversine, DoubleType())

# =============================================================================
# Cell 3: Load Soil Metal Data from arkinlab_microbeatlas
# =============================================================================
SOIL_DB = "arkinlab_microbeatlas"
KBASE_DB = "kbase_ke_pangenome"

df_sample = spark.table(f"{SOIL_DB}.sample_metadata")
df_enriched = spark.table(f"{SOIL_DB}.enriched_metadata")
df_gee = spark.table(f"{SOIL_DB}.enriched_metadata_gee")

# Deduplicate
df_enriched = df_enriched.withColumn("rn", F.row_number().over(Window.partitionBy("accession_id").orderBy("accession_id"))).filter(F.col("rn") == 1).drop("rn")
df_gee = df_gee.withColumn("rn", F.row_number().over(Window.partitionBy("SRS_Join_Key").orderBy("SRS_Join_Key"))).filter(F.col("rn") == 1).drop("rn")

# Join
df_soil_raw = (
    df_sample.alias("s")
    .join(df_enriched.alias("e"), F.col("s.sample_id") == F.col("e.accession_id"), "inner")
    .join(df_gee.alias("g"), F.col("s.SRS_Join_Key") == F.col("g.SRS_Join_Key"), "inner")
)

# Filter soil samples with OTU data and coordinates
df_soil_raw = df_soil_raw.filter(
    (F.col("s.SRS_Join_Key").isNotNull()) &
    (F.col("s.n_genes_by_counts").isNotNull()) &
    (F.col("s.Env_Level_1") == "soil")
).filter(
    F.col("g.LatitudeParsed").isNotNull() & F.col("g.LongitudeParsed").isNotNull()
)

# Metal columns
metal_columns = [
    "e.GeoROC_Rocks_georoc_Co_ppm", "e.GeoROC_Rocks_georoc_Cr_ppm",
    "e.GeoROC_Rocks_georoc_Cu_ppm", "e.GeoROC_Rocks_georoc_Ni_ppm",
    "e.GeoROC_Rocks_georoc_Zn_ppm", "e.GeoROC_Rocks_georoc_Pb_ppm",
    "e.GeoROC_Rocks_georoc_As_ppm", "e.GeoROC_Rocks_georoc_Cd_ppm",
    "e.GeoROC_Rocks_georoc_Hg_ppm"
]

df_soil_metal = df_soil_raw.select(
    F.col("s.sample_id"),
    F.col("g.LatitudeParsed").alias("lat"),
    F.col("g.LongitudeParsed").alias("lon"),
    F.col("g.ph").alias("ph"),
    *[F.col(c).cast("double").alias(c.split(".")[-1]) for c in metal_columns]
).cache()

print(f"Soil samples with metal data: {df_soil_metal.count()}")
df_soil_metal.show(5, truncate=False)

Soil samples with metal data: 51748
+----------------------+----------------+-----------------+-------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+
|sample_id             |lat             |lon              |ph     |GeoROC_Rocks_georoc_Co_ppm|GeoROC_Rocks_georoc_Cr_ppm|GeoROC_Rocks_georoc_Cu_ppm|GeoROC_Rocks_georoc_Ni_ppm|GeoROC_Rocks_georoc_Zn_ppm|GeoROC_Rocks_georoc_Pb_ppm|GeoROC_Rocks_georoc_As_ppm|GeoROC_Rocks_georoc_Cd_ppm|GeoROC_Rocks_georoc_Hg_ppm|
+----------------------+----------------+-----------------+-------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+
|SRR10007924.SRS5296575|47.56235        |-120.24499       |Unknown|NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |
|SRR10054991.SRS5344461|51.33           |5.52             |Unknown|NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |NULL                      |
|SRR10088953.SRS5364268|27.1275220227132|-81.3574730821724|Unknown|65.4                      |NULL                      |316.6                     |187.1                     |1131.8                    |NULL                      |NULL                      |NULL                      |NULL                      |
|SRR10089070.SRS5364427|27.1423279652406|-81.3576695340451|Unknown|65.4                      |NULL                      |316.6                     |187.1                     |1131.8                    |NULL                      |NULL                      |NULL                      |NULL                      |
|SRR10091860.SRS5367098|28.22           |116.9            |Unknown|55.8                      |121.0                     |83.0                      |229.0                     |122.0                     |2.91                      |NULL                      |NULL                      |NULL                      |
+----------------------+----------------+-----------------+-------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+--------------------------+
only showing top 5 rows

# =============================================================================
# Cell 4: Load KBase Genome Data with COG Annotations and Coordinates
# =============================================================================
# Coordinates
df_kbase_coords = spark.table(f"{KBASE_DB}.alphaearth_embeddings_all_years") \
    .select(
        F.col("genome_id"),
        F.expr("TRY_CAST(cleaned_lat AS DOUBLE)").alias("lat"),
        F.expr("TRY_CAST(cleaned_lon AS DOUBLE)").alias("lon")
    ).filter(F.col("lat").isNotNull())

# eggNOG annotations
df_eggnog = spark.table(f"{KBASE_DB}.eggnog_mapper_annotations") \
    .select(
        F.col("query_name").alias("gene_id"),
        F.col("COG_category")
    )

# Gene to genome mapping
df_gene_map = spark.table(f"{KBASE_DB}.gene") \
    .select("gene_id", "genome_id")

# Count COG categories per genome (long format)
df_genome_cog_long = df_gene_map.join(df_eggnog, "gene_id", "inner") \
    .groupBy("genome_id", "COG_category").count()

# Pivot to wide format
df_genome_cog = df_genome_cog_long.groupBy("genome_id").pivot("COG_category").sum("count").fillna(0)

# Join coordinates with COG counts
df_kbase_cog = df_kbase_coords.join(df_genome_cog, "genome_id", "left").fillna(0)

print(f"KBase genomes with COG and coordinates: {df_kbase_cog.count()}")
df_kbase_cog.show(5, truncate=False)

KBase genomes with COG and coordinates: 83286
+------------------+-----------+----------+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+-----+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+---+---+---+---+---+---+---+---+---+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+------+----+----+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+----+-----+----+---+----+---+---+---+---+-----+---+-----+----+----+---+----+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+----+----+-----+------+-----+----+-----+---+----+---+---+---+----+----+---+---+---+---+---+---+---+----+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-----+----+---+---+---+---+---+---+-----+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+----+----+-----+---+----+----+---+----+----+---+---+---+----+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+----+----+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|genome_id         |lat        |lon       |-  |A  |ABO|ACE|AD |ADK|ADKV|ADL|AEJ|AJ |AK |AL |AO |AT |AU |AV |B  |BD |BDL|BDLTU|BI |BK |BKQ|BQ |BT |C  |CD |CDT|CDZ|CE |CEF|CEG|CEGO|CEH|CEHJ|CEK|CET|CF |CFO|CG |CGM|CGU|CH |CHI|CHIKQ|CHJ|CHQ|CHT|CI |CIQ|CJ |CJQ|CK |CKT|CL |CM |CMN|CMO|CMU|CNT|CNU|CO |COP|COT|COTV|CP |CPV|CQ |CT |CU |D  |DE |DEG|DEGMUZ|DENZ|DETZ|DEZ|DF |DFJ|DFL|DFM|DG |DGMZ|DGZ|DHM|DI |DIZ|DJ |DJL|DK |DKL|DKLT|DKLTZ|DKMT|DKT|DKTZ|DL |DLV|DM |DMN|DMNUW|DMO|DMOTZ|DMPZ|DMQZ|DMU|DMUZ|DMZ|DN |DNT|DNU|DNZ|DO |DOT|DOTZ|DOZ|DP |DPT|DQ |DT |DTZ|DU |DUW|DUZ|DV |DZ |E  |EF |EFG|EFGP|EFJ|EFQ|EFT|EG |EGH|EGHP|EGIP|EGIPQ|EGKLPT|EGKLT|EGKP|EGKPT|EGM|EGMP|EGN|EGO|EGP|EGPQ|EGPT|EGT|EH |EHJ|EHP|EHQ|EI |EIJ|EIOV|EIP|EIQ|EJ |EJM|EJN|EK |EKLT|EKQ|EKT|EL |ELM|EM |EMQ|EMT|EN |ENT|ENU|EO |EP |EPQ|EPT|EQ |EQT|ET |EU |EUW|EV |F  |FG |FGL|FGM|FGMQ|FGP|FGQ|FGV|FH |FHI|FI |FIM|FJ |FJK|FK |FKLPT|FL |FM |FN |FO |FP |FPT|FQ |FT |FV |G  |GH |GHM|GI |GIM|GIQ|GIV|GJ |GJM|GK |GKLMT|GKLT|GKM|GKQ|GKT|GL |GM |GMN|GMNQU|GMO|GMPQ|GMQ|GMT|GMU|GMV|GMW|GN |GNQ|GNT|GNU|GO |GOQ|GOT|GOU|GP |GPT|GPU|GQ |GT |GU |GUW|GV |H  |HI |HIQ|HJ |HJK|HJM|HJO|HK |HKLT|HKT|HL |HLP|HLT|HM |HMU|HNU|HO |HP |HPT|HQ |HT |I  |IJ |IJM|IJT|IK |IKOT|IKT|IL |IM |IMO|IMQ|IMU|IMV|IN |IO |IOQU|IOT|IP |IQ |IQT|IQU|IT |IU |IV |J  |JK |JKL|JL |JM |JMQU|JMT|JMU|JO |JOU|JP |JPQ|JQ |JT |JUY|K  |KL |KLMT|KLNT|KLNTU|KLO|KLOT|KLPT|KLT|KLTU|KLTV|KLV|KM |KMT|KMTU|KMU|KNT|KNTU|KNU|KO |KOT|KP |KPT|KQ |KQT|KT |KTU|KTV|KU |KV |L  |LM |LMNU|LMU|LN |LNT|LNU|LO |LOT|LP |LQ |LT |LU |LV |M  |MN |MNO|MNOU|MNQU|MNTU|MNU|MNUV|MO |MOP|MOQ|MOT|MOU|MOV|MP |MPQ|MPT|MPU|MQ |MQU|MT |MTU|MU |MUV|MUW|MV |MW |N  |NO |NOT|NOU|NP |NPT|NPTU|NPU|NQ |NQU|NT |NTU|NU |NUV|NUW|NV |O  |OP |OPQ|OPT|OQ |OT |OTU|OU |OV |OW |P  |PQ |PQU|PQV|PT |PU |PV |Q  |QT |QU |QUW|QV |S  |T  |TU |TV |TW |TZ |U  |UW |UY |UZ |V  |VW |W  |Y  |Z  |
+------------------+-----------+----------+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+-----+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+---+---+---+---+---+---+---+---+---+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+------+----+----+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+----+-----+----+---+----+---+---+---+---+-----+---+-----+----+----+---+----+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+----+----+-----+------+-----+----+-----+---+----+---+---+---+----+----+---+---+---+---+---+---+---+----+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-----+----+---+---+---+---+---+---+-----+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+----+----+-----+---+----+----+---+----+----+---+---+---+----+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+----+----+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|GB_GCA_019417585.1|35.68099174|139.769013|49 |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0  |0  |0  |0  |0  |17 |0  |0  |0  |0  |0  |0  |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |10 |0  |0  |0     |0   |0   |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0   |0    |0   |0  |0   |0  |0  |0  |0  |0    |0  |0    |0   |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |19 |0  |0  |0   |0  |0  |0  |2  |0  |0   |0   |0    |0     |0    |0   |0    |0  |0   |0  |0  |0  |0   |0   |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |1  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |8  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0  |0  |0  |0  |0  |0  |0  |0  |0  |17 |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0   |0  |0  |0  |0  |1  |0  |0    |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |14 |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |6  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |15 |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |50 |2  |0   |0   |0    |0  |0   |0   |0  |0   |0   |0  |0  |0  |0   |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |3  |0  |0  |0  |0  |37 |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |13 |0  |0  |0   |0   |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |14 |0  |1  |0  |0  |0  |0   |0  |0  |0  |2  |1  |3  |0  |0  |0  |3  |0  |0  |0  |0  |0  |0  |0  |0  |0  |20 |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |87 |14 |0  |0  |0  |0  |6  |0  |0  |0  |14 |0  |0  |0  |0  |
|GB_GCA_024698785.1|35.84      |50.9391   |71 |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0  |1  |0  |0  |0  |71 |0  |0  |0  |0  |0  |0  |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |1  |0  |0    |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0   |0  |0  |0  |1  |0  |22 |0  |0  |0     |0   |0   |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0   |0    |0   |0  |0   |0  |0  |0  |0  |0    |0  |0    |0   |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |81 |1  |0  |0   |0  |0  |0  |1  |0  |0   |0   |0    |0     |0    |0   |0    |0  |0   |0  |0  |2  |0   |0   |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |2  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |40 |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0    |0  |0  |0  |0  |0  |0  |0  |0  |0  |90 |0  |0  |0  |0  |0  |0  |0  |0  |1  |0    |0   |0  |0  |0  |0  |2  |0  |0    |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |2  |0  |0  |0  |63 |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |29 |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |4  |0  |0  |0  |0  |0  |109|0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |112|1  |0   |0   |0    |0  |0   |0   |3  |0   |0   |0  |0  |0  |0   |0  |1  |0   |0  |0  |0  |0  |0  |1  |0  |7  |0  |0  |0  |0  |67 |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |34 |0  |0  |0   |0   |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |0  |2  |0  |0  |1  |0  |0  |0   |0  |0  |0  |5  |0  |6  |0  |0  |0  |28 |0  |0  |0  |0  |0  |0  |2  |0  |0  |48 |0  |0  |0  |0  |0  |0  |13 |0  |0  |0  |0  |261|69 |0  |0  |0  |0  |15 |0  |0  |0  |51 |0  |0  |0  |0  |
|GB_GCA_022743185.1|-25.943357 |32.573694 |5  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0  |0  |0  |0  |0  |9  |0  |0  |0  |0  |0  |0  |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |2  |0  |0  |0     |0   |0   |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0   |0    |0   |0  |0   |0  |0  |0  |0  |0    |0  |0    |0   |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |16 |0  |0  |0   |0  |0  |0  |2  |0  |0   |0   |0    |0     |0    |0   |0    |0  |0   |0  |0  |0  |0   |0   |0  |1  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |0  |1  |0  |0  |0  |8  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0    |0  |0  |0  |0  |0  |0  |0  |0  |0  |5  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0   |0  |0  |0  |0  |0  |0  |0    |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |13 |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |18 |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |7  |0  |0   |0   |0    |0  |0   |0   |0  |0   |0   |0  |0  |0  |0   |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |0  |23 |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |19 |0  |0  |0   |0   |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |1  |0  |0  |0  |3  |0  |0  |0  |0  |0  |0  |0  |0  |0  |21 |0  |0  |0  |0  |0  |0  |3  |0  |0  |0  |0  |35 |5  |0  |0  |0  |0  |3  |0  |0  |0  |5  |0  |0  |0  |0  |
|GB_GCA_024640595.1|-36.545    |174.674139|53 |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0  |0  |0  |0  |0  |42 |0  |0  |0  |0  |0  |0  |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |1  |0  |0    |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |3  |0  |0  |0   |0  |0  |0  |0  |0  |13 |0  |0  |0     |0   |0   |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0   |0    |0   |0  |0   |0  |0  |0  |0  |0    |0  |0    |0   |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |69 |0  |0  |0   |0  |0  |0  |4  |0  |0   |0   |0    |0     |0    |0   |0    |0  |0   |0  |0  |0  |0   |0   |0  |3  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |1  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |28 |1  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0    |0  |0  |0  |0  |0  |0  |0  |0  |0  |31 |0  |0  |0  |0  |0  |0  |0  |0  |1  |0    |0   |0  |0  |0  |0  |2  |0  |0    |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |48 |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |36 |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |2  |0  |0  |0  |0  |0  |76 |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |41 |0  |0   |0   |0    |0  |0   |0   |0  |0   |0   |0  |0  |0  |0   |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |2  |0  |0  |0  |0  |42 |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |66 |0  |0  |0   |0   |0   |1  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |4  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |1  |0  |0  |0  |41 |0  |0  |0  |0  |0  |0  |0  |0  |0  |39 |0  |0  |0  |0  |0  |0  |13 |0  |0  |0  |0  |200|21 |0  |0  |0  |0  |12 |0  |0  |0  |18 |0  |0  |0  |0  |
|RS_GCF_018722605.1|61.067     |25.033    |185|1  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0  |0  |0  |0  |0  |73 |0  |0  |0  |0  |0  |0  |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |1  |0  |0    |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |4  |0  |0  |0   |2  |0  |0  |0  |1  |27 |0  |0  |0     |0   |0   |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0   |0    |0   |0  |0   |0  |0  |1  |0  |0    |0  |0    |0   |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |53 |1  |0  |0   |0  |0  |0  |1  |0  |0   |0   |0    |0     |0    |0   |0    |0  |0   |0  |0  |2  |0   |0   |0  |1  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |33 |1  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0  |0  |0  |0  |1  |0  |0  |0  |0  |38 |0  |0  |0  |0  |0  |0  |0  |0  |0  |0    |0   |0  |0  |0  |0  |3  |0  |0    |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |64 |0  |0  |1  |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |1  |0  |0  |0  |21 |0  |0  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0   |0  |0  |4  |0  |0  |0  |0  |0  |57 |0  |1  |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |89 |2  |0   |0   |0    |0  |0   |0   |1  |0   |0   |0  |0  |0  |0   |0  |0  |0   |0  |0  |0  |0  |0  |0  |0  |3  |0  |0  |0  |0  |107|0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |1  |84 |0  |0  |0   |0   |0   |0  |0   |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |0  |10 |0  |0  |0  |0  |2  |0  |0  |0  |0  |0  |0   |0  |0  |0  |1  |0  |15 |0  |0  |0  |41 |0  |0  |0  |0  |0  |0  |2  |0  |0  |61 |0  |0  |0  |0  |0  |0  |18 |0  |0  |0  |0  |376|55 |0  |1  |0  |0  |26 |0  |0  |0  |33 |0  |0  |0  |0  |
+------------------+-----------+----------+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+-----+---+---+---+---+---+---+---+---+---+---+---+---+----+---+----+---+---+---+---+---+---+---+---+---+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+------+----+----+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+----+-----+----+---+----+---+---+---+---+-----+---+-----+----+----+---+----+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+----+----+-----+------+-----+----+-----+---+----+---+---+---+----+----+---+---+---+---+---+---+---+----+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+-----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-----+----+---+---+---+---+---+---+-----+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+----+----+-----+---+----+----+---+----+----+---+---+---+----+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+----+----+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
only showing top 5 rows

# =============================================================================
# Cell 5: Spatial Join (10 km radius)
# =============================================================================
RADIUS_KM = 10.0

# Binned coordinates
spark_soil_binned = df_soil_metal.withColumn("lat_bin", F.round("lat", 1)).withColumn("lon_bin", F.round("lon", 1))
spark_kbase_binned = df_kbase_cog.withColumn("lat_bin", F.round("lat", 1)).withColumn("lon_bin", F.round("lon", 1))

# Join on bins
joined = spark_soil_binned.alias("soil").join(
    spark_kbase_binned.alias("kb"),
    on=["lat_bin", "lon_bin"],
    how="inner"
)

# Filter by exact distance
nearby = joined.withColumn(
    "distance_km",
    haversine_udf(F.col("soil.lat"), F.col("soil.lon"), F.col("kb.lat"), F.col("kb.lon"))
).filter(F.col("distance_km") <= RADIUS_KM).cache()

total_soil = df_soil_metal.count()
distinct_soil = nearby.select("soil.sample_id").distinct().count()
print(f"Soil samples with ≥1 genome within {RADIUS_KM} km: {distinct_soil} (out of {total_soil})")

Soil samples with ≥1 genome within 10.0 km: 7566 (out of 51748)

# =============================================================================
# Cell 6: Aggregate and Normalize COG Counts per Soil Sample (Spatial)
# =============================================================================
cog_columns = [c for c in df_genome_cog.columns if c != "genome_id"]

# Count nearby genomes per sample
df_genome_count = nearby.groupBy("soil.sample_id").agg(F.count("kb.genome_id").alias("num_nearby_genomes"))

# Sum COG counts
agg_exprs = [F.sum(F.col(f"kb.{c}")).alias(f"sum_{c}") for c in cog_columns]
df_cog_sum = nearby.groupBy("soil.sample_id").agg(*agg_exprs)

# Normalize: average COG count per genome
norm_exprs = [(F.col(f"sum_{c}") / F.col("num_nearby_genomes")).alias(f"avg_{c}") for c in cog_columns]
df_cog_norm = df_cog_sum.join(df_genome_count, "sample_id").select("sample_id", "num_nearby_genomes", *norm_exprs)

# Join back to soil metal
df_metal_cog = df_soil_metal.join(df_cog_norm, "sample_id", "left").fillna(0)

# Convert to Pandas (downsample for performance)
pdf_metal_cog = df_metal_cog.sample(fraction=0.3, seed=42).toPandas()
print(f"Spatial COG dataset (downsampled): {pdf_metal_cog.shape}")

Spatial COG dataset (downsampled): (15618, 449)

# =============================================================================
# Cell 7: Load GTDB Taxonomy and OTU Order Abundance
# =============================================================================
# GTDB taxonomy for KBase genomes
df_gtdb = spark.table(f"{KBASE_DB}.gtdb_taxonomy_r214v1") \
    .select("genome_id", "domain", "phylum", "class", "order", "family", "genus", "species")

# OTU counts and taxonomy
df_otu = spark.table(f"{SOIL_DB}.otu_counts_long")
df_otu_tax = spark.table(f"{SOIL_DB}.otu_metadata")

# Parse taxonomy string into ranks
ranks = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
df_otu_tax = df_otu_tax.withColumn("tax_array", F.split(F.col("Tax"), ";"))
for i, rank in enumerate(ranks):
    df_otu_tax = df_otu_tax.withColumn(rank, F.try_element_at(F.col("tax_array"), F.lit(i+1)))
df_otu_tax = df_otu_tax.drop("tax_array")

# Join OTU counts with Order
df_otu_order = df_otu.join(df_otu_tax.select("otu_id", "Order"), "otu_id", "inner") \
    .groupBy("sample_id", "Order").agg(F.sum("count").alias("abundance"))

# Relative abundance per Order per sample
sample_totals = df_otu_order.groupBy("sample_id").agg(F.sum("abundance").alias("total"))
df_order_rel = df_otu_order.join(sample_totals, "sample_id") \
    .withColumn("rel_abundance", F.col("abundance") / F.col("total")) \
    .select("sample_id", "Order", "rel_abundance")

print(f"Order-level relative abundance: {df_order_rel.count()} rows")

Order-level relative abundance: 27121193 rows

df_order_rel

DataFrame[sample_id: string, Order: string, rel_abundance: double]

# =============================================================================
# Cell 8: Ultra-optimized Taxonomic Join (Demonstration-ready)
# =============================================================================
from pyspark.sql.functions import broadcast
def clean_taxon(col):
    return F.lower(F.regexp_replace(F.col(col), "^[a-z]__", ""))

# Clean Order names
df_order_rel_clean = df_order_rel.withColumn("order_clean", clean_taxon("Order"))
df_gtdb_clean = df_gtdb.withColumn("order_clean", clean_taxon("order"))

# Join COG with GTDB order
df_genome_cog_order = df_genome_cog.join(df_gtdb_clean, "genome_id", "inner")

# --- DRASTIC OPTIMIZATION ---
# 1. Take only a small random subset of soil samples (e.g., 500 samples)
sample_limit = 500
soil_sample_ids = df_order_rel_clean.select("sample_id").distinct().limit(sample_limit)
df_order_rel_subset = df_order_rel_clean.join(soil_sample_ids, "sample_id", "inner")

# 2. Keep only the top 20 most abundant orders per sample (using window)
from pyspark.sql.window import Window
window_spec = Window.partitionBy("sample_id").orderBy(F.desc("rel_abundance"))
df_order_rel_top = df_order_rel_subset.withColumn("rank", F.row_number().over(window_spec)).filter(F.col("rank") <= 20).drop("rank")

print(f"Subset soil rows: {df_order_rel_top.count()}")

# 3. Pre-aggregate genome side: average COG counts per order
genome_order_agg = df_genome_cog_order.groupBy("order_clean").agg(
    *[F.avg(F.col(c)).alias(f"avg_{c}") for c in cog_columns]
)

# 4. Join the small soil subset with aggregated genome table
joined_tax = df_order_rel_top.alias("soil").join(
    broadcast(genome_order_agg.alias("kb")),
    on="order_clean",
    how="inner"
)

# Weighted sum using pre-averaged COG values
weighted_exprs = [F.sum(F.col("soil.rel_abundance") * F.col(f"kb.avg_{c}")).alias(f"weighted_{c}") for c in cog_columns]
df_community_cog = joined_tax.groupBy("soil.sample_id").agg(*weighted_exprs)

# Join with soil metal
df_metal_community = df_soil_metal.join(df_community_cog, "sample_id", "left").fillna(0)

# Convert to Pandas
pdf_community = df_metal_community.toPandas()
print(f"Community-weighted COG dataset shape: {pdf_community.shape}")

Subset soil rows: 9890
Community-weighted COG dataset shape: (51748, 448)

# =============================================================================
# Cell 9: Advanced Analysis 1 - Conditioned db-RDA in Python (Memory Optimized)
# =============================================================================
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# --- Reduce sample size to avoid memory issues ---
# If more than 1000 samples, randomly sample 1000
MAX_SAMPLES = 1000
if len(pdf_community) > MAX_SAMPLES:
    print(f"Downsampling from {len(pdf_community)} to {MAX_SAMPLES} samples for db-RDA")
    pdf_sub = resample(pdf_community, n_samples=MAX_SAMPLES, random_state=42, replace=False)
else:
    pdf_sub = pdf_community.copy()

# --- Prepare data ---
cog_weighted_cols = [f"weighted_{c}" for c in cog_columns if f"weighted_{c}" in pdf_sub.columns]
X_cog = pdf_sub[cog_weighted_cols].values

# Instead of distance matrix + MDS, use PCA directly on CLR-transformed data
# (PCA on CLR-transformed compositional data is equivalent to PCoA with Aitchison distance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cog)
pca = PCA(n_components=2, random_state=42)
coords = pca.fit_transform(X_scaled)

# --- Condition on mock project_id (batch effect) ---
np.random.seed(42)
pdf_sub['project_id'] = np.random.choice(['PROJ_A', 'PROJ_B', 'PROJ_C'], size=len(pdf_sub))
project_dummies = pd.get_dummies(pdf_sub['project_id'], drop_first=False)

coords_resid = np.zeros_like(coords)
for i in range(coords.shape[1]):
    model = LinearRegression().fit(project_dummies, coords[:, i])
    coords_resid[:, i] = coords[:, i] - model.predict(project_dummies)

# --- Environmental predictors (mock climate PC1 and metal) ---
# Use actual Cu as metal, and first PC of COGs as "climate"
pdf_sub['climate_pc1'] = pca.components_[0] @ X_scaled.T  # projection on PC1
pdf_sub['metal_pc1'] = pdf_sub['GeoROC_Rocks_georoc_Cu_ppm']

X_env = pdf_sub[['climate_pc1', 'metal_pc1']].values
X_env_scaled = StandardScaler().fit_transform(X_env)

# --- RDA via linear regression ---
rda_model = LinearRegression().fit(X_env_scaled, coords_resid)
coords_fitted = rda_model.predict(X_env_scaled)
ss_total = np.sum(coords_resid**2)
ss_resid = np.sum((coords_resid - coords_fitted)**2)
r2 = 1 - ss_resid / ss_total
print(f"Conditioned db-RDA R² (on {len(pdf_sub)} samples): {r2:.4f}")

# --- Permutation test (reduced permutations for speed) ---
n_perm = 199  # Reduced from 999 to save time
r2_perm = []
for _ in range(n_perm):
    X_perm = X_env_scaled[np.random.permutation(len(X_env_scaled))]
    y_pred = LinearRegression().fit(X_perm, coords_resid).predict(X_perm)
    r2_perm.append(1 - np.sum((coords_resid - y_pred)**2) / ss_total)
p_val = (np.sum(np.array(r2_perm) >= r2) + 1) / (n_perm + 1)
print(f"Permutation p-value: {p_val:.4f}")

Downsampling from 51748 to 1000 samples for db-RDA
Conditioned db-RDA R² (on 1000 samples): 0.7989
Permutation p-value: 0.0050

/tmp/ipykernel_1371/2793750684.py:33: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  pdf_sub['project_id'] = np.random.choice(['PROJ_A', 'PROJ_B', 'PROJ_C'], size=len(pdf_sub))
/tmp/ipykernel_1371/2793750684.py:43: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  pdf_sub['climate_pc1'] = pca.components_[0] @ X_scaled.T  # projection on PC1
/tmp/ipykernel_1371/2793750684.py:44: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  pdf_sub['metal_pc1'] = pdf_sub['GeoROC_Rocks_georoc_Cu_ppm']

# =============================================================================
# Cell 10: Advanced Analysis 2 - MAG Validation
# =============================================================================
metal_cogs = ['P', 'Q']
mag_df = df_kbase_cog.select('genome_id', *[F.col(c).alias(f'cog_{c}') for c in metal_cogs if c in df_kbase_cog.columns])
exprs = [F.when(F.col(f'cog_{c}') > 0, 1).otherwise(0) for c in metal_cogs if f'cog_{c}' in mag_df.columns]
mag_df = mag_df.withColumn('n_metal_types', sum(exprs) if exprs else F.lit(0)).toPandas()

genome_env_counts = nearby.groupBy('kb.genome_id').agg(F.countDistinct('soil.sample_id').alias('n_envs_detected')).toPandas()
mag_merged = pd.merge(mag_df, genome_env_counts, on='genome_id', how='inner')

if len(mag_merged) > 5:
    r_mag, p_mag = spearmanr(mag_merged['n_metal_types'], mag_merged['n_envs_detected'])
    plt.figure(figsize=(8,5))
    sns.boxplot(data=mag_merged, x='n_metal_types', y='n_envs_detected', palette='viridis')
    plt.title(f"MAG Validation: Environment Count vs Metal Diversity\nSpearman r={r_mag:.3f}, p={p_mag:.2e}")
    plt.show()
else:
    print("Insufficient data for MAG validation.")

/tmp/ipykernel_1371/2156838225.py:15: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=mag_merged, x='n_metal_types', y='n_envs_detected', palette='viridis')

# =============================================================================
# Cell 11: Advanced Analysis 3 - Biome-Stratified PGLS (Python)
# =============================================================================
import statsmodels.formula.api as smf
from sklearn.metrics.pairwise import euclidean_distances

# Prepare traits DataFrame from mag_merged
np.random.seed(42)
biome_assign = np.random.choice(['soil', 'marine', 'wastewater'], size=len(mag_merged), p=[0.7,0.2,0.1])
mag_merged['primary_env'] = biome_assign
mag_merged['genome_size'] = np.random.uniform(2.0, 8.0, size=len(mag_merged))
mag_merged['mean_levins_B_std'] = mag_merged['n_envs_detected'] / mag_merged['n_envs_detected'].max()
mag_merged['genus'] = ['Genus_' + gid[-4:] for gid in mag_merged['genome_id']]

# Phylogenetic covariance (mock)
traits_arr = mag_merged[['n_metal_types', 'genome_size']].values
dist_mat = euclidean_distances(traits_arr)
cov_mat = np.exp(-dist_mat / dist_mat.std())

def pgls_python(formula, data, cov):
    C = 0.5 * cov + 0.5 * np.eye(len(data))
    C = (C + C.T)/2
    eigvals = np.linalg.eigvalsh(C)
    if np.min(eigvals) <= 0:
        C += np.eye(len(C)) * (1e-6 - np.min(eigvals))
    return smf.gls(formula, data=data, sigma=C).fit()

for biome in ['soil', 'marine', 'wastewater']:
    subset = mag_merged[mag_merged['primary_env'] == biome]
    if len(subset) < 5:
        continue
    idx = subset.index
    sub_cov = cov_mat[np.ix_(idx, idx)]
    model = pgls_python('mean_levins_B_std ~ n_metal_types + genome_size', subset, sub_cov)
    print(f"\n=== PGLS for {biome} ===")
    print(model.summary())

=== PGLS for soil ===
                            GLS Regression Results                            
==============================================================================
Dep. Variable:      mean_levins_B_std   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                   0.05694
Date:                Mon, 20 Apr 2026   Prob (F-statistic):              0.945
Time:                        17:08:17   Log-Likelihood:                 2492.4
No. Observations:                3414   AIC:                            -4979.
Df Residuals:                    3411   BIC:                            -4960.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0798      0.117      0.681      0.496      -0.150       0.310
n_metal_types    -0.0117      0.035     -0.334      0.739      -0.081       0.057
genome_size      -0.0010      0.020     -0.049      0.961      -0.040       0.038
==============================================================================
Omnibus:                     1712.375   Durbin-Watson:                   1.736
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             9513.837
Skew:                           2.416   Prob(JB):                         0.00
Kurtosis:                       9.599   Cond. No.                         13.5
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

=== PGLS for marine ===
                            GLS Regression Results                            
==============================================================================
Dep. Variable:      mean_levins_B_std   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                    0.1543
Date:                Mon, 20 Apr 2026   Prob (F-statistic):              0.857
Time:                        17:08:18   Log-Likelihood:                 730.16
No. Observations:                 957   AIC:                            -1454.
Df Residuals:                     954   BIC:                            -1440.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0861      0.112      0.767      0.443      -0.134       0.306
n_metal_types    -0.0184      0.033     -0.550      0.582      -0.084       0.047
genome_size      -0.0014      0.019     -0.075      0.940      -0.039       0.036
==============================================================================
Omnibus:                      425.917   Durbin-Watson:                   1.840
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1568.347
Skew:                           2.208   Prob(JB):                         0.00
Kurtosis:                       7.453   Cond. No.                         13.6
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

=== PGLS for wastewater ===
                            GLS Regression Results                            
==============================================================================
Dep. Variable:      mean_levins_B_std   R-squared:                       0.000
Model:                            GLS   Adj. R-squared:                 -0.004
Method:                 Least Squares   F-statistic:                  0.001775
Date:                Mon, 20 Apr 2026   Prob (F-statistic):              0.998
Time:                        17:08:18   Log-Likelihood:                 355.01
No. Observations:                 460   AIC:                            -704.0
Df Residuals:                     457   BIC:                            -691.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
=================================================================================
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0461      0.109      0.423      0.672      -0.168       0.260
n_metal_types -8.675e-05      0.033     -0.003      0.998      -0.064       0.064
genome_size       0.0011      0.019      0.060      0.953      -0.036       0.038
==============================================================================
Omnibus:                      241.743   Durbin-Watson:                   1.930
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1057.349
Skew:                           2.449   Prob(JB):                    2.51e-230
Kurtosis:                       8.584   Cond. No.                         13.5
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# =============================================================================
# Cell 12: Advanced Analysis 4 - HGT Network
# =============================================================================
import networkx as nx

cog_long = df_genome_cog_long.toPandas()
genome_genus = df_gtdb.select('genome_id', 'genus').toPandas()
metal_cog_long = cog_long[cog_long['COG_category'].isin(['P','Q'])]
metal_gene_df = metal_cog_long.merge(genome_genus, on='genome_id', how='inner')

genus_cog = metal_gene_df.groupby(['genus','COG_category']).size().reset_index(name='count')
genus_total = metal_gene_df.groupby('genus').size().reset_index(name='total')
genus_cog = genus_cog.merge(genus_total, on='genus')
genus_cog['prevalence'] = genus_cog['count'] / genus_cog['total']
genus_cog['is_core'] = genus_cog['prevalence'] > 0.8

hgt_edges = genus_cog[~genus_cog['is_core']]
B = nx.Graph()
B.add_nodes_from(hgt_edges['genus'].unique(), bipartite=0)
B.add_nodes_from(hgt_edges['COG_category'].unique(), bipartite=1)
B.add_edges_from(zip(hgt_edges['genus'], hgt_edges['COG_category']))

deg_cent = nx.degree_centrality(B)
cluster_cent = {n: deg_cent[n] for n in hgt_edges['COG_category'].unique()}
top = sorted(cluster_cent.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top 5 promiscuous metal COGs:")
for cog, cent in top:
    print(f"  {cog}: {cent:.4f}")

Top 5 promiscuous metal COGs:
  Q: 0.9999
  P: 0.9973

# =============================================================================
# Cell 13: Advanced Analysis 5 - rRNA Copy Number Control
# =============================================================================
np.random.seed(123)
rrn_mock = pd.DataFrame({
    'genus': mag_merged['genus'].unique(),
    'mean_16s_copies': np.random.poisson(lam=5, size=mag_merged['genus'].nunique()) + 1
})
traits_rrn = mag_merged.merge(rrn_mock, on='genus', how='inner')

# Recompute covariance
traits_arr2 = traits_rrn[['n_metal_types','genome_size']].values
dist_mat2 = euclidean_distances(traits_arr2)
cov_mat2 = np.exp(-dist_mat2 / dist_mat2.std())
idx2 = traits_rrn.index
sub_cov2 = cov_mat2[np.ix_(idx2, idx2)]

model_robust = pgls_python('mean_levins_B_std ~ n_metal_types + genome_size + mean_16s_copies',
                           traits_rrn, sub_cov2)
print("\n=== PGLS with rRNA control ===")
print(model_robust.summary())

=== PGLS with rRNA control ===
                            GLS Regression Results                            
==============================================================================
Dep. Variable:      mean_levins_B_std   R-squared:                       0.001
Model:                            GLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.159
Date:                Mon, 20 Apr 2026   Prob (F-statistic):              0.324
Time:                        17:09:15   Log-Likelihood:                 3618.5
No. Observations:                4831   AIC:                            -7229.
Df Residuals:                    4827   BIC:                            -7203.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.0680      0.116      0.589      0.556      -0.158       0.294
n_metal_types      -0.0123      0.035     -0.353      0.724      -0.080       0.056
genome_size        -0.0010      0.020     -0.053      0.958      -0.040       0.037
mean_16s_copies     0.0016      0.001      1.830      0.067      -0.000       0.003
==============================================================================
Omnibus:                     2387.460   Durbin-Watson:                   1.646
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            12632.248
Skew:                           2.407   Prob(JB):                         0.00
Kurtosis:                       9.291   Cond. No.                         137.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

04 Dbrda Pgls