refactor,fix: update exposome script to Python. restructure

changelog: - fix: exposome script has been ported to Python and tested - fix: corresponding link-ml schema for exposome summary file has been updated - refactor: codebase has been reorganized to centralize utility functions in a top-level directory rather than importing from subdirectories
PNNL-CompBio · Jan 15, 2025 · d6ddcba · d6ddcba
1 parent de1cf2e
commit d6ddcba
Show file tree

Hide file tree

Showing 13 changed files with 265 additions and 10 deletions.
diff --git a/sampleChemMapping/src/__init__.py → __init__.py b/sampleChemMapping/src/__init__.py → __init__.py
diff --git a/build_script.py b/build_script.py
@@ -8,7 +8,7 @@
 from typing import Optional
 import pandas as pd
 from tqdm import tqdm
-from sampleChemMapping.src.mapping import load_mapping_reference, get_mapping_file
+from src.mapping import load_mapping_reference, get_mapping_file
 
 # DEFINE OUTPUT DIRECTORY
 OUTPUT_DIR = "."  # "/tmp"
@@ -118,7 +118,8 @@ def runExposome(chem_id_file):
     """
     run exposome data pull
     """
-    cmd = f"Rscript exposome/exposome_summary_stats.R {chem_id_file}"
+    # cmd = f"Rscript exposome/exposome_summary_stats.R {chem_id_file}"
+    cmd = f"python exposome/exposome_summary_stats.py {chem_id_file}"
     tqdm.write(cmd)
     os.system(cmd)
     return [os.path.join(OUTPUT_DIR, "exposomeGeneStats.csv")]

diff --git a/exposome/__init__.py b/exposome/__init__.py
diff --git a/exposome/exposome_summary_stats.py b/exposome/exposome_summary_stats.py
@@ -0,0 +1,252 @@
+"""exposome_summary_stats: Converted from R to Python with AI Incubator."""
+
+# ===============================================
+#  IMPORTS
+# ===============================================
+import json
+import re
+import requests
+import sys
+from os.path import abspath, dirname, join
+import pandas as pd
+
+sys.path.append(dirname(dirname(abspath(__file__))))
+from src.format import snakeify_all_columns
+
+# ===============================================
+#  CONFIG
+# ===============================================
+OUTPUT_DIR = "."
+
+PROJ2NAME = {
+    "ADIPO": "Human adipocyte cell lines",
+    "HEPG2": "Human Hepg2 cell lines",
+    "MCF10A": "Human MCF10A cell lines",
+    "TG-GATEs": "Human TG-GATEs",
+}
+
+
+# ===============================================
+#  FUNCTIONS
+# ===============================================
+def check_args():  # Check for the path to the chemical mapping file
+    if len(sys.argv) < 2:
+        print("Need to call script with path to chemical mapping file.")
+        sys.exit()
+
+
+def _load_projects(
+    url: str = "https://montilab.bu.edu/Xposome-API/projects?all=Yes",
+    verbose: bool = True,
+) -> list[str]:
+    """Load list of projects from exposome API.
+
+    Parameters
+    ----------
+    url : str, optional
+        URL to projects, by default "https://montilab.bu.edu/Xposome-API/projects?all=Yes"
+    verbose : bool, optional
+        If True, prints stdout, by default True
+
+    Returns
+    -------
+    list[str]
+        List of projects
+
+    Raises
+    -------
+    HTTPError
+        If invalid URL, terminates.
+    """
+    res = requests.get(url)
+    res.raise_for_status()  # Raises HTTPError for invalid
+    projects = json.loads(res.json()[0])
+
+    if verbose:
+        print(f"Data from {len(projects)} projects found.")
+
+    return projects
+
+
+def _load_chemicals(project: str) -> pd.DataFrame:
+    """Load chemicals associated with project
+
+    Parameters
+    ----------
+    project : str
+        Project (e.g. ADIPO, HEPG2, etc.)
+
+    Returns
+    -------
+    pd.DataFrame
+        Tabulated chemical information with the following columns:
+            Chemical_Id: str
+            Project: str
+            Chemical_Name: str
+            CAS: str
+    """
+    url = f"https://montilab.bu.edu/Xposome-API/chemicals?projects={project}&chemical_ids=all"
+    res = requests.get(url)
+    if res.status_code == 200:
+        return pd.DataFrame(json.loads(res.json()[0]))
+    return
+
+
+def format_concentration(
+    df: pd.DataFrame, column_name: str = "Concentration"
+) -> pd.DataFrame:
+    """
+    Splits a concentration column into separate numeric and unit columns,
+    only recognizing units 'uM' or 'mg/kg' from a larger string.
+
+    Parameters:
+    - df: pandas DataFrame containing the concentration column.
+    - column_name: str, name of the column to be split (default is 'Concentration').
+
+    Returns:
+    - DataFrame with separate 'Concentration' and 'Unit' columns.
+
+    Written by AI incubator 01/2025
+    """
+
+    def extract_unit_and_numeric(c):
+        unit_match = re.search(r"(\d*\.?\d+)\s*(uM|mg/kg)", c)
+        if unit_match:
+            return float(unit_match.group(1)), unit_match.group(2)
+        return None, None
+
+    # Apply the extraction function to split concentration and unit
+    df["Concentration"], df["Unit"] = zip(
+        *df[column_name].apply(extract_unit_and_numeric)
+    )
+
+    # Remove rows where units are not recognized
+    df = df.dropna(subset=["Unit", "Concentration"])
+
+    # Rearrange columns to place 'Concentration' and 'Unit' adjacently
+    cols = [c for c in df.columns if c not in ["Concentration", "Unit"]]
+    df = df[cols + ["Concentration", "Unit"]]
+
+    return df
+
+
+def summarize(
+    df: pd.DataFrame,
+    id_var: str = "Gene",
+    prefix: str = "ModZScore ",
+    value_name: str = "ModZScore",
+) -> pd.DataFrame:
+    if " " not in prefix:
+        prefix = f"{prefix} "
+
+    df = df.melt(
+        id_vars=[id_var],
+        value_vars=[c for c in df.columns if prefix in c],
+        var_name="Concentration",
+        value_name=value_name,
+    )
+    df["Concentration"] = [c.lstrip(prefix) for c in df["Concentration"]]
+    return df
+
+
+def getGoTerms(chemical_id, project):
+    url = f"https://montilab.bu.edu/Xposome-API/gs_enrichment?project={project}&chemical_id={chemical_id}"
+    res = requests.get(url)
+    if res.status_code != 200:
+        return pd.DataFrame()
+
+    # Load GO terms enrichment summary statistics data
+    summary = pd.json_normalize(res.json())
+
+    if len(summary) > 0:
+        # Reshape into 3 cols: GenesetName, Concentration, and GSScore
+        summary = summarize(
+            summary, id_var="GenesetName", prefix="GS Score ", value_name="GSScore"
+        )
+        summary[["Condition", "Concentration"]] = summary["Concentration"].str.split(
+            "_", expand=True
+        )
+        summary = format_concentration(summary)  # format conc + unit cols
+
+    display_link = f"https://montilab.bu.edu/Xposome/?page={project}&tab=chemical_explorer&chemical_id={chemical_id}&stat=gs_enrichment"
+
+    summary["project"] = project
+    summary["cas_number"] = chemical_id
+    summary["link"] = display_link
+    return snakeify_all_columns(summary)
+
+
+def getGenes(chemical_id, project):
+    url = f"https://montilab.bu.edu/Xposome-API/gene_expression?project={project}&chemical_id={chemical_id}&landmark=FALSE&do.scorecutoff=FALSE"
+    res = requests.get(url)
+    if res.status_code != 200:
+        return pd.DataFrame()
+
+    # Load gene expression summary statistics data
+    summary = pd.DataFrame(json.loads(res.json()[0]))
+
+    if len(summary) > 0:
+        # Reshape into 3 cols: Gene, Concentration, and ModZScore
+        summary = summarize(summary)
+
+        if proj == "TG-GATEs":
+            summary[["Condition", "High", "Concentration"]] = summary[
+                "Concentration"
+            ].str.split("_", expand=True)
+            summary.drop(columns=["High"], inplace=True)
+        elif proj == "MCF10A":
+            summary[["Condition", "Concentration"]] = summary[
+                "Concentration"
+            ].str.split("_", expand=True)
+        else:
+            summary["Condition"] = "WT"
+
+        summary = format_concentration(summary)  # format conc + unit cols
+
+    display_link = f"https://montilab.bu.edu/Xposome/?page={project}&tab=chemical_explorer&chemical_id={chemical_id}&stat=gene_expression"
+
+    summary["project"] = project
+    summary["cas_number"] = chemical_id
+    summary["link"] = display_link
+    return snakeify_all_columns(summary)
+
+
+# ===============================================
+#  RUN SCRIPT
+# ===============================================
+if __name__ == "__main__":
+    check_args()
+
+    # Load all chemicals and projects
+    chems = snakeify_all_columns(pd.read_csv(sys.argv[1], encoding="utf-8-sig"))
+    projects = _load_projects()
+
+    genes, gos = list(), list()
+    for proj in projects:
+        c = _load_chemicals(proj)
+
+        overlap = set(chems["cas_number"]).intersection(set(c["CAS"]))
+        print(f"Found {len(overlap)} CAS ids in common in project {proj}")
+
+        gg = pd.concat([getGenes(chem, proj) for chem in overlap], ignore_index=True)
+        # gt = pd.concat([getGoTerms(chem, proj) for chem in overlap], ignore_index=True)
+        genes.append(gg)  # gos.append(gt)
+
+    # Combine all gene data and include friendly project names
+    genes = pd.concat(genes, ignore_index=True)
+    genes["project_id"] = genes["project"]
+    genes["project"] = [PROJ2NAME[p] for p in genes["project_id"]]
+    # gos = pd.concat(gos, ignore_index=True)
+
+    # Get significant genes
+    genes = genes.loc[genes["mod_z_score"].abs() > 1.63].copy()
+    chems = chems[["cas_number", "chemical_id"]].drop_duplicates()
+
+    genes = (
+        genes.groupby(["project", "cas_number", "concentration", "link", "condition"])
+        .agg(n_genes=("gene", "nunique"))
+        .reset_index()
+        .merge(chems, on="cas_number")
+    )
+
+    genes.to_csv(join(OUTPUT_DIR, "exposomeGeneStats.csv"), index=False)
diff --git a/sampleChemMapping/__init__.py b/sampleChemMapping/__init__.py
@@ -1 +0,0 @@
-from .src import format, mapping, metadata, params, tables

diff --git a/sampleChemMapping/map_samples_to_chemicals.py b/sampleChemMapping/map_samples_to_chemicals.py
@@ -7,13 +7,16 @@
 # Imports
 # =========================================================
 import os
+import sys
 from argparse import ArgumentParser
 from typing import Optional
 
 import numpy as np
 import pandas as pd
 
 from numpy.typing import ArrayLike
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from src.format import rename_duplicates, snakeify, snakeify_all_columns
 from src.mapping import rename_chemical_class
 from src.metadata import build_chem_metadata, get_endpoint_metadata
@@ -31,8 +34,8 @@
 # These pathways refer to absolute pathways in the docker image
 # setting these three parameters, can be appended
 # data_dir = 'https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data'
-out_dir = "/tmp/"
-# out_dir = "./"
+# OUT_DIR = "/tmp"
+OUT_DIR = "."
 
 # Set CompTox API key
 CTX_API_KEY = "5aded20c-9485-11ef-87c3-325096b39f47"

diff --git a/src/__init__.py b/src/__init__.py
diff --git a/sampleChemMapping/src/format.py → src/format.py b/sampleChemMapping/src/format.py → src/format.py
diff --git a/sampleChemMapping/src/mapping.py → src/mapping.py b/sampleChemMapping/src/mapping.py → src/mapping.py
diff --git a/sampleChemMapping/src/metadata.py → src/metadata.py b/sampleChemMapping/src/metadata.py → src/metadata.py
diff --git a/sampleChemMapping/src/params.py → src/params.py b/sampleChemMapping/src/params.py → src/params.py
diff --git a/sampleChemMapping/src/tables.py → src/tables.py b/sampleChemMapping/src/tables.py → src/tables.py
diff --git a/srpAnalytics.yaml b/srpAnalytics.yaml
@@ -321,16 +321,16 @@ classes:
   exposomeGeneStats:
     description: Summary and link to exposome measurements of human genes that are differentailly expressed upon chemical treatment
     slots:
-      - Chemical_ID
-      - Project
+      - chemical_id
+      - project
       - concentration
       - cas_number
       - concentration_unit
     attributes:
-      nGenes:
+      n_genes:
         description:
         range: integer
-      Link:
+      link:
         description:
-      Condition:
+      condition:
         description:
Original file line number	Diff line number	Diff line change
		@@ -1 +0,0 @@
		from .src import format, mapping, metadata, params, tables