Merged (changes from 4 commits)
44 changes: 44 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,44 @@
name: Tests

on:
  push:
    branches: ["**"]
  pull_request:
    branches: ["**"]

jobs:
  tests:
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: "pip"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install -r requirements-dev.txt
      - name: Add project to PYTHONPATH
        run: echo "PYTHONPATH=$PWD" >> $GITHUB_ENV
      - name: Run tests with coverage
        env:
          MPLCONFIGDIR: ${{ github.workspace }}/.mpl-cache
          XDG_CACHE_HOME: ${{ github.workspace }}/.cache
        run: |
          mkdir -p "$MPLCONFIGDIR" "$XDG_CACHE_HOME"/fontconfig
          pytest --cov=src --cov-report=term-missing --cov-report=xml
      - name: Generate coverage badge
        run: |
          mkdir -p badges
          coverage-badge -o badges/coverage.svg -f
      - name: Commit coverage badge (main branch only)
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        uses: stefanzweifel/git-auto-commit-action@v5
        with:
          commit_message: "chore: update coverage badge"
          branch: main
9 changes: 9 additions & 0 deletions Makefile
@@ -6,6 +6,7 @@ XDG_CACHE_HOME ?= $(PWD)/.cache
ENV_VARS = MPLCONFIGDIR=$(MPLCONFIGDIR) XDG_CACHE_HOME=$(XDG_CACHE_HOME)

CACHE_DIRS = $(MPLCONFIGDIR) $(XDG_CACHE_HOME)/fontconfig
TEST_PYTHON ?= $(PYTHON)

all: cache_dirs data comparisons figures heatmaps run_reports master_report

@@ -36,3 +37,11 @@ run_reports: data cache_dirs
.PHONY: master_report
master_report: figures heatmaps run_reports cache_dirs
	$(ENV_VARS) $(PYTHON) -m src.build_master_report --project $(PROJECT)

.PHONY: test
test: cache_dirs
	$(ENV_VARS) $(TEST_PYTHON) -m pytest

.PHONY: coverage
coverage: cache_dirs
	$(ENV_VARS) $(TEST_PYTHON) -m pytest --cov=src --cov-report=term-missing
5 changes: 5 additions & 0 deletions README.md
@@ -2,6 +2,9 @@

Pipeline for parsing Perplexity DeepSearch outputs, comparing pseudo-enrichment programs to GO results, and generating figures/reports per project. The current default project is `glioblastoma_perplexity_manual`, but the layout supports multiple projects via per-project subdirectories.
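
A minimal sketch of the per-project path resolution this layout implies; `resolve_paths` and `ensure_output_dirs` come from `src/project_paths` as exercised by this PR's tests, and the `data_dir` attribute is likewise taken from those tests:

```python
# Sketch of per-project path resolution, assuming the src.project_paths API
# used by the integration tests in this PR.
from src.project_paths import resolve_paths

paths = resolve_paths("glioblastoma_perplexity_manual")  # current default project
paths.ensure_output_dirs()  # creates the per-project output directories
print(paths.data_dir)       # per-project outputs, e.g. component_mapping.csv
```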

[![Tests](https://github.com/Cellular-Semantics/langpa_validation_tools/actions/workflows/tests.yml/badge.svg)](https://github.com/Cellular-Semantics/langpa_validation_tools/actions/workflows/tests.yml)
![coverage](https://img.shields.io/badge/coverage-71%25-orange)

Copilot AI (Nov 25, 2025):


Coverage percentage mismatch: The README shows 71% coverage, but the generated SVG badge in badges/coverage.svg shows 81%. These should be consistent. Consider using the badge file directly or ensuring both are updated together.

Suggested change:
-![coverage](https://img.shields.io/badge/coverage-71%25-orange)
+![coverage](badges/coverage.svg)
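
To keep the two numbers from drifting, a small check along these lines could run in CI. This is a sketch only, assuming the Cobertura-style `coverage.xml` that `--cov-report=xml` writes and the badge path used in the workflow:

```python
# Sketch: fail if the committed badge no longer matches coverage.xml (assumed paths).
import xml.etree.ElementTree as ET
from pathlib import Path

rate = float(ET.parse("coverage.xml").getroot().get("line-rate"))
pct = round(rate * 100)
svg = Path("badges/coverage.svg").read_text()
assert f"{pct}%" in svg, f"badge is stale: coverage.xml says {pct}%"
```

Wiring something like this into the test step would catch the 71% vs 81% drift this comment describes.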


## Layout
- `projects/<project>/`: mapping files (`geneset_folder_mapping.csv`, `run_file_mapping.csv`), source spreadsheet (e.g., `media-3 (2).xlsx`), `description.md`.
- Inputs: `deepsearch/<project>/run_*.md`, `Comparisons/<project>/comparison geneset_*.md`, `schemas/<project>/` (placeholder).
@@ -12,6 +15,8 @@ Pipeline for parsing Perplexity DeepSearch outputs, comparing pseudo-enrichment
```bash
# activate your venv first (requires pandas, numpy, matplotlib, etc.)
PROJECT=glioblastoma_perplexity_manual make master_report
make test # run pytest
make coverage # pytest with coverage report
```
Targets: `data` (parse runs), `comparisons` (parse GO tables), `figures`, `heatmaps`, `run_reports`, `master_report`. Environment variables `MPLCONFIGDIR` and `XDG_CACHE_HOME` default to repo-local caches.
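
The cache defaults matter because matplotlib reads `MPLCONFIGDIR` at import time. A minimal sketch of the same setup in plain Python (the exact cache paths here are illustrative, mirroring the workflow's `.mpl-cache`/`.cache` choices):

```python
# Sketch of the repo-local cache setup the Makefile provides (illustrative paths).
import os
from pathlib import Path

os.environ.setdefault("MPLCONFIGDIR", str(Path.cwd() / ".mpl-cache"))
os.environ.setdefault("XDG_CACHE_HOME", str(Path.cwd() / ".cache"))
Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
(Path(os.environ["XDG_CACHE_HOME"]) / "fontconfig").mkdir(parents=True, exist_ok=True)

import matplotlib  # must happen after MPLCONFIGDIR is set
```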

21 changes: 21 additions & 0 deletions badges/coverage.svg
(New SVG badge file; GitHub does not render a diff for it.)
13 changes: 13 additions & 0 deletions pyproject.toml
@@ -10,3 +10,16 @@ dependencies = [
"openai>=1.0.0",
"python-dotenv",
]

[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "--strict-markers"

[tool.coverage.run]
source = ["src"]
branch = true
omit = ["src/embed_*", "src/build_master_report.py" ]

Copilot AI (Nov 25, 2025):


Trailing whitespace after the closing bracket. Consider removing it for cleaner code.

Suggested change:
-omit = ["src/embed_*", "src/build_master_report.py" ]
+omit = ["src/embed_*", "src/build_master_report.py"]


[tool.coverage.report]
show_missing = true
skip_covered = true
3 changes: 3 additions & 0 deletions requirements-dev.txt
@@ -0,0 +1,3 @@
pytest
pytest-cov
coverage-badge
4 changes: 2 additions & 2 deletions src/build_component_mapping.py
@@ -57,10 +57,10 @@ def tokenize(annotation: str) -> list[str]:
     return cleaned
 
 
-def main() -> None:
+def main(argv: list[str] | None = None) -> None:
     parser = argparse.ArgumentParser(description="Build component token mapping for a project.")
     add_project_argument(parser)
-    args = parser.parse_args()
+    args = parser.parse_args(argv)
     paths = resolve_paths(args.project)
     paths.ensure_output_dirs()
     if not paths.s10_file.exists():
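
The motivation for threading `argv` through `main` is testability: callers can pass arguments directly instead of patching `sys.argv`. A sketch of the calling pattern the new integration tests rely on (the project name is illustrative):

```python
# Sketch: drive the CLI entry point from a test without touching sys.argv.
from src.build_component_mapping import main

main(["--project", "tmp_proj"])  # explicit argv, as the integration tests do
# main()  # with no argument, argparse falls back to sys.argv[1:]
```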
4 changes: 2 additions & 2 deletions src/build_go_terms.py
@@ -26,10 +26,10 @@ def parse_terms(raw: str) -> list[tuple[str, str]]:
     return results
 
 
-def main() -> None:
+def main(argv: list[str] | None = None) -> None:
     parser = argparse.ArgumentParser(description="Extract GO terms from Table S10 for a project.")
     add_project_argument(parser)
-    args = parser.parse_args()
+    args = parser.parse_args(argv)
     paths = resolve_paths(args.project)
     paths.ensure_output_dirs()
     if not paths.s10_file.exists():
6 changes: 2 additions & 4 deletions src/extract_run_payloads.py
@@ -11,7 +11,6 @@
 from .project_paths import add_project_argument, resolve_paths
 
 
-
 def extract_citations(text: str) -> list[dict]:
     citations: list[dict] = []
     pattern = re.compile(r"^\[\^([^\]]+)\]:\s*(\S+)", re.MULTILINE)
@@ -20,7 +19,6 @@ def extract_citations(text: str) -> list[dict]:
     return citations
 
 
-def main() -> None:
+def main(argv: list[str] | None = None) -> None:
     parser = argparse.ArgumentParser(description="Extract DeepSearch payloads and citation footnotes for a project.")
     add_project_argument(parser)
@@ -42,13 +40,13 @@ def main(argv: list[str] | None = None) -> None:
         payload = parse_run(run_file)
 
         rel_folder = payload_dir / folder.name
-        rel_folder.mkdir(exist_ok=True)
+        rel_folder.mkdir(parents=True, exist_ok=True)
         payload_path = rel_folder / f"{run_name}.json"
         payload_path.write_text(json.dumps(payload, indent=2))
 
         citations = extract_citations(text)
         cite_folder = citation_dir / folder.name
-        cite_folder.mkdir(exist_ok=True)
+        cite_folder.mkdir(parents=True, exist_ok=True)
         citation_path = cite_folder / f"{run_name}_citations.json"
         citation_path.write_text(json.dumps(citations, indent=2))
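
The `parents=True` change matters on a fresh checkout: `mkdir(exist_ok=True)` still raises `FileNotFoundError` when an intermediate directory is missing. A minimal illustration (the paths are hypothetical):

```python
# Why parents=True: create nested output folders in one call (hypothetical paths).
from pathlib import Path

nested = Path("payloads") / "geneset_1"
# nested.mkdir(exist_ok=True)              # FileNotFoundError if "payloads" is absent
nested.mkdir(parents=True, exist_ok=True)  # creates "payloads" too, idempotently
```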
4 changes: 2 additions & 2 deletions src/match_components.py
@@ -17,10 +17,10 @@ def normalize(vectors: np.ndarray) -> np.ndarray:
     return vectors / norms
 
 
-def main() -> None:
+def main(argv: list[str] | None = None) -> None:
     parser = argparse.ArgumentParser(description="Match component embeddings to program embeddings for a project.")
     add_project_argument(parser)
-    args = parser.parse_args()
+    args = parser.parse_args(argv)
     paths = resolve_paths(args.project)
     paths.ensure_output_dirs()
     data_dir = paths.data_dir
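
For context on what `match_components` checks, a sketch of normalize-then-dot-product cosine matching, consistent with the `normalize` helper in the hunk above and the `similarity >= 0.99` assertion in the new tests; the zero-norm guard is an assumption, not taken from the source:

```python
# Sketch of cosine-similarity matching between component and program embeddings.
import numpy as np

def normalize(vectors: np.ndarray) -> np.ndarray:
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0  # assumption: avoid division by zero
    return vectors / norms

components = normalize(np.array([[1.0, 0.0]]))  # one component embedding
programs = normalize(np.array([[1.0, 0.0]]))    # one program embedding
similarity = components @ programs.T            # cosine similarity matrix
print(similarity[0, 0])                         # ~1.0 for identical directions
```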
58 changes: 0 additions & 58 deletions src/rename_runs.py

This file was deleted.

1 change: 1 addition & 0 deletions tests/integration/__init__.py
@@ -0,0 +1 @@
# Package marker for integration tests.
98 changes: 98 additions & 0 deletions tests/integration/test_build_component_and_go_and_match.py
@@ -0,0 +1,98 @@
import json

Copilot AI (Nov 25, 2025):


Import of 'json' is not used.

Suggested change:
-import json

import os
Copy link

Copilot AI (Nov 25, 2025):


Import of 'os' is not used.

Suggested change:
-import os


import numpy as np
import pandas as pd

from src.build_component_mapping import main as build_components_main
from src.build_go_terms import main as build_go_terms_main
from src.match_components import main as match_components_main
from src.project_paths import resolve_paths


def setup_project(monkeypatch, tmp_path):
    monkeypatch.chdir(tmp_path)
    project = "tmp_proj"
    paths = resolve_paths(project)
    paths.ensure_output_dirs()
    return project, paths


def test_build_component_mapping_and_go_terms_main(tmp_path, monkeypatch):
    project, paths = setup_project(monkeypatch, tmp_path)
    # create Table S10
    df = pd.DataFrame(
        {
            "MetaModule": [0],
            "annotation": ["OPC-like 1"],
            "Enriched Pathways": ["Term A (GO:1), Term B (GO:2)"],
        }
    )
    df.to_excel(paths.s10_file, sheet_name="Table S10", index=False)
    # mapping file
    pd.DataFrame(
        {
            "metamodule": [0],
            "annotation": ["OPC-like 1"],
            "original_folder": ["geneset_1"],
            "new_folder": ["00_Test"],
        }
    ).to_csv(paths.mapping_file, index=False)

    build_components_main(["--project", project])
    build_go_terms_main(["--project", project])

    comp_path = paths.data_dir / "component_mapping.csv"
    go_path = paths.data_dir / "go_terms.csv"
    assert comp_path.exists()
    assert go_path.exists()
    comp_df = pd.read_csv(comp_path)
    go_df = pd.read_csv(go_path)
    assert not comp_df.empty
    assert set(go_df["go_term"]) == {"Term A", "Term B"}


def test_match_components_main(tmp_path, monkeypatch):
    project, paths = setup_project(monkeypatch, tmp_path)
    data_dir = paths.data_dir
    data_dir.mkdir(parents=True, exist_ok=True)

    # component mapping
    pd.DataFrame(
        {
            "annotation": ["Test"],
            "folder": ["00_Test"],
            "component_token": ["tok"],
            "component_key": ["tok"],
            "component_order": [1],
            "expanded_name": ["token name"],
            "source_note": ["note"],
        }
    ).to_csv(data_dir / "component_mapping.csv", index=False)

    # component embeddings (single vector)
    np.save(data_dir / "component_embeddings.npy", np.array([[1.0, 0.0]]))
    pd.DataFrame(
        {
            "component_key": ["tok"],
            "component_token": ["tok"],
            "expanded_name": ["token name"],
        }
    ).to_csv(data_dir / "component_embeddings_index.csv", index=False)

    # program embeddings (one program in run 1)
    np.save(data_dir / "embeddings_name.npy", np.array([[1.0, 0.0]]))
    pd.DataFrame(
        {
            "folder": ["00_Test"],
            "run_index": [1],
            "program_index": [0],
            "program_name": ["Prog"],
        }
    ).to_csv(data_dir / "embeddings_index.csv", index=False)

    match_components_main(["--project", project])
    out_path = data_dir / "component_program_matches.csv"
    df = pd.read_csv(out_path)
    assert not df.empty
    assert df.iloc[0]["similarity"] >= 0.99
37 changes: 37 additions & 0 deletions tests/integration/test_generate_heatmaps.py
@@ -0,0 +1,37 @@
import json
import os

Copilot AI (Nov 25, 2025):


Import of 'os' is not used.

Suggested change:
-import os

from pathlib import Path

Copilot AI (Nov 25, 2025):


Import of 'Path' is not used.

Suggested change:
-from pathlib import Path

import tempfile

Copilot AI (Nov 25, 2025):


Import of 'tempfile' is not used.

Suggested change:
-import tempfile


import pandas as pd

from src.generate_heatmaps import generate_heatmaps
from src.project_paths import resolve_paths


def test_generate_heatmaps(tmp_path, monkeypatch):
    monkeypatch.chdir(tmp_path)
    project = "tmp_heatmap"
    paths = resolve_paths(project)
    paths.ensure_output_dirs()

    pd.DataFrame(
        {
            "folder": ["00_Test", "00_Test"],
            "annotation": ["Test", "Test"],
            "run_index": [1, 2],
            "program_index": [0, 0],
            "program_name": ["A", "B"],
            "supporting_genes": [json.dumps(["G1", "G2"]), json.dumps(["G2"])],
        }
    ).to_csv(paths.data_dir / "deepsearch_programs.csv", index=False)

    pd.DataFrame({"folder": ["00_Test"], "annotation": ["Test"], "duplicate": [False]}).to_csv(
        paths.data_dir / "deepsearch_duplicate_runs.csv", index=False
    )

    monkeypatch.setenv("MPLCONFIGDIR", str(tmp_path / ".mpl"))
    monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / ".cache"))

    generate_heatmaps(project)
    assert (paths.analysis_dir / "confusion_heatmaps" / "00_Test_bubble.png").exists()