201 changes: 201 additions & 0 deletions cnmf_factor_cluster_top_genes_200.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions langpa/pyproject.toml
@@ -9,6 +9,7 @@ dependencies = [
"pydantic>=2.7.0",
"pydantic-ai>=1.16.0",
"jsonschema>=4.22.0",
"citeproc-py>=0.6.0",
"cellsem-llm-client @ git+https://github.com/Cellular-Semantics/[email protected]",
"url2ref @ git+https://github.com/Cellular-Semantics/[email protected]",
"deep-research-client @ git+https://github.com/monarch-initiative/deep-research-client.git@main",
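The new citeproc-py dependency is presumably what backs the compact, pre-numbered bibliography entries rendered further down. A minimal sketch of the library's documented CSL-JSON workflow; the item data and style choice here are illustrative, not taken from this PR:

```python
from citeproc import (
    Citation,
    CitationItem,
    CitationStylesBibliography,
    CitationStylesStyle,
    formatter,
)
from citeproc.source.json import CiteProcJSON

# Illustrative CSL-JSON item; in langpa the entries would come from
# resolved citation metadata, not a hard-coded dict.
items = [{
    "id": "doe2024",
    "type": "article-journal",
    "author": [{"family": "Doe", "given": "J"}],
    "title": "Example Paper",
    "container-title": "Nature",
    "issued": {"date-parts": [[2024]]},
}]

source = CiteProcJSON(items)
style = CitationStylesStyle("harvard1", validate=False)  # style bundled with citeproc-py
bibliography = CitationStylesBibliography(style, source, formatter.plain)
bibliography.register(Citation([CitationItem("doe2024")]))

for entry in bibliography.bibliography():
    print(str(entry))
```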
19 changes: 15 additions & 4 deletions langpa/src/langpa/services/deepsearch_configs.py
@@ -42,15 +42,17 @@ class DeepSearchConfig:
"search_domain_filter": [
"pubmed.ncbi.nlm.nih.gov",
"ncbi.nlm.nih.gov/pmc/",
"www.ncbi.nlm.nih.gov",
"europepmc.org",
"biorxiv.org",
"nature.com",
"cell.com",
"science.org",
"sciencedirect.com",
"frontiersin.org",
"journals.plos.org"
],
"reasoning_effort": "high",
"search_recency_filter": "month",
"search_recency_filter": None,
"system_prompt": None, # Will be set dynamically with JSON schema
},
timeout=180,
@@ -65,15 +67,24 @@ class DeepSearchConfig:
"search_domain_filter": [
"pubmed.ncbi.nlm.nih.gov",
"ncbi.nlm.nih.gov/pmc/",
"www.ncbi.nlm.nih.gov",
"europepmc.org",
"biorxiv.org",
"nature.com",
"cell.com",
"science.org",
"sciencedirect.com",
"frontiersin.org",
"journals.plos.org",
"genetics.org",
"academic.oup.com",
"onlinelibrary.wiley.com",
"springer.com",
"springerlink.com",
"rupress.org",
"embopress.org"
],
"reasoning_effort": "high",
"search_recency_filter": "month",
"search_recency_filter": None,
"system_prompt": None, # Will be set dynamically to minimal prompt
},
timeout=180,
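Both configs switch search_recency_filter from "month" to None, lifting the one-month recency restriction so older primary literature can be retrieved. How the client serializes None is not shown in this diff; a plausible sketch is to drop unset filters from the request payload entirely:

```python
extra_kwargs = {
    "reasoning_effort": "high",
    "search_recency_filter": None,  # None = no recency restriction
}

# Hypothetical payload assembly: omit unset filters rather than sending
# explicit nulls to the search API.
payload = {k: v for k, v in extra_kwargs.items() if v is not None}
assert "search_recency_filter" not in payload
```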
6 changes: 4 additions & 2 deletions langpa/src/langpa/services/markdown_reporter.py
@@ -93,9 +93,11 @@ def render_from_container(self, container: dict[str, Any]) -> str:
lines.append("")
lines.append("## Bibliography")
if bibliography.compact_entries:
for idx, entry in enumerate(bibliography.compact_entries, start=1):
lines.append(f"{idx}. {entry}")
# Compact entries already have numbers, render as-is
for entry in bibliography.compact_entries:
lines.append(entry)
else:
# Fallback to manual rendering if no compact entries
for source_id in bibliography.order:
entry = bibliography.entries.get(source_id, {})
title = entry.get("title") or entry.get("Title") or entry.get("id") or ""
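The reporter previously wrapped entries that already carry [n] markers in a second enumerate() counter, producing lines like "1. [1] …". A small illustration of the old versus new rendering (entry strings are made up):

```python
entries = ["[1] Doe J. Example Paper.", "[2] Jones B. Another Paper."]

# Old rendering: enumerate() prepended its own counter.
old = [f"{idx}. {entry}" for idx, entry in enumerate(entries, start=1)]
# -> ["1. [1] Doe J. Example Paper.", "2. [2] Jones B. Another Paper."]

# New rendering: entries are emitted verbatim.
new = list(entries)
# -> ["[1] Doe J. Example Paper.", "[2] Jones B. Another Paper."]
```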
41 changes: 41 additions & 0 deletions langpa/tests/unit/test_markdown_reporter.py
@@ -159,3 +159,44 @@ def test_required_genes_rendering() -> None:
assert "Required genes" in markdown
assert "REQ1" in markdown and "REQ2" in markdown
assert "[2]" in markdown and "needed" in markdown


@pytest.mark.unit
def test_compact_bibliography_rendering_no_double_numbering() -> None:
"""Compact bibliography entries should be rendered as-is without extra numbering."""
container = {
"report": {
"context": {"cell_type": "astrocyte"},
"input_genes": ["FOO1"],
"programs": [
{
"program_name": "Test",
"description": "Desc",
"predicted_cellular_impact": ["impact"],
"evidence_summary": "evidence",
"significance_score": 0.5,
"citations": [{"source_id": "1"}],
"supporting_genes": ["FOO1"],
}
],
"version": "1.0",
},
"citations": {"1": {"id": "1", "URL": "https://example.com/one"}},
"compact_bibliography": {
"entries": [
"[1] Doe J, Smith A. Example Paper. Nature 2024. 10.1038/example",
"[2] Jones B. Another Paper. Science 2023. PMID:12345"
]
},
}

generator = MarkdownReportGenerator()
markdown = generator.render_from_container(container)

# Should NOT have double numbering
assert "1. [1]" not in markdown
assert "2. [2]" not in markdown

# Should have clean entries
assert "[1] Doe J, Smith A. Example Paper" in markdown
assert "[2] Jones B. Another Paper" in markdown
@@ -12,7 +12,11 @@
import pandas as pd

from langpa.services.output_manager import OutputManager
from langpa_validation_tools.comparison import match_programs
from langpa_validation_tools.comparison.metrics import (
compute_combined_similarity,
compute_gene_jaccard,
compute_name_similarity,
)


def compare_runs(
@@ -26,7 +30,10 @@ def compare_runs(
"""Compare all DeepSearch runs in a project.

Loads all container files for the project, compares programs across runs,
and generates a DataFrame with similarity metrics.
and generates a **full pairwise similarity matrix** (not just matched
pairs). A boolean ``is_match`` column flags rows whose combined similarity
meets or exceeds ``threshold``; bubble plots can then render the complete
matrix while reports can still filter to matches.

Args:
project: Project name (directory under output_dir)
@@ -46,6 +53,9 @@
- gene_jaccard: Gene Jaccard similarity
- name_similarity: Program name similarity
- combined_similarity: Combined similarity score
- overlap_count: Number of shared genes
- genes_a_count / genes_b_count: Gene counts per program
- is_match: True if combined_similarity >= threshold

.. code-block:: python

@@ -81,7 +91,11 @@
"program_b",
"gene_jaccard",
"name_similarity",
"combined_similarity"
"combined_similarity",
"overlap_count",
"genes_a_count",
"genes_b_count",
"is_match",
])

# Group containers by query
@@ -109,6 +123,21 @@
containers_by_query[query_name].append((container_path, container_data))

# Compare runs within each query
columns = [
"query",
"run_a",
"run_b",
"program_a",
"program_b",
"gene_jaccard",
"name_similarity",
"combined_similarity",
"overlap_count",
"genes_a_count",
"genes_b_count",
"is_match",
]

all_matches = []

for query_name, containers in containers_by_query.items():
@@ -133,50 +162,68 @@
if not programs_a or not programs_b:
continue

# Match programs
matches = match_programs(
programs_a,
programs_b,
threshold=threshold,
return_unmatched=False
)

# Convert matches to DataFrame rows
for match in matches:
genes_a = match.program_a["supporting_genes"]
genes_b = match.program_b["supporting_genes"]
overlap_count = len(set(genes_a) & set(genes_b))

all_matches.append({
"query": query_name,
"run_a": run_a,
"run_b": run_b,
"program_a": match.program_a["program_name"],
"program_b": match.program_b["program_name"],
"gene_jaccard": match.scores.gene_jaccard,
"name_similarity": match.scores.name_similarity,
"combined_similarity": match.scores.combined,
"overlap_count": overlap_count,
"genes_a_count": len(genes_a),
"genes_b_count": len(genes_b),
})
# Full pairwise similarity matrix (no greedy matching)
for prog_a in programs_a:
for prog_b in programs_b:
genes_a = prog_a.get("supporting_genes", [])
genes_b = prog_b.get("supporting_genes", [])

overlap_count = len(set(genes_a) & set(genes_b))
gene_jac = compute_gene_jaccard(genes_a, genes_b)
name_sim = compute_name_similarity(
prog_a.get("program_name", ""),
prog_b.get("program_name", "")
)
combined = compute_combined_similarity(gene_jac, name_sim)

all_matches.append({
"query": query_name,
"run_a": run_a,
"run_b": run_b,
"program_a": prog_a.get("program_name", "N/A"),
"program_b": prog_b.get("program_name", "N/A"),
"gene_jaccard": gene_jac,
"name_similarity": name_sim,
"combined_similarity": combined,
"overlap_count": overlap_count,
"genes_a_count": len(genes_a),
"genes_b_count": len(genes_b),
"is_match": combined >= threshold,
})

# Create DataFrame
df = pd.DataFrame(all_matches)
df = pd.DataFrame(all_matches, columns=columns)

# Save CSV files if requested
if save_csv and csv_output_dir:
csv_output_dir.mkdir(parents=True, exist_ok=True)

# Save matches
# Save full pairwise matrix (still called program_matches for compatibility)
matches_path = csv_output_dir / "program_matches.csv"
df.to_csv(matches_path, index=False)

# For unmatched programs, we need to track which programs were matched
# For now, create an empty unmatched file
# TODO: Implement proper unmatched tracking in future iteration
# Track programs with no matches above threshold (per run)
unmatched_records: list[dict[str, str]] = []
if not df.empty:
# Programs from run_a perspective
for (query, run, program), has_match in (
df.groupby(["query", "run_a", "program_a"])["is_match"].any().items()
):
if not has_match:
unmatched_records.append(
{"query": query, "run": run, "program_name": program}
)
# Programs from run_b perspective
for (query, run, program), has_match in (
df.groupby(["query", "run_b", "program_b"])["is_match"].any().items()
):
if not has_match:
unmatched_records.append(
{"query": query, "run": run, "program_name": program}
)

unmatched_path = csv_output_dir / "unmatched_programs.csv"
unmatched_df = pd.DataFrame(columns=["query", "run", "program_name"])
unmatched_df = pd.DataFrame(unmatched_records or [], columns=["query", "run", "program_name"])
unmatched_df.to_csv(unmatched_path, index=False)

return df
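For reference, compute_gene_jaccard is the standard set-overlap ratio; the exact weighting inside compute_combined_similarity lives in metrics.py and is not shown in this diff. A self-contained sketch of the pairwise rows and the unmatched-program groupby, assuming plain Jaccard and an illustrative 50/50 blend:

```python
import pandas as pd


def gene_jaccard(a: list[str], b: list[str]) -> float:
    """Standard Jaccard: |A ∩ B| / |A ∪ B| (0.0 when both sets are empty)."""
    sa, sb = set(a), set(b)
    return len(sa & sb) / len(sa | sb) if (sa | sb) else 0.0


# Toy pairwise rows; `combined` here is an assumed 50/50 blend, not the
# actual compute_combined_similarity weighting.
rows = []
for prog_a, genes_a in [("Gliosis", ["GFAP", "VIM"])]:
    for prog_b, genes_b in [("Reactive astrocytes", ["GFAP", "AQP4"]),
                            ("Cell cycle", ["MKI67"])]:
        jac = gene_jaccard(genes_a, genes_b)
        name_sim = 0.4  # placeholder; real code calls compute_name_similarity
        combined = 0.5 * jac + 0.5 * name_sim
        rows.append({
            "query": "q1", "run_a": "run1", "run_b": "run2",
            "program_a": prog_a, "program_b": prog_b,
            "combined_similarity": combined, "is_match": combined >= 0.5,
        })

df = pd.DataFrame(rows)

# Unmatched tracking mirrors the groupby above: a program is unmatched
# when none of its pairwise rows clears the threshold.
unmatched = [
    {"query": q, "run": r, "program_name": p}
    for (q, r, p), has_match
    in df.groupby(["query", "run_a", "program_a"])["is_match"].any().items()
    if not has_match
]
```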
@@ -79,9 +79,14 @@ def _safe_relpath(target: Path, base: Path) -> Path:
"",
]

# Use only matched pairs for reporting stats if available
match_rows = matches_df
if "is_match" in matches_df.columns:
match_rows = matches_df[matches_df["is_match"]]

# Add summary statistics
total_matches = len(matches_df)
queries = matches_df["query"].unique() if "query" in matches_df.columns else []
total_matches = len(match_rows)
queries = match_rows["query"].unique() if "query" in match_rows.columns else []
num_queries = len(queries)

lines.extend([
@@ -91,9 +96,9 @@
])

if total_matches > 0:
avg_gene_jaccard = matches_df["gene_jaccard"].mean()
avg_name_sim = matches_df["name_similarity"].mean()
avg_combined = matches_df["combined_similarity"].mean()
avg_gene_jaccard = match_rows["gene_jaccard"].mean()
avg_name_sim = match_rows["name_similarity"].mean()
avg_combined = match_rows["combined_similarity"].mean()

lines.extend([
f"- **Average Gene Jaccard**: {avg_gene_jaccard:.3f}",
@@ -103,12 +108,12 @@
])

# Per-query sections
if "query" in matches_df.columns and num_queries > 0:
if "query" in match_rows.columns and num_queries > 0:
lines.append("## Per-Query Analysis")
lines.append("")

for query in sorted(queries):
query_matches = matches_df[matches_df["query"] == query]
query_matches = match_rows[match_rows["query"] == query]

lines.extend([
f"### Query: {query}",
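The is_match guard keeps the report generator backward compatible with CSVs produced before the column existed. A condensed sketch of the pattern:

```python
import pandas as pd


def matched_rows(matches_df: pd.DataFrame) -> pd.DataFrame:
    """Filter to matched pairs when the column exists; otherwise treat
    every row as a match (pre-is_match CSVs held only matched pairs)."""
    if "is_match" in matches_df.columns:
        return matches_df[matches_df["is_match"]]
    return matches_df
```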
@@ -23,13 +23,14 @@ def generate_bubble_plot(
figsize: tuple[int, int] = (12, 8),
return_fig: bool = False,
) -> None | tuple[Any, Any]:
"""Generate bubble plot visualization of program matches.
"""Generate bubble plot visualization of program comparisons.

Creates a scatter plot where each point represents a matched program pair.
The x-axis is the program index from run A, the y-axis is the program index
from run B. Bubble size represents the number of overlapping genes, and
color intensity shows the combined similarity score. Axes are numbered and
annotated with gene counts; a legend maps numbers to program names.
Creates a scatter plot where each point represents a program pair (full
pairwise matrix). The x-axis is the program index from run A, the y-axis is
the program index from run B. Bubble size represents the number of
overlapping genes, and color intensity shows the combined similarity score.
Axes are numbered and annotated with gene counts; a legend maps numbers to
program names.

.. code-block:: python

@@ -46,7 +47,7 @@
# Filter by query
generate_bubble_plot(df, Path("query1_plot.png"), query="0_Gliosis")
Args:
matches_df: DataFrame with program matches (from compare_runs)
matches_df: DataFrame with full program comparisons (from compare_runs)
Required columns: program_a, program_b, combined_similarity,
overlap_count, genes_a_count, genes_b_count
output_path: Path to save the PNG file
@@ -62,12 +63,20 @@
else:
df = matches_df.copy()

# Drop pairs with zero overlap (no dot should be shown)
if "overlap_count" in df:
df = df[df["overlap_count"] > 0]
else:
df = df.copy()
df["overlap_count"] = 0
df = df[df["overlap_count"] > 0]

# Handle empty DataFrame
if len(df) == 0:
fig, ax = plt.subplots(figsize=figsize)
ax.text(
0.5, 0.5,
"No matches to display",
"No overlaps to display",
ha="center", va="center",
fontsize=14,
color="gray"
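A minimal sketch of the plot mechanics described in the docstring: zero-overlap pairs are dropped, bubble area tracks shared-gene count, and color tracks combined similarity. The size scaling and colormap here are illustrative, not the module's actual choices:

```python
import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({
    "program_a": ["Gliosis", "Gliosis", "Cell cycle"],
    "program_b": ["Reactive astrocytes", "Cell cycle", "Cell cycle"],
    "overlap_count": [12, 0, 30],
    "combined_similarity": [0.62, 0.05, 0.91],
})
df = df[df["overlap_count"] > 0]  # zero-overlap pairs draw no dot

# Integer program indices for the axes.
x = df["program_a"].astype("category").cat.codes
y = df["program_b"].astype("category").cat.codes

fig, ax = plt.subplots(figsize=(6, 4))
sc = ax.scatter(
    x, y,
    s=df["overlap_count"] * 20,          # bubble area ~ shared genes
    c=df["combined_similarity"],         # color ~ combined similarity
    cmap="viridis", vmin=0, vmax=1,
)
fig.colorbar(sc, ax=ax, label="combined similarity")
ax.set_xlabel("run A program index")
ax.set_ylabel("run B program index")
fig.savefig("bubble_plot_sketch.png", dpi=150)
```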
@@ -107,7 +107,8 @@ def test_run_comparison_e2e() -> None:
# Check columns
expected_columns = [
"query", "run_a", "run_b", "program_a", "program_b",
"gene_jaccard", "name_similarity", "combined_similarity"
"gene_jaccard", "name_similarity", "combined_similarity",
"overlap_count", "genes_a_count", "genes_b_count", "is_match",
]
for col in expected_columns:
assert col in df.columns