Cellular-Semantics · hkir-dev · Oct 14, 2025 · Sep 22, 2025 · Sep 22, 2025 · Sep 22, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+.idea
+.env
+
 literature/
 /cellsem_agent/graphs/cl_validation/resources/val_annotations.json
 

diff --git a/Claude.md b/Claude.md
@@ -0,0 +1,11 @@
+You are an experienced developer of agentic workflows in Python. You write well documented code with good test coverage.  You build carefully and incrementally, making sure any user can run tests for each element of the workflows you build as well as for the whole workflow.
+
+This repo features agentic workflows structured using the Pydantic library, with graphs specifying how agents are orchestrated into workflows.
+
+Current aim:
+   - A workflow, 'contextual_gene_list_annotation', for annotating gene lists with predicted functional implications for cells that express genes on the list in some given cell-type/tissue/disease context. This workflow has three steps:
+	 1. A deepsearch query that takes a gene list and a context statement as input. Output will be structured according to a provided JSON schema. This step uses the OpenAI deepsearch API.  It DOES NOT USE AN AGENT.  The prompts for this have already been written and tested in OpenAI chat.  The output (as described by these prompts) consists of a set of ranked lists of composite functions.
+	 2. The output of step 1 is passed to an agent that decomposes terms into atomic processes, components, cell types, etc., and relates them to each other. The results are used to update the JSON output from step 1.
+	 3. A third step uses an agent to map these atomic terms to ontology terms, updating the JSON again.
+
+
diff --git a/Progress.md b/Progress.md
@@ -0,0 +1,17 @@
+Starting point and progress log:
+
+In this repo you will find:
+  - A command line runner: gene_list_annotation_cli.py for contextual_gene_list_annotation
+    - This is not yet linked to any workflow
+  - A utilities package with code for running OpenAI deepsearch and one-shot LLM queries via the API: cellsem_agent/utils/openai/
+  - An ontology mapping agent at cellsem_agent/agents/ontology_mapping/
+  - A working service for running deepsearch queries --> structured gene list annotation
+    - This has been tested and includes examples 
+  - An untested service for decomposing compound functions from step one into components and updating the schema
+
+TODO:
+ - [ ] Write workflow as new graph combining components
+ - [ ] Hook workflow up to CLI runner
+ - [ ] Refine prompts and schema
+ - [ ] Extend examples - focussing on things we can use as objective tests.
+
diff --git a/cellsem_agent/__init__.py b/cellsem_agent/__init__.py
@@ -0,0 +1 @@
+"""Utility modules for cellsem-agent."""
diff --git a/cellsem_agent/agents/annotator/annotator_agent.py b/cellsem_agent/agents/annotator/annotator_agent.py
@@ -1,7 +1,6 @@
 """
 Ontology based Annotator Agent.
 """
-
 import logging
 from typing import List, Optional
 

diff --git a/cellsem_agent/agents/ontology_mapping/__init__.py b/cellsem_agent/agents/ontology_mapping/__init__.py
diff --git a/cellsem_agent/agents/ontology_mapping/ontology_mapping_agent.py b/cellsem_agent/agents/ontology_mapping/ontology_mapping_agent.py
@@ -0,0 +1,101 @@
+"""
+Ontology Mapping Agent for mapping terms to multiple ontologies.
+"""
+import logging
+from typing import List, Optional
+
+from pydantic import BaseModel
+from pydantic_ai import Agent
+
+from .ontology_mapping_config import OntologyMappingDependencies
+from .ontology_mapping_tools import search_go, search_cl, search_uberon, search_chebi, search_multi_ontology
+
+ontology_mapping_logger = logging.getLogger(__name__)
+ontology_mapping_logger.setLevel(logging.INFO)
+console = logging.StreamHandler()
+console.setLevel(logging.INFO)
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+console.setFormatter(formatter)
+ontology_mapping_logger.addHandler(console)
+
+ontology_mapping_logger.propagate = False
+
+ONTOLOGY_MAPPING_SYSTEM_PROMPT = """
+You are an expert bioinformatics specialist and ontology curator focused on mapping biological terms to standardized ontologies.
+
+You will be provided with a JSON array where each item represents a biological term that may include atomic functions, cellular components, or other relevant biological concepts.
+Your task is to map atomic functions, cellular components, and other biological terms to appropriate terms in multiple ontologies including:
+
+- **GO (Gene Ontology)**: For molecular functions, biological processes, and cellular components
+- **CL (Cell Ontology)**: For cell types and cellular structures
+- **UBERON**: For anatomical structures and tissues
+- **ChEBI**: For chemical compounds and molecular entities
+
+**For each input term, you must**:
+1. **Search multiple ontologies** using the available search tools
+2. **Select the best match** based on semantic similarity and biological accuracy
+3. **Assign confidence scores** (0.0-1.0) based on match quality
+4. **Document the mapping method** used (e.g., "exact_match", "partial_match", "synonym_match")
+
+**Guidelines for high-quality mappings**:
+- Prioritize exact matches over partial matches
+- Use the most specific term available (child terms over parent terms when appropriate)
+- Consider biological context when multiple matches are available
+- For compound terms, try searching individual components if no direct match is found
+- Convert plurals to singular before searching
+- Try alternative phrasings (e.g., "X of Y" vs "Y X")
+
+**Search Strategy**:
+- Always start with the most relevant ontology for the term type
+- Try multiple search strategies if the initial search fails:
+  - Search for synonyms or related terms
+  - Break down compound terms into components
+  - Try different word orders and phrasings
+- If no suitable match is found in any ontology, set cl_id to "NO MATCH found"
+
+**Output Format**:
+For each input term, create a MappingResult object with:
+- `original_term`: The input term exactly as provided
+- `ontology_id`: The matched ontology term ID (e.g., "GO:0008150", "CL:0000000")
+- `ontology_label`: The official label from the ontology
+- `ontology_source`: The ontology name (GO, CL, UBERON, ChEBI)
+- `confidence_score`: Float from 0.0 to 1.0
+- `mapping_method`: Description of how the match was found
+
+Available tools:
+- `search_go`: Search Gene Ontology
+- `search_cl`: Search Cell Ontology
+- `search_uberon`: Search UBERON anatomy ontology
+- `search_chebi`: Search ChEBI chemical ontology
+- `search_multi_ontology`: Search multiple ontologies at once
+"""
+
+class MappingResult(BaseModel):
+    """
+    A mapping result is a span of text and the ontology ID and label for the term it mentions.
+    Use `original_term` for the source text, and `ontology_id` and `ontology_label` for the ID and label of the entity in the ontology.
+    """
+    original_term: str  # required: the input term exactly as provided
+    ontology_id: Optional[str] = None  # matched term ID, e.g. "GO:0008150"
+    ontology_label: Optional[str] = None  # official label of the matched term
+    ontology_source: Optional[str] = None  # ontology name: GO, CL, UBERON or ChEBI
+    confidence_score: Optional[float] = None  # 0.0-1.0 per the system prompt; the range is not validated here
+    mapping_method: Optional[str] = None  # how the match was found, e.g. "exact_match"
+
+class MappingResults(BaseModel):  # container model passed to the agent as its output_type
+    mappings: List[MappingResult]  # the system prompt asks for one MappingResult per input term
+
+ontology_mapping_agent = Agent(
+    model="openai:gpt-4o-2024-11-20",  # pinned, dated model identifier
+    deps_type=OntologyMappingDependencies,  # dependency object type declared for agent runs
+    output_type=MappingResults,  # the agent's reply is structured as MappingResults
+    system_prompt=ONTOLOGY_MAPPING_SYSTEM_PROMPT,
+    defer_model_check=True,  # NOTE(review): presumably postpones model/credential validation until run time - confirm
+)
+
+# Register the search functions as agent tools; their names match the "Available tools" list in the system prompt.
+ontology_mapping_agent.tool(search_go)
+ontology_mapping_agent.tool(search_cl)
+ontology_mapping_agent.tool(search_uberon)
+ontology_mapping_agent.tool(search_chebi)
+ontology_mapping_agent.tool(search_multi_ontology)
diff --git a/cellsem_agent/agents/ontology_mapping/ontology_mapping_config.py b/cellsem_agent/agents/ontology_mapping/ontology_mapping_config.py
@@ -0,0 +1,42 @@
+"""
+Configuration for the Ontology Mapping agent.
+"""
+from dataclasses import dataclass
+import os
+from typing import List, Optional
+from pathlib import Path
+
+
+@dataclass
+class OntologyMappingDependencies:
+    """
+    Configuration for the Ontology Mapping agent.
+
+    Maps atomic functions and cellular components to terms in OLS ontologies.
+    Target ontologies include GO, CL, UBERON, and others as appropriate.
+    """
+    workdir: Optional[Path] = None
+    target_ontologies: Optional[List[str]] = None
+
+    def __post_init__(self):
+        """Initialize the config with default values."""
+        if self.workdir is None:
+            workdir_path = os.environ.get("WORKDIR", "./workdir")
+            self.workdir = Path(workdir_path)
+            # Create workdir if it doesn't exist
+            self.workdir.mkdir(parents=True, exist_ok=True)
+
+        if self.target_ontologies is None:
+            # Default ontologies for functional and anatomical mapping
+            self.target_ontologies = [
+                "GO",     # Gene Ontology (molecular functions, biological processes, cellular components)
+                "CL",     # Cell Ontology
+                "UBERON", # Uber-anatomy ontology
+                "CHEBI",  # Chemical Entities of Biological Interest
+                "PR"      # Protein Ontology
+            ]
+
+
+def get_config(target_ontologies: Optional[List[str]] = None) -> OntologyMappingDependencies:
+    """Get the Ontology Mapping configuration from environment variables or defaults."""
+    return OntologyMappingDependencies(target_ontologies=target_ontologies)
diff --git a/cellsem_agent/agents/ontology_mapping/ontology_mapping_tools.py b/cellsem_agent/agents/ontology_mapping/ontology_mapping_tools.py
@@ -0,0 +1,136 @@
+"""
+Tools for the Ontology Mapping agent.
+"""
+import os
+import logging
+from typing import List, Tuple, Dict, Any
+
+from oaklib import get_adapter
+from pydantic_ai import RunContext
+
+logger = logging.getLogger(__name__)  # module-level logger used by every search function in this file
+
+
+def search_go(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
+    """
+    Search the Gene Ontology for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+
+    Returns:
+        A list of tuples, each containing a GO ID and a label.
+    """
+    try:
+        adapter = get_adapter("ols:go")
+        results = adapter.basic_search(term)
+        labels = list(adapter.labels(results))
+        logger.info(f"GO search for '{term}' returned {len(labels)} results")
+        return labels
+    except Exception as e:
+        logger.error(f"Error searching GO for term '{term}': {e}")
+        return []
+
+
+def search_cl(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
+    """
+    Search the Cell Ontology for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+
+    Returns:
+        A list of tuples, each containing a CL ID and a label.
+    """
+    try:
+        adapter = get_adapter("ols:cl")
+        results = adapter.basic_search(term)
+        labels = list(adapter.labels(results))
+        logger.info(f"CL search for '{term}' returned {len(labels)} results")
+        return labels
+    except Exception as e:
+        logger.error(f"Error searching CL for term '{term}': {e}")
+        return []
+
+
+def search_uberon(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
+    """
+    Search the UBERON anatomy ontology for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+
+    Returns:
+        A list of tuples, each containing a UBERON ID and a label.
+    """
+    try:
+        adapter = get_adapter("ols:uberon")
+        results = adapter.basic_search(term)
+        labels = list(adapter.labels(results))
+        logger.info(f"UBERON search for '{term}' returned {len(labels)} results")
+        return labels
+    except Exception as e:
+        logger.error(f"Error searching UBERON for term '{term}': {e}")
+        return []
+
+
+def search_chebi(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
+    """
+    Search the ChEBI chemical ontology for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+
+    Returns:
+        A list of tuples, each containing a ChEBI ID and a label.
+    """
+    try:
+        adapter = get_adapter("ols:chebi")
+        results = adapter.basic_search(term)
+        labels = list(adapter.labels(results))
+        logger.info(f"ChEBI search for '{term}' returned {len(labels)} results")
+        return labels
+    except Exception as e:
+        logger.error(f"Error searching ChEBI for term '{term}': {e}")
+        return []
+
+
+def search_multi_ontology(ctx: RunContext[str], term: str, ontologies: List[str] = None) -> Dict[str, List[Tuple[str, str]]]:
+    """
+    Search multiple ontologies for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+        ontologies: List of ontology prefixes to search (e.g., ['GO', 'CL', 'UBERON'])
+
+    Returns:
+        A dictionary mapping ontology names to lists of (ID, label) tuples.
+    """
+    if ontologies is None:
+        ontologies = ['GO', 'CL', 'UBERON', 'ChEBI']
+
+    results = {}
+    search_functions = {
+        'GO': search_go,
+        'CL': search_cl,
+        'UBERON': search_uberon,
+        'ChEBI': search_chebi
+    }
+
+    for ont in ontologies:
+        if ont in search_functions:
+            try:
+                results[ont] = search_functions[ont](ctx, term)
+            except Exception as e:
+                logger.error(f"Error searching {ont} for term '{term}': {e}")
+                results[ont] = []
+        else:
+            logger.warning(f"Ontology {ont} not supported")
+            results[ont] = []
+
+    return results
diff --git a/cellsem_agent/graphs/cxg_annotate/cxg_annotate_graph_v2.py b/cellsem_agent/graphs/cxg_annotate/cxg_annotate_graph_v2.py
@@ -28,7 +28,7 @@
 
 ANNOTATIONS_BATCH_SIZE = 5
 
-IS_TEST_MODE = False
+IS_TEST_MODE = True
 TEST_ANNOTATIONS_COUNT = 4  # Number of annotations to process in test mode
 
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -158,7 +158,7 @@ async def run(self, ctx: GraphRunContext[State]) -> GetGroundings:
                 cc_labels = [{"cc.label": ann['annotation_text']} for ann in batch]
 
                 if not os.path.exists(dataset_cache):
-                    full_text_path = os.path.join(EXPANSIONS_DIR, f"{normalise_file_name(article_pmc)}.txt")
+                    full_text_path = os.path.join(PUBLICATIONS_DIR, f"{normalise_file_name(article_pmc)}.txt")
                     if os.path.exists(full_text_path):
                         with open(full_text_path, 'r', encoding='utf-8') as f:
                             paper_full_text = f.read()

diff --git a/cellsem_agent/graphs/gene_annotator/__init__.py b/cellsem_agent/graphs/gene_annotator/__init__.py