Cellular-Semantics · hkir-dev · Oct 14, 2025 · Sep 22, 2025 · Sep 22, 2025 · Sep 22, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+.idea
+.env
+
 literature/
 /cellsem_agent/graphs/cl_validation/resources/
 

diff --git a/Claude.md b/Claude.md
diff --git a/Progress.md b/Progress.md
diff --git a/cellsem_agent/__init__.py b/cellsem_agent/__init__.py
@@ -0,0 +1 @@
+"""Utility modules for cellsem-agent."""
diff --git a/cellsem_agent/agents/ontology_mapping/__init__.py b/cellsem_agent/agents/ontology_mapping/__init__.py
diff --git a/cellsem_agent/agents/ontology_mapping/ontology_mapping_agent.py b/cellsem_agent/agents/ontology_mapping/ontology_mapping_agent.py
@@ -0,0 +1,83 @@
+"""
+Ontology Mapping Agent for mapping terms to multiple ontologies.
+"""
+import logging
+from pydantic_ai import Agent
+
+from cellsem_agent.graphs.gene_list_annotation.gene_annotation_schemas import MappingResult
+from .ontology_mapping_config import OntologyMappingDependencies
+from .ontology_mapping_tools import search_go, search_cl, search_uberon, search_chebi, search_multi_ontology
+
+ontology_mapping_logger = logging.getLogger(__name__)
+ontology_mapping_logger.setLevel(logging.INFO)
+console = logging.StreamHandler()
+console.setLevel(logging.INFO)
+formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+console.setFormatter(formatter)
+ontology_mapping_logger.addHandler(console)
+
+ontology_mapping_logger.propagate = False
+
+ONTOLOGY_MAPPING_SYSTEM_PROMPT = """
+You are an expert bioinformatics specialist and ontology curator focused on mapping biological terms to standardized ontologies.
+
+Your task is to map atomic functions, cellular components, and other biological terms to appropriate terms in multiple ontologies including:
+
+- **GO (Gene Ontology)**: For molecular functions, biological processes, and cellular components
+- **CL (Cell Ontology)**: For cell types and cellular structures
+- **UBERON**: For anatomical structures and tissues
+- **ChEBI**: For chemical compounds and molecular entities
+
+**For each input term, you must**:
+1. **Search multiple ontologies** using the available search tools
+2. **Select the best match** based on semantic similarity and biological accuracy
+3. **Assign confidence scores** (0.0-1.0) based on match quality
+4. **Document the mapping method** used (e.g., "exact_match", "partial_match", "synonym_match")
+
+**Guidelines for high-quality mappings**:
+- Prioritize exact matches over partial matches
+- Use the most specific term available (child terms over parent terms when appropriate)
+- Consider biological context when multiple matches are available
+- For compound terms, try searching individual components if no direct match is found
+- Convert plurals to singular before searching
+- Try alternative phrasings (e.g., "X of Y" vs "Y X")
+
+**Search Strategy**:
+- Always start with the most relevant ontology for the term type
+- Try multiple search strategies if the initial search fails:
+  - Search for synonyms or related terms
+  - Break down compound terms into components
+  - Try different word orders and phrasings
+- If no suitable match is found in any ontology, set cl_id to "NO MATCH found"
+
+**Output Format**:
+For each input term, create a mapping object with:
+- `original_term`: The input term exactly as provided
+- `ontology_id`: The matched ontology term ID (e.g., "GO:0008150", "CL:0000000")
+- `ontology_label`: The official label from the ontology
+- `ontology_source`: The ontology name (GO, CL, UBERON, ChEBI)
+- `confidence_score`: Float from 0.0 to 1.0
+- `mapping_method`: Description of how the match was found
+
+Available tools:
+- `search_go`: Search Gene Ontology
+- `search_cl`: Search Cell Ontology
+- `search_uberon`: Search UBERON anatomy ontology
+- `search_chebi`: Search ChEBI chemical ontology
+- `search_multi_ontology`: Search multiple ontologies at once
+"""
+
+ontology_mapping_agent = Agent(
+    model="openai:gpt-4o-2024-11-20",
+    deps_type=OntologyMappingDependencies,
+    output_type=MappingResult,
+    system_prompt=ONTOLOGY_MAPPING_SYSTEM_PROMPT,
+    defer_model_check=True,
+)
+
+# Register tools
+ontology_mapping_agent.tool(search_go)
+ontology_mapping_agent.tool(search_cl)
+ontology_mapping_agent.tool(search_uberon)
+ontology_mapping_agent.tool(search_chebi)
+ontology_mapping_agent.tool(search_multi_ontology)
diff --git a/cellsem_agent/agents/ontology_mapping/ontology_mapping_config.py b/cellsem_agent/agents/ontology_mapping/ontology_mapping_config.py
@@ -0,0 +1,42 @@
+"""
+Configuration for the Ontology Mapping agent.
+"""
+from dataclasses import dataclass
+import os
+from typing import List, Optional
+from pathlib import Path
+
+
+@dataclass
+class OntologyMappingDependencies:
+    """
+    Configuration for the Ontology Mapping agent.
+
+    Maps atomic functions and cellular components to terms in OLS ontologies.
+    Target ontologies include GO, CL, UBERON, and others as appropriate.
+    """
+    workdir: Optional[Path] = None
+    target_ontologies: Optional[List[str]] = None
+
+    def __post_init__(self):
+        """Initialize the config with default values."""
+        if self.workdir is None:
+            workdir_path = os.environ.get("WORKDIR", "./workdir")
+            self.workdir = Path(workdir_path)
+            # Create workdir if it doesn't exist
+            self.workdir.mkdir(parents=True, exist_ok=True)
+
+        if self.target_ontologies is None:
+            # Default ontologies for functional and anatomical mapping
+            self.target_ontologies = [
+                "GO",     # Gene Ontology (molecular functions, biological processes, cellular components)
+                "CL",     # Cell Ontology
+                "UBERON", # Uber-anatomy ontology
+                "CHEBI",  # Chemical Entities of Biological Interest
+                "PR"      # Protein Ontology
+            ]
+
+
+def get_config(target_ontologies: Optional[List[str]] = None) -> OntologyMappingDependencies:
+    """Get the Ontology Mapping configuration from environment variables or defaults."""
+    return OntologyMappingDependencies(target_ontologies=target_ontologies)
diff --git a/cellsem_agent/agents/ontology_mapping/ontology_mapping_tools.py b/cellsem_agent/agents/ontology_mapping/ontology_mapping_tools.py
@@ -0,0 +1,136 @@
+"""
+Tools for the Ontology Mapping agent.
+"""
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+from oaklib import get_adapter
+from pydantic_ai import RunContext
+
+logger = logging.getLogger(__name__)
+
+
+def search_go(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
+    """
+    Search the Gene Ontology for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+
+    Returns:
+        A list of tuples, each containing a GO ID and a label.
+    """
+    try:
+        adapter = get_adapter("ols:go")
+        results = adapter.basic_search(term)
+        labels = list(adapter.labels(results))
+        logger.info(f"GO search for '{term}' returned {len(labels)} results")
+        return labels
+    except Exception as e:
+        logger.error(f"Error searching GO for term '{term}': {e}")
+        return []
+
+
+def search_cl(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
+    """
+    Search the Cell Ontology for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+
+    Returns:
+        A list of tuples, each containing a CL ID and a label.
+    """
+    try:
+        adapter = get_adapter("ols:cl")
+        results = adapter.basic_search(term)
+        labels = list(adapter.labels(results))
+        logger.info(f"CL search for '{term}' returned {len(labels)} results")
+        return labels
+    except Exception as e:
+        logger.error(f"Error searching CL for term '{term}': {e}")
+        return []
+
+
+def search_uberon(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
+    """
+    Search the UBERON anatomy ontology for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+
+    Returns:
+        A list of tuples, each containing a UBERON ID and a label.
+    """
+    try:
+        adapter = get_adapter("ols:uberon")
+        results = adapter.basic_search(term)
+        labels = list(adapter.labels(results))
+        logger.info(f"UBERON search for '{term}' returned {len(labels)} results")
+        return labels
+    except Exception as e:
+        logger.error(f"Error searching UBERON for term '{term}': {e}")
+        return []
+
+
+def search_chebi(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
+    """
+    Search the ChEBI chemical ontology for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+
+    Returns:
+        A list of tuples, each containing a ChEBI ID and a label.
+    """
+    try:
+        adapter = get_adapter("ols:chebi")
+        results = adapter.basic_search(term)
+        labels = list(adapter.labels(results))
+        logger.info(f"ChEBI search for '{term}' returned {len(labels)} results")
+        return labels
+    except Exception as e:
+        logger.error(f"Error searching ChEBI for term '{term}': {e}")
+        return []
+
+
+def search_multi_ontology(ctx: RunContext[str], term: str, ontologies: List[str] = None) -> Dict[str, List[Tuple[str, str]]]:
+    """
+    Search multiple ontologies for a term.
+
+    Args:
+        ctx: The run context
+        term: The term to search for.
+        ontologies: List of ontology prefixes to search (e.g., ['GO', 'CL', 'UBERON'])
+
+    Returns:
+        A dictionary mapping ontology names to lists of (ID, label) tuples.
+    """
+    if ontologies is None:
+        ontologies = ['GO', 'CL', 'UBERON', 'CHEBI']
+
+    results = {}
+    search_functions = {
+        'GO': search_go,
+        'CL': search_cl,
+        'UBERON': search_uberon,
+        'CHEBI': search_chebi
+    }
+
+    for ont in ontologies:
+        if ont in search_functions:
+            try:
+                results[ont] = search_functions[ont](ctx, term)
+            except Exception as e:
+                logger.error(f"Error searching {ont} for term '{term}': {e}")
+                results[ont] = []
+        else:
+            logger.warning(f"Ontology {ont} not supported")
+            results[ont] = []
+
+    return results
diff --git a/cellsem_agent/services/gene_list_contextual_deepsearch/decomposer.py b/cellsem_agent/services/gene_list_contextual_deepsearch/decomposer.py
@@ -0,0 +1,34 @@
+from cellsem_agent.utils.openai.simple_response_wrapper import SimpleResponder
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def decompose(genelist_annotation):
+    with open('./cellsem_agent/services/gene_list_contextual_deepsearch/schema/deepsearch_results_schema.json',
+              "r") as f:
+        schema = f.read()
+    sr = SimpleResponder(timeout=45)
+    prompt = f"""The following JSON document details a set of gene programs.  For each program, 
+     Use the contents of the program_name and description fields to break the program down into atomic biological 
+     processes and cell component, adding these to the JSON document in a manner compliant with the schema provided.
+       JSON doc: 
+
+       ```JSON
+       {genelist_annotation} 
+       ```
+       JSON schema:
+
+       ```JSON
+       {schema}
+       ```
+       """
+    res = sr.ask(
+        model='gpt5',
+        prompt=prompt,
+        instructions="""You understand how to break down the meaning of the language of biology into its component parts.
+        You can fluently and accurately read JSON schema and write compliant JSON""",
+        temperature=0.3,
+        max_output_tokens=500,
+    )
+    print(res.status, res.elapsed_sec, "s")
+    return res.output_text or ""
diff --git a/cellsem_agent/services/gene_list_contextual_deepsearch/examples/AT2_combined_input.json b/cellsem_agent/services/gene_list_contextual_deepsearch/examples/AT2_combined_input.json
@@ -0,0 +1,7 @@
+{
+  "genes": [
+    "ABCA3","NAPSA","CTSH","SFTPB","SFTPC","ABCA3","NAPSA","CTSH","SFTPB","SFTPC"
+  ],
+  "context": "enriched gene list for cluster of cells in scRNAseq data from lung",
+  "description": ""
+}