Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
64a67cb
Second attempt after vibe coding got a bit out of control on first
dosumis Sep 22, 2025
3057725
Claude md update + tweaks to decomposer prompt & model
dosumis Sep 22, 2025
bd8ce19
updated schema, prompt for deep search contextual and added examples
Caroline-99 Sep 22, 2025
88a7a54
updated schema and added astromocyte examples
Caroline-99 Sep 22, 2025
104db80
Tweaks to prompt and schema + results of tests
dosumis Sep 23, 2025
38a3c15
added neutrophil_results
Caroline-99 Sep 23, 2025
8d50fa4
gene annotation graph initial implementation
hkir-dev Sep 23, 2025
e7e1549
gene annotation graph initial implementation
hkir-dev Sep 23, 2025
9dbe559
gene annotation graph ontology mapper agent integration
hkir-dev Sep 23, 2025
4a79de2
gene annotation graph ontology mapper agent integration first run
hkir-dev Sep 23, 2025
66d71ee
gene annotation graph ontology mapper agent integration first run
hkir-dev Sep 23, 2025
5653492
gene annotation graph ontology mapper agent integration first run
hkir-dev Sep 23, 2025
79d3d57
gene annotation graph e2e run success
hkir-dev Sep 23, 2025
dc33dc3
gene annotation graph e2e run success
hkir-dev Sep 23, 2025
0b8c693
gene annotation graph e2e run success
hkir-dev Sep 23, 2025
eb19cd4
gene annotation graph AT2 rerun
hkir-dev Sep 23, 2025
4a9aef6
Prompts + new data + supporting example lists
dosumis Oct 6, 2025
ec989b8
glioblastoma_states_minimal experiment run
hkir-dev Oct 7, 2025
f45532f
o3-deep-research experiement
hkir-dev Oct 7, 2025
a78c931
o3-deep-research experiement
hkir-dev Oct 7, 2025
aef9b53
gliosis experiment redone
hkir-dev Oct 7, 2025
da8576e
gliosis experiment redone
hkir-dev Oct 7, 2025
75cf683
Merge branch 'main' into contextual_gene_list_annotator_take_2
hkir-dev Oct 14, 2025
01b8dd7
fix pydanticai version issue
hkir-dev Oct 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
.idea
.env

literature/
/cellsem_agent/graphs/cl_validation/resources/

Expand Down
Empty file added Claude.md
Empty file.
Empty file added Progress.md
Empty file.
1 change: 1 addition & 0 deletions cellsem_agent/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Utility modules for cellsem-agent."""
Empty file.
83 changes: 83 additions & 0 deletions cellsem_agent/agents/ontology_mapping/ontology_mapping_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Ontology Mapping Agent for mapping terms to multiple ontologies.
"""
import logging
from pydantic_ai import Agent

from cellsem_agent.graphs.gene_list_annotation.gene_annotation_schemas import MappingResult
from .ontology_mapping_config import OntologyMappingDependencies
from .ontology_mapping_tools import search_go, search_cl, search_uberon, search_chebi, search_multi_ontology

# Module-specific logger; records below INFO are dropped.
ontology_mapping_logger = logging.getLogger(__name__)
ontology_mapping_logger.setLevel(logging.INFO)
# Stream handler that formats records as "<time> - <logger name> - <level> - <message>".
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console.setFormatter(formatter)
ontology_mapping_logger.addHandler(console)

# Records are handled only by the handler attached above and are not passed on
# to ancestor loggers.
ontology_mapping_logger.propagate = False

# System prompt for the ontology mapping agent. It describes the target
# ontologies, the search strategy, and the fields expected for each mapping.
# The no-match instruction names `ontology_id` so that it agrees with the
# "Output Format" section below (the prompt defines no `cl_id` field).
ONTOLOGY_MAPPING_SYSTEM_PROMPT = """
You are an expert bioinformatics specialist and ontology curator focused on mapping biological terms to standardized ontologies.

Your task is to map atomic functions, cellular components, and other biological terms to appropriate terms in multiple ontologies including:

- **GO (Gene Ontology)**: For molecular functions, biological processes, and cellular components
- **CL (Cell Ontology)**: For cell types and cellular structures
- **UBERON**: For anatomical structures and tissues
- **ChEBI**: For chemical compounds and molecular entities

**For each input term, you must**:
1. **Search multiple ontologies** using the available search tools
2. **Select the best match** based on semantic similarity and biological accuracy
3. **Assign confidence scores** (0.0-1.0) based on match quality
4. **Document the mapping method** used (e.g., "exact_match", "partial_match", "synonym_match")

**Guidelines for high-quality mappings**:
- Prioritize exact matches over partial matches
- Use the most specific term available (child terms over parent terms when appropriate)
- Consider biological context when multiple matches are available
- For compound terms, try searching individual components if no direct match is found
- Convert plurals to singular before searching
- Try alternative phrasings (e.g., "X of Y" vs "Y X")

**Search Strategy**:
- Always start with the most relevant ontology for the term type
- Try multiple search strategies if the initial search fails:
- Search for synonyms or related terms
- Break down compound terms into components
- Try different word orders and phrasings
- If no suitable match is found in any ontology, set ontology_id to "NO MATCH found"

**Output Format**:
For each input term, create a mapping object with:
- `original_term`: The input term exactly as provided
- `ontology_id`: The matched ontology term ID (e.g., "GO:0008150", "CL:0000000")
- `ontology_label`: The official label from the ontology
- `ontology_source`: The ontology name (GO, CL, UBERON, ChEBI)
- `confidence_score`: Float from 0.0 to 1.0
- `mapping_method`: Description of how the match was found

Available tools:
- `search_go`: Search Gene Ontology
- `search_cl`: Search Cell Ontology
- `search_uberon`: Search UBERON anatomy ontology
- `search_chebi`: Search ChEBI chemical ontology
- `search_multi_ontology`: Search multiple ontologies at once
"""

# Agent that maps input terms to ontology terms and returns a MappingResult.
ontology_mapping_agent = Agent(
# Model is pinned to a dated snapshot.
model="openai:gpt-4o-2024-11-20",
deps_type=OntologyMappingDependencies,
output_type=MappingResult,
system_prompt=ONTOLOGY_MAPPING_SYSTEM_PROMPT,
# NOTE(review): per the pydantic-ai docs this postpones resolving the named
# model (and its credential check) until the first run, so importing this
# module should not require an API key - confirm against the pinned version.
defer_model_check=True,
)

# Expose each search function to the model as a callable tool; every one of
# them takes the run context as its first argument.
ontology_mapping_agent.tool(search_go)
ontology_mapping_agent.tool(search_cl)
ontology_mapping_agent.tool(search_uberon)
ontology_mapping_agent.tool(search_chebi)
ontology_mapping_agent.tool(search_multi_ontology)
42 changes: 42 additions & 0 deletions cellsem_agent/agents/ontology_mapping/ontology_mapping_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
Configuration for the Ontology Mapping agent.
"""
from dataclasses import dataclass
import os
from typing import List, Optional
from pathlib import Path


@dataclass
class OntologyMappingDependencies:
    """
    Dependencies handed to the Ontology Mapping agent.

    Carries the working directory and the list of ontologies that atomic
    functions and cellular components are mapped against (GO, CL, UBERON
    and others as appropriate).
    """
    workdir: Optional[Path] = None
    target_ontologies: Optional[List[str]] = None

    def __post_init__(self):
        """Fill in a default for every field that was left as None."""
        if self.workdir is None:
            # Fall back to $WORKDIR (or ./workdir when unset) and make sure
            # that directory exists on disk.
            self.workdir = Path(os.environ.get("WORKDIR", "./workdir"))
            self.workdir.mkdir(parents=True, exist_ok=True)

        if self.target_ontologies is None:
            # Default set for functional and anatomical mapping:
            #   GO     - Gene Ontology (molecular functions, biological
            #            processes, cellular components)
            #   CL     - Cell Ontology
            #   UBERON - Uber-anatomy ontology
            #   CHEBI  - Chemical Entities of Biological Interest
            #   PR     - Protein Ontology
            self.target_ontologies = ["GO", "CL", "UBERON", "CHEBI", "PR"]


def get_config(target_ontologies: Optional[List[str]] = None) -> OntologyMappingDependencies:
    """
    Build the Ontology Mapping dependencies object.

    Args:
        target_ontologies: Ontology prefixes to map against; when None the
            dependencies object applies its own defaults.

    Returns:
        A freshly constructed OntologyMappingDependencies.
    """
    dependencies = OntologyMappingDependencies(target_ontologies=target_ontologies)
    return dependencies
136 changes: 136 additions & 0 deletions cellsem_agent/agents/ontology_mapping/ontology_mapping_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""
Tools for the Ontology Mapping agent.
"""
import os
import logging
from typing import List, Tuple, Dict, Any

from oaklib import get_adapter
from pydantic_ai import RunContext

logger = logging.getLogger(__name__)


def search_go(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
    """
    Look a term up in the Gene Ontology through the "ols:go" adapter.

    Args:
        ctx: The run context (not used by the lookup itself).
        term: The text to search for.

    Returns:
        A list of (GO ID, label) tuples; an empty list if the lookup raised.
    """
    try:
        go_adapter = get_adapter("ols:go")
        matches = list(go_adapter.labels(go_adapter.basic_search(term)))
        logger.info(f"GO search for '{term}' returned {len(matches)} results")
    except Exception as e:
        # Best effort: report the failure and hand back no hits.
        logger.error(f"Error searching GO for term '{term}': {e}")
        return []
    return matches


def search_cl(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
    """
    Look a term up in the Cell Ontology through the "ols:cl" adapter.

    Args:
        ctx: The run context (not used by the lookup itself).
        term: The text to search for.

    Returns:
        A list of (CL ID, label) tuples; an empty list if the lookup raised.
    """
    try:
        cl_adapter = get_adapter("ols:cl")
        matches = list(cl_adapter.labels(cl_adapter.basic_search(term)))
        logger.info(f"CL search for '{term}' returned {len(matches)} results")
    except Exception as e:
        # Best effort: report the failure and hand back no hits.
        logger.error(f"Error searching CL for term '{term}': {e}")
        return []
    return matches


def search_uberon(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
    """
    Look a term up in the UBERON anatomy ontology through the "ols:uberon" adapter.

    Args:
        ctx: The run context (not used by the lookup itself).
        term: The text to search for.

    Returns:
        A list of (UBERON ID, label) tuples; an empty list if the lookup raised.
    """
    try:
        uberon_adapter = get_adapter("ols:uberon")
        matches = list(uberon_adapter.labels(uberon_adapter.basic_search(term)))
        logger.info(f"UBERON search for '{term}' returned {len(matches)} results")
    except Exception as e:
        # Best effort: report the failure and hand back no hits.
        logger.error(f"Error searching UBERON for term '{term}': {e}")
        return []
    return matches


def search_chebi(ctx: RunContext[str], term: str) -> List[Tuple[str, str]]:
    """
    Look a term up in the ChEBI chemical ontology through the "ols:chebi" adapter.

    Args:
        ctx: The run context (not used by the lookup itself).
        term: The text to search for.

    Returns:
        A list of (ChEBI ID, label) tuples; an empty list if the lookup raised.
    """
    try:
        chebi_adapter = get_adapter("ols:chebi")
        matches = list(chebi_adapter.labels(chebi_adapter.basic_search(term)))
        logger.info(f"ChEBI search for '{term}' returned {len(matches)} results")
    except Exception as e:
        # Best effort: report the failure and hand back no hits.
        logger.error(f"Error searching ChEBI for term '{term}': {e}")
        return []
    return matches


def search_multi_ontology(ctx: RunContext[str], term: str, ontologies: List[str] = None) -> Dict[str, List[Tuple[str, str]]]:
    """
    Search multiple ontologies for a term.

    Ontology prefixes are matched case-insensitively and with surrounding
    whitespace ignored, so "ChEBI", "chebi" and "CHEBI" all select the ChEBI
    search.

    Args:
        ctx: The run context, passed through to each per-ontology search.
        term: The term to search for.
        ontologies: List of ontology prefixes to search (e.g., ['GO', 'CL', 'UBERON']).
            Defaults to GO, CL, UBERON and CHEBI when None.

    Returns:
        A dictionary mapping each requested ontology name, exactly as supplied
        by the caller, to a list of (ID, label) tuples. Unsupported ontologies
        and searches that raise map to an empty list.
    """
    if ontologies is None:
        ontologies = ['GO', 'CL', 'UBERON', 'CHEBI']

    search_functions = {
        'GO': search_go,
        'CL': search_cl,
        'UBERON': search_uberon,
        'CHEBI': search_chebi
    }

    results = {}
    for ont in ontologies:
        # Normalise only for the lookup; the result stays keyed by the
        # caller's own spelling so existing callers see the keys they passed.
        lookup_key = ont.strip().upper() if isinstance(ont, str) else ont
        search = search_functions.get(lookup_key)
        if search is None:
            logger.warning(f"Ontology {ont} not supported")
            results[ont] = []
            continue
        try:
            results[ont] = search(ctx, term)
        except Exception as e:
            # Best effort: one failing ontology must not abort the others.
            logger.error(f"Error searching {ont} for term '{term}': {e}")
            results[ont] = []

    return results
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from cellsem_agent.utils.openai.simple_response_wrapper import SimpleResponder
from dotenv import load_dotenv

# Runs at import time, before decompose() can be called.
# NOTE(review): presumably this pulls API credentials from a local .env file
# into the environment for SimpleResponder - confirm.
load_dotenv()

def decompose(genelist_annotation):
    """
    Ask the model to break each gene program into atomic processes and components.

    The deepsearch results JSON schema is read from disk and embedded in the
    prompt together with the supplied document. The response status and
    elapsed time are printed to stdout.

    Args:
        genelist_annotation: The JSON document describing the gene programs;
            it is interpolated verbatim into the prompt.

    Returns:
        The model's output text, or an empty string when none was returned.
    """
    # NOTE(review): this path is relative to the current working directory, so
    # the call only works when run from the repository root - confirm intent.
    # The encoding is explicit so the read does not depend on the platform locale.
    with open('./cellsem_agent/services/gene_list_contextual_deepsearch/schema/deepsearch_results_schema.json',
              "r", encoding="utf-8") as f:
        schema = f.read()
    sr = SimpleResponder(timeout=45)
    prompt = f"""The following JSON document details a set of gene programs. For each program,
Use the contents of the program_name and description fields to break the program down into atomic biological
processes and cell component, adding these to the JSON document in a manner compliant with the schema provided.
JSON doc:

```JSON
{genelist_annotation}
```
JSON schema:

```JSON
{schema}
```
"""
    # NOTE(review): 'gpt5' may not be a valid model id unless SimpleResponder
    # maps it, and a 500-token cap looks small for returning a whole augmented
    # JSON document - verify both against SimpleResponder and real outputs.
    res = sr.ask(
        model='gpt5',
        prompt=prompt,
        instructions="""You understand how to break down the meaning of the language of biology into its component parts.
You can fluently and accurately read JSON schema and write compliant JSON""",
        temperature=0.3,
        max_output_tokens=500,
    )
    print(res.status, res.elapsed_sec, "s")
    return res.output_text or ""
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"genes": [
"ABCA3","NAPSA","CTSH","SFTPB","SFTPC","ABCA3","NAPSA","CTSH","SFTPB","SFTPC"
],
"context": "enriched gene list for cluster of cells in scRNAseq data from lung",
"description": ""
}
Loading
Loading