diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..edeb2b7a --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,119 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Build & Development Commands + +```bash +# Install in development mode +pip install -e . + +# Install with notebook support +pip install -e '.[notebook]' + +# Install with dev dependencies (pytest) +pip install -e '.[dev]' + +# Run tests +pytest + +# Run a single test +pytest tests/test_file.py::test_function + +# Start the CLI +pantheon-cli + +# Start with specific model +pantheon-cli --model claude-sonnet-4-20250514 + +# Start without RAG +pantheon-cli --disable_rag + +# Build RAG database (requires SCRAPER_API_KEY env var) +pantheon-cli --build-rag [output_dir] [--rag-config config.yaml] +``` + +## Architecture Overview + +Pantheon-CLI is a scientific AI assistant built on a **persistent interpreter paradigm** - Python/R/Julia interpreters maintain session state across all tool calls, avoiding redundant file I/O for large datasets. 
+ +### Entry Point Flow + +``` +pantheon_cli/__init__.py::cli_main() + → cli/core.py::cli() [Fire CLI] + → cli/core.py::main() + → Agent initialization + Toolset registration + → repl/core.py::Repl.run() +``` + +### Key Components + +| Component | Path | Purpose | +|-----------|------|---------| +| **CLI Core** | `pantheon_cli/cli/core.py` | Main entry, toolset init, agent creation, DEFAULT_INSTRUCTIONS | +| **REPL** | `pantheon_cli/repl/core.py` | Interactive loop, command parsing, tool call display | +| **REPL UI** | `pantheon_cli/repl/ui.py` | Rich console formatting | +| **API Key Manager** | `pantheon_cli/cli/manager/api_key_manager.py` | Multi-provider API key handling | +| **Model Manager** | `pantheon_cli/cli/manager/model_manager.py` | Model switching | +| **Config Loader** | `pantheon_cli/utils/config_loader.py` | Loads PANTHEON.md / PANTHEON.local.md | +| **Bio Handler** | `pantheon_cli/repl/bio_handler.py` | `/bio` command processing | +| **Bio Prompts** | `pantheon_cli/cli/prompt/` | 13+ specialized analysis workflows (ATAC, RNA-seq, spatial, etc.) | +| **Dev Loop Mode** | `pantheon_cli/cli/modes/dev_loop.py` | Autonomous plan → code → review workflow | + +### Dependencies + +- `pantheon-agents`: Agent framework for reasoning/planning +- `pantheon-toolsets`: Toolset implementations (Python/R/Julia interpreters, file ops, web, bio tools) + +### REPL Command Prefixes + +- `!command` → Shell execution +- `%python_code` → Direct Python execution (stateful) +- `>r_code` → Direct R execution (stateful) +- `]julia_code` → Direct Julia execution (stateful) +- `/api-key` → Configure API keys +- `/bio <tool>` → Bio analysis tools +- `/clear`, `/exit` → Session management + +## Configuration + +### API Key Priority + +1. Environment variables (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) +2. Local config: `.pantheon_config.json` (current directory) +3. 
Global config: `~/.pantheon/config.json` + +### Project Config Files + +- `PANTHEON.md` - Project-wide config (safe to commit) +- `PANTHEON.local.md` - Personal config (add to .gitignore) + +These are auto-discovered in the current and parent directories and appended to agent instructions. + +## Dev Loop Mode + +Autonomous plan → code → review cycle: + +```bash +pantheon-cli --mode devloop --dev_goal "your goal" --max_iters 10 +``` + +Dev loop mode uses a different model for each phase (plan: gpt-5, code: gpt-4.1, review: gpt-5). + +## Toolsets + +Core toolsets are enabled by default: Shell, Python, R, Julia, File Editor, Code Search, Code Validation, Todo, Generator, Bio Tools. + +Optional toolsets can be disabled via flags: `--disable_rag`, `--disable_web`, `--disable_dr`, `--disable_notebook`, `--disable_r`, `--disable_julia`, `--disable_bio`, etc. + +## Testing + +```bash +# Pytest with async support +pytest + +# Test config in pyproject.toml uses: +# asyncio_mode = "auto" +# asyncio_default_fixture_loop_scope = "function" +``` diff --git a/pantheon_cli/repl/bio_handler.py b/pantheon_cli/repl/bio_handler.py index 89dfd878..1fa56887 100644 --- a/pantheon_cli/repl/bio_handler.py +++ b/pantheon_cli/repl/bio_handler.py @@ -56,6 +56,8 @@ def _show_bio_help(self): self.console.print("[dim] /bio dock run_dock # Interactive molecular docking workflow[/dim]") self.console.print("[dim] /bio dock run ./data # Run batch molecular docking on folder[/dim]") self.console.print("[dim] /bio GeneAgent TP53,BRCA1,EGFR # Gene set analysis with AI[/dim]") + self.console.print("[dim] /bio fm list # List foundation models (scGPT, Geneformer, UCE)[/dim]") + self.console.print("[dim] /bio fm embed # Generate embeddings with auto-selected model[/dim]") self.console.print("") def _handle_bio_manager_command(self, parts) -> str: @@ -115,6 +117,10 @@ def _handle_tool_specific_command(self, parts) -> str: if tool_name == "spatial": return self._handle_spatial_command(parts) + # Handle foundation model commands + if 
tool_name == "fm": + return self._handle_fm_command(parts) + # Generic handler for other tools if len(parts) > 2: method_name = parts[2] @@ -1082,7 +1088,168 @@ def _handle_hic_command(self, parts) -> str: return f"bio_hic_{command} {params}" else: return f"bio_hic_{command}" - + + def _handle_fm_command(self, parts) -> str: + """Handle foundation model commands (/bio fm ...)""" + + if len(parts) == 2: + # Just /bio fm - show FM help + self.console.print("\n[bold]🧬 Single-Cell Foundation Models[/bold]") + self.console.print("[dim]Unified API for scGPT, Geneformer, UCE, and more[/dim]\n") + self.console.print("[bold cyan]Discovery & Selection[/bold cyan]") + self.console.print("[dim]/bio fm list[/dim] - List available foundation models") + self.console.print("[dim]/bio fm describe [/dim] - Get model I/O contract and requirements") + self.console.print("[dim]/bio fm profile [/dim] - Profile data (species, gene scheme, modality)") + self.console.print("[dim]/bio fm select [/dim] - Auto-select best model for task") + self.console.print("\n[bold cyan]Validation & Execution[/bold cyan]") + self.console.print("[dim]/bio fm validate [/dim] - Check compatibility") + self.console.print("[dim]/bio fm embed [--model X][/dim] - Generate embeddings") + self.console.print("[dim]/bio fm annotate [--model X][/dim] - Cell type annotation") + self.console.print("[dim]/bio fm integrate [--model X][/dim] - Batch integration") + self.console.print("\n[dim]Examples:[/dim]") + self.console.print("[dim] /bio fm list # Show all models[/dim]") + self.console.print("[dim] /bio fm describe uce # UCE model details[/dim]") + self.console.print("[dim] /bio fm profile pbmc3k.h5ad # Analyze data characteristics[/dim]") + self.console.print("[dim] /bio fm select pbmc3k.h5ad embed # Find best model for embedding[/dim]") + self.console.print("[dim] /bio fm embed pbmc3k.h5ad # Auto-select model & embed[/dim]") + self.console.print("[dim] /bio fm embed pbmc3k.h5ad --model uce # Use specific 
model[/dim]") + self.console.print() + return None + + command = parts[2] + + if command == "list": + # List available models + self.console.print("\n[bold cyan]🧬 Listing Foundation Models[/bold cyan]") + return "scfm_list_models" + + elif command == "describe": + # Describe a specific model + if len(parts) < 4: + self.console.print("[red]Error: Please specify a model name[/red]") + self.console.print("[dim]Usage: /bio fm describe [/dim]") + self.console.print("[dim]Available models: scgpt, geneformer, uce, scfoundation[/dim]") + return None + + model_name = parts[3] + self.console.print(f"\n[bold cyan]🧬 Describing model: {model_name}[/bold cyan]") + return f"scfm_describe_model {model_name}" + + elif command == "profile": + # Profile data + if len(parts) < 4: + self.console.print("[red]Error: Please specify a data file path[/red]") + self.console.print("[dim]Usage: /bio fm profile [/dim]") + return None + + adata_path = parts[3] + self.console.print(f"\n[bold cyan]🧬 Profiling data: {adata_path}[/bold cyan]") + return f"scfm_profile_data {adata_path}" + + elif command == "select": + # Select best model for task + if len(parts) < 5: + self.console.print("[red]Error: Please specify data path and task[/red]") + self.console.print("[dim]Usage: /bio fm select [/dim]") + self.console.print("[dim]Tasks: embed, annotate, integrate[/dim]") + return None + + adata_path = parts[3] + task = parts[4] + self.console.print(f"\n[bold cyan]🧬 Selecting model for {task} task[/bold cyan]") + return f"scfm_select_model {adata_path} {task}" + + elif command == "validate": + # Validate data compatibility + if len(parts) < 6: + self.console.print("[red]Error: Please specify data path, model, and task[/red]") + self.console.print("[dim]Usage: /bio fm validate [/dim]") + return None + + adata_path = parts[3] + model_name = parts[4] + task = parts[5] + self.console.print(f"\n[bold cyan]🧬 Validating {adata_path} for {model_name} ({task})[/bold cyan]") + return f"scfm_preprocess_validate 
{adata_path} {model_name} {task}" + + elif command == "embed": + # Generate embeddings + if len(parts) < 4: + self.console.print("[red]Error: Please specify a data file path[/red]") + self.console.print("[dim]Usage: /bio fm embed [--model X][/dim]") + return None + + adata_path = parts[3] + model_name = None + + # Parse --model flag + if "--model" in parts: + try: + model_idx = parts.index("--model") + model_name = parts[model_idx + 1] + except (IndexError, ValueError): + pass + + self.console.print(f"\n[bold cyan]🧬 Generating embeddings for {adata_path}[/bold cyan]") + if model_name: + self.console.print(f"[dim]Using model: {model_name}[/dim]") + return f"scfm_run embed {model_name} {adata_path}" + else: + self.console.print("[dim]Auto-selecting best model...[/dim]") + # First select model, then run + return f"scfm_select_model {adata_path} embed" + + elif command == "annotate": + # Cell type annotation + if len(parts) < 4: + self.console.print("[red]Error: Please specify a data file path[/red]") + self.console.print("[dim]Usage: /bio fm annotate [--model X][/dim]") + return None + + adata_path = parts[3] + model_name = None + + if "--model" in parts: + try: + model_idx = parts.index("--model") + model_name = parts[model_idx + 1] + except (IndexError, ValueError): + pass + + self.console.print(f"\n[bold cyan]🧬 Annotating cell types in {adata_path}[/bold cyan]") + if model_name: + return f"scfm_run annotate {model_name} {adata_path}" + else: + return f"scfm_select_model {adata_path} annotate" + + elif command == "integrate": + # Batch integration + if len(parts) < 4: + self.console.print("[red]Error: Please specify a data file path[/red]") + self.console.print("[dim]Usage: /bio fm integrate [--model X][/dim]") + return None + + adata_path = parts[3] + model_name = None + + if "--model" in parts: + try: + model_idx = parts.index("--model") + model_name = parts[model_idx + 1] + except (IndexError, ValueError): + pass + + self.console.print(f"\n[bold cyan]🧬 
Integrating batches in {adata_path}[/bold cyan]") + if model_name: + return f"scfm_run integrate {model_name} {adata_path}" + else: + return f"scfm_select_model {adata_path} integrate" + + else: + self.console.print(f"[red]Unknown FM command: {command}[/red]") + self.console.print("[dim]Use /bio fm for help[/dim]") + return None + async def handle_deprecated_atac_command(self, command: str) -> str: """ Handle deprecated /atac commands with migration and auto-conversion @@ -1175,6 +1342,16 @@ def _show_atac_migration_help(self): 'chipseq_init': 'bio_chipseq_init', 'chipseq_call_peaks': 'bio_chipseq_call_peaks', 'chipseq_find_motifs': 'bio_chipseq_find_motifs', + + # Foundation model commands + 'fm_list': 'scfm_list_models', + 'fm_describe': 'scfm_describe_model', + 'fm_profile': 'scfm_profile_data', + 'fm_select': 'scfm_select_model', + 'fm_validate': 'scfm_preprocess_validate', + 'fm_embed': 'scfm_run embed', + 'fm_annotate': 'scfm_run annotate', + 'fm_integrate': 'scfm_run integrate', } # Deprecated command conversions @@ -1205,5 +1382,16 @@ def get_bio_command_suggestions() -> list: '/bio GeneAgent', '/bio GeneAgent TP53,BRCA1,EGFR', '/bio chipseq init', # Future + # Foundation model commands + '/bio fm', + '/bio fm list', + '/bio fm describe scgpt', + '/bio fm describe geneformer', + '/bio fm describe uce', + '/bio fm profile', + '/bio fm select', + '/bio fm embed', + '/bio fm annotate', + '/bio fm integrate', ] return suggestions