Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
from src.agents.engine_agent import RuleEngineAgent
from src.agents.factory import get_agent
from src.agents.feasibility_agent import RuleFeasibilityAgent
from src.agents.extractor_agent import RuleExtractorAgent
from src.agents.repository_analysis_agent import RepositoryAnalysisAgent

__all__ = [
"BaseAgent",
"AgentResult",
"RuleFeasibilityAgent",
"RuleEngineAgent",
"RuleExtractorAgent",
"AcknowledgmentAgent",
"RepositoryAnalysisAgent",
"get_agent",
Expand Down
7 changes: 7 additions & 0 deletions src/agents/extractor_agent/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""
Rule Extractor Agent: LLM-powered extraction of rule-like statements from markdown.
"""

from src.agents.extractor_agent.agent import RuleExtractorAgent

__all__ = ["RuleExtractorAgent"]
113 changes: 113 additions & 0 deletions src/agents/extractor_agent/agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Rule Extractor Agent: LLM-powered extraction of rule-like statements from markdown.
"""

import logging
import time
from typing import Any

from langgraph.graph import END, START, StateGraph
from pydantic import BaseModel, Field

from src.agents.base import AgentResult, BaseAgent
from src.agents.extractor_agent.models import ExtractorOutput
from src.agents.extractor_agent.prompts import EXTRACTOR_PROMPT

logger = logging.getLogger(__name__)


class ExtractorState(BaseModel):
    """State for the extractor (single-node) graph.

    Carried through the LangGraph workflow: the caller seeds
    ``markdown_content``; the extract node fills ``statements``.
    """

    # Raw markdown document to extract rule-like statements from.
    markdown_content: str = ""
    # Rule statements produced by the extract node (empty until it runs).
    statements: list[str] = Field(default_factory=list)


class RuleExtractorAgent(BaseAgent):
    """
    Extractor Agent: reads raw markdown and returns a structured list of rule-like statements.

    Implemented as a single-node LangGraph (START -> extract -> END) whose node
    asks the LLM for structured output (``ExtractorOutput``).
    """

    def __init__(self, max_retries: int = 3, timeout: float = 30.0):
        """Set up the agent.

        Args:
            max_retries: Retry budget forwarded to the ``BaseAgent`` machinery.
            timeout: Seconds allowed for one graph invocation before aborting.
        """
        super().__init__(max_retries=max_retries, agent_name="extractor_agent")
        self.timeout = timeout
        logger.info("🔧 RuleExtractorAgent initialized with max_retries=%s, timeout=%ss", max_retries, timeout)

    def _build_graph(self):
        """Compile the single-node workflow: run LLM extraction, store statements."""
        builder = StateGraph(ExtractorState)

        async def extract_node(state: ExtractorState) -> dict:
            # Blank documents short-circuit: no LLM call needed.
            text = (state.markdown_content or "").strip()
            if not text:
                return {"statements": []}
            structured = self.llm.with_structured_output(ExtractorOutput)
            llm_result = await structured.ainvoke(EXTRACTOR_PROMPT.format(markdown_content=text))
            return {"statements": llm_result.statements}

        builder.add_node("extract", extract_node)
        builder.add_edge(START, "extract")
        builder.add_edge("extract", END)
        return builder.compile()

    async def execute(self, **kwargs: Any) -> AgentResult:
        """Extract rule statements from markdown.

        Keyword Args:
            markdown_content: The markdown text to process (``content`` is
                accepted as a fallback key; non-string values are coerced).

        Returns:
            AgentResult whose ``data["statements"]`` holds the extracted list.
            Timeouts and other failures yield ``success=False`` with an
            ``error_type`` entry in metadata rather than raising.
        """
        raw = kwargs.get("markdown_content") or kwargs.get("content") or ""
        markdown_content = raw if isinstance(raw, str) else str(raw or "")

        started = time.time()

        # Empty input is a successful no-op, not an error.
        if not markdown_content.strip():
            return AgentResult(
                success=True,
                message="Empty content",
                data={"statements": []},
                metadata={"execution_time_ms": 0},
            )

        try:
            logger.info("🚀 Extractor agent processing markdown (%s chars)", len(markdown_content))
            graph_result = await self._execute_with_timeout(
                self.graph.ainvoke(ExtractorState(markdown_content=markdown_content)),
                timeout=self.timeout,
            )
            # LangGraph may hand back either a plain dict or a state object.
            if isinstance(graph_result, dict):
                statements = graph_result.get("statements", [])
            else:
                statements = getattr(graph_result, "statements", [])
            elapsed = time.time() - started
            logger.info(
                "✅ Extractor agent completed in %.2fs; extracted %s statements",
                elapsed,
                len(statements),
            )
            return AgentResult(
                success=True,
                message="OK",
                data={"statements": statements},
                metadata={"execution_time_ms": elapsed * 1000},
            )
        except TimeoutError:
            elapsed = time.time() - started
            logger.error("❌ Extractor agent timed out after %.2fs", elapsed)
            return AgentResult(
                success=False,
                message=f"Extractor timed out after {self.timeout}s",
                data={"statements": []},
                metadata={"execution_time_ms": elapsed * 1000, "error_type": "timeout"},
            )
        except Exception as e:
            elapsed = time.time() - started
            logger.exception("❌ Extractor agent failed: %s", e)
            return AgentResult(
                success=False,
                message=str(e),
                data={"statements": []},
                metadata={"execution_time_ms": elapsed * 1000, "error_type": type(e).__name__},
            )
14 changes: 14 additions & 0 deletions src/agents/extractor_agent/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Data models for the Rule Extractor Agent.
"""

from pydantic import BaseModel, Field


class ExtractorOutput(BaseModel):
    """Schema the LLM must populate when extracting rules from markdown.

    The field's ``description`` is exposed to the model through structured
    output, so it doubles as an extraction instruction.
    """

    # Defaults to an empty list so documents with no rules validate cleanly.
    statements: list[str] = Field(
        default_factory=list,
        description="List of distinct rule-like statements extracted from the document. Each item is a single, clear sentence or phrase describing one rule or guideline.",
    )
Comment thread
coderabbitai[bot] marked this conversation as resolved.
23 changes: 23 additions & 0 deletions src/agents/extractor_agent/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
Prompt template for the Rule Extractor Agent.
"""

# Prompt sent to the LLM by the extractor agent. The ``{markdown_content}``
# placeholder is filled via str.format() before invocation — any literal
# braces added to this template must therefore be doubled (``{{``/``}}``).
EXTRACTOR_PROMPT = """
You are an expert at reading AI assistant guidelines and coding standards (e.g. Cursor rules, Claude instructions, Copilot guidelines, .cursorrules, repo rules).

Your task: read the following markdown document and extract every distinct **rule-like statement** or guideline. Treat the document holistically: rules may appear as:
- Bullet points or numbered lists
- Paragraphs or full sentences
- Section headings plus body text
- Implicit requirements (e.g. "PRs should be small" or "we use conventional commits")
- Explicit markers like "Rule:", "Instruction:", "Always", "Never", "Must", "Should"

For each rule you identify, output one clear, standalone statement (a single sentence or short phrase). Preserve the intent; normalize wording only if it helps clarity. Do not merge unrelated rules. If there are no rules or guidelines, return an empty list.

Markdown content:
---
{markdown_content}
---

Output the list of rule statements. Do not include explanations or numbering in the statements themselves.
"""
Comment thread
coderabbitai[bot] marked this conversation as resolved.
5 changes: 4 additions & 1 deletion src/agents/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from src.agents.base import BaseAgent
from src.agents.engine_agent import RuleEngineAgent
from src.agents.feasibility_agent import RuleFeasibilityAgent
from src.agents.extractor_agent import RuleExtractorAgent
from src.agents.repository_analysis_agent import RepositoryAnalysisAgent

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -43,10 +44,12 @@ def get_agent(agent_type: str, **kwargs: Any) -> BaseAgent:
return RuleEngineAgent(**kwargs)
elif agent_type == "feasibility":
return RuleFeasibilityAgent(**kwargs)
elif agent_type == "extractor":
return RuleExtractorAgent(**kwargs)
elif agent_type == "acknowledgment":
return AcknowledgmentAgent(**kwargs)
elif agent_type == "repository_analysis":
return RepositoryAnalysisAgent(**kwargs)
else:
supported = ", ".join(["engine", "feasibility", "acknowledgment", "repository_analysis"])
supported = ", ".join(["engine", "feasibility", "extractor", "acknowledgment", "repository_analysis"])
raise ValueError(f"Unsupported agent type: {agent_type}. Supported: {supported}")
Loading