Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
from src.agents.engine_agent import RuleEngineAgent
from src.agents.factory import get_agent
from src.agents.feasibility_agent import RuleFeasibilityAgent
from src.agents.extractor_agent import RuleExtractorAgent
from src.agents.repository_analysis_agent import RepositoryAnalysisAgent

__all__ = [
"BaseAgent",
"AgentResult",
"RuleFeasibilityAgent",
"RuleEngineAgent",
"RuleExtractorAgent",
"AcknowledgmentAgent",
"RepositoryAnalysisAgent",
"get_agent",
Expand Down
7 changes: 7 additions & 0 deletions src/agents/extractor_agent/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""
Rule Extractor Agent: LLM-powered extraction of rule-like statements from markdown.
"""

from src.agents.extractor_agent.agent import RuleExtractorAgent

__all__ = ["RuleExtractorAgent"]
113 changes: 113 additions & 0 deletions src/agents/extractor_agent/agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
Rule Extractor Agent: LLM-powered extraction of rule-like statements from markdown.
"""

import logging
import time
from typing import Any

from langgraph.graph import END, START, StateGraph
from pydantic import BaseModel, Field

from src.agents.base import AgentResult, BaseAgent
from src.agents.extractor_agent.models import ExtractorOutput
from src.agents.extractor_agent.prompts import EXTRACTOR_PROMPT

logger = logging.getLogger(__name__)


class ExtractorState(BaseModel):
    """State for the extractor (single-node) graph.

    Carried through the LangGraph workflow: the caller seeds
    ``markdown_content``; the extract node fills ``statements``.
    """

    # Raw markdown document to extract rule-like statements from.
    markdown_content: str = ""
    # Rule statements produced by the extract node (empty until it runs).
    statements: list[str] = Field(default_factory=list)


class RuleExtractorAgent(BaseAgent):
    """
    Extractor Agent: reads raw markdown and returns a structured list of rule-like statements.

    Implemented as a single-node LangGraph (START -> extract -> END) whose node
    asks the LLM for structured output (``ExtractorOutput``).
    """

    def __init__(self, max_retries: int = 3, timeout: float = 30.0):
        """Set up the agent.

        Args:
            max_retries: Retry budget forwarded to the ``BaseAgent`` machinery.
            timeout: Seconds allowed for one graph invocation before aborting.
        """
        super().__init__(max_retries=max_retries, agent_name="extractor_agent")
        self.timeout = timeout
        logger.info("🔧 RuleExtractorAgent initialized with max_retries=%s, timeout=%ss", max_retries, timeout)

    def _build_graph(self):
        """Compile the single-node workflow: run LLM extraction, store statements."""
        builder = StateGraph(ExtractorState)

        async def extract_node(state: ExtractorState) -> dict:
            # Blank documents short-circuit: no LLM call needed.
            text = (state.markdown_content or "").strip()
            if not text:
                return {"statements": []}
            structured = self.llm.with_structured_output(ExtractorOutput)
            llm_result = await structured.ainvoke(EXTRACTOR_PROMPT.format(markdown_content=text))
            return {"statements": llm_result.statements}

        builder.add_node("extract", extract_node)
        builder.add_edge(START, "extract")
        builder.add_edge("extract", END)
        return builder.compile()

    async def execute(self, **kwargs: Any) -> AgentResult:
        """Extract rule statements from markdown.

        Keyword Args:
            markdown_content: The markdown text to process (``content`` is
                accepted as a fallback key; non-string values are coerced).

        Returns:
            AgentResult whose ``data["statements"]`` holds the extracted list.
            Timeouts and other failures yield ``success=False`` with an
            ``error_type`` entry in metadata rather than raising.
        """
        raw = kwargs.get("markdown_content") or kwargs.get("content") or ""
        markdown_content = raw if isinstance(raw, str) else str(raw or "")

        started = time.time()

        # Empty input is a successful no-op, not an error.
        if not markdown_content.strip():
            return AgentResult(
                success=True,
                message="Empty content",
                data={"statements": []},
                metadata={"execution_time_ms": 0},
            )

        try:
            logger.info("🚀 Extractor agent processing markdown (%s chars)", len(markdown_content))
            graph_result = await self._execute_with_timeout(
                self.graph.ainvoke(ExtractorState(markdown_content=markdown_content)),
                timeout=self.timeout,
            )
            # LangGraph may hand back either a plain dict or a state object.
            if isinstance(graph_result, dict):
                statements = graph_result.get("statements", [])
            else:
                statements = getattr(graph_result, "statements", [])
            elapsed = time.time() - started
            logger.info(
                "✅ Extractor agent completed in %.2fs; extracted %s statements",
                elapsed,
                len(statements),
            )
            return AgentResult(
                success=True,
                message="OK",
                data={"statements": statements},
                metadata={"execution_time_ms": elapsed * 1000},
            )
        except TimeoutError:
            elapsed = time.time() - started
            logger.error("❌ Extractor agent timed out after %.2fs", elapsed)
            return AgentResult(
                success=False,
                message=f"Extractor timed out after {self.timeout}s",
                data={"statements": []},
                metadata={"execution_time_ms": elapsed * 1000, "error_type": "timeout"},
            )
        except Exception as e:
            elapsed = time.time() - started
            logger.exception("❌ Extractor agent failed: %s", e)
            return AgentResult(
                success=False,
                message=str(e),
                data={"statements": []},
                metadata={"execution_time_ms": elapsed * 1000, "error_type": type(e).__name__},
            )
14 changes: 14 additions & 0 deletions src/agents/extractor_agent/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Data models for the Rule Extractor Agent.
"""

from pydantic import BaseModel, Field


class ExtractorOutput(BaseModel):
    """Schema the LLM must populate when extracting rules from markdown.

    The field's ``description`` is exposed to the model through structured
    output, so it doubles as an extraction instruction.
    """

    # Defaults to an empty list so documents with no rules validate cleanly.
    statements: list[str] = Field(
        default_factory=list,
        description="List of distinct rule-like statements extracted from the document. Each item is a single, clear sentence or phrase describing one rule or guideline.",
    )
Comment thread
coderabbitai[bot] marked this conversation as resolved.
23 changes: 23 additions & 0 deletions src/agents/extractor_agent/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
Prompt template for the Rule Extractor Agent.
"""

# Prompt sent to the LLM by the extractor agent. The ``{markdown_content}``
# placeholder is filled via str.format() before invocation — any literal
# braces added to this template must therefore be doubled (``{{``/``}}``).
EXTRACTOR_PROMPT = """
You are an expert at reading AI assistant guidelines and coding standards (e.g. Cursor rules, Claude instructions, Copilot guidelines, .cursorrules, repo rules).

Your task: read the following markdown document and extract every distinct **rule-like statement** or guideline. Treat the document holistically: rules may appear as:
- Bullet points or numbered lists
- Paragraphs or full sentences
- Section headings plus body text
- Implicit requirements (e.g. "PRs should be small" or "we use conventional commits")
- Explicit markers like "Rule:", "Instruction:", "Always", "Never", "Must", "Should"

For each rule you identify, output one clear, standalone statement (a single sentence or short phrase). Preserve the intent; normalize wording only if it helps clarity. Do not merge unrelated rules. If there are no rules or guidelines, return an empty list.

Markdown content:
---
{markdown_content}
---

Output the list of rule statements. Do not include explanations or numbering in the statements themselves.
"""
Comment thread
coderabbitai[bot] marked this conversation as resolved.
5 changes: 4 additions & 1 deletion src/agents/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from src.agents.base import BaseAgent
from src.agents.engine_agent import RuleEngineAgent
from src.agents.feasibility_agent import RuleFeasibilityAgent
from src.agents.extractor_agent import RuleExtractorAgent
from src.agents.repository_analysis_agent import RepositoryAnalysisAgent

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -43,10 +44,12 @@ def get_agent(agent_type: str, **kwargs: Any) -> BaseAgent:
return RuleEngineAgent(**kwargs)
elif agent_type == "feasibility":
return RuleFeasibilityAgent(**kwargs)
elif agent_type == "extractor":
return RuleExtractorAgent(**kwargs)
elif agent_type == "acknowledgment":
return AcknowledgmentAgent(**kwargs)
elif agent_type == "repository_analysis":
return RepositoryAnalysisAgent(**kwargs)
else:
supported = ", ".join(["engine", "feasibility", "acknowledgment", "repository_analysis"])
supported = ", ".join(["engine", "feasibility", "extractor", "acknowledgment", "repository_analysis"])
raise ValueError(f"Unsupported agent type: {agent_type}. Supported: {supported}")
Loading