diff --git a/README.md b/README.md index 42cd6f4..ed47806 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ audit logs, and generates internal/applicant reports. - extraction - evaluation - decision - - (optional) human review - report generation - notification - Persist results in SQLite (`applications`, `audit_log`) @@ -27,7 +26,7 @@ Decision routing: - `PASS` -> `report` -> `notify` - `FAIL` -> `report` -> `notify` -- `REVIEW` -> `human_review` -> `report` -> `notify` +- `REVIEW` -> `report` -> `notify` --- @@ -62,10 +61,6 @@ Settings are loaded from environment variables and `.env` (if present). | `MAX_UPLOAD_SIZE_BYTES` | `10485760` | Max upload size (10 MB) | | `OLLAMA_BASE_URL` | `http://localhost:11434` | Local Ollama base URL | | `EXTRACTION_MODEL` | `smollm:360m` | Extraction model label | -| `EVALUATION_MODEL` | `gemma3:1b-it-q4_K_M` | Evaluation model label | -| `DECISION_MODEL` | `phi4-mini:3.8b-q4_K_M` | Decision model label | -| `REPORT_MODEL` | `gemma3:1b-it-q4_K_M` | Report model label | -| `NOTIFICATION_MODEL` | `smollm:360m` | Notification model label | | `RESEND_API_KEY` | empty | Resend API key (optional) | | `RESEND_FROM_EMAIL` | `noreply@example.com` | Sender email | | `RETRY_ATTEMPTS` | `2` | Retries per workflow node | diff --git a/app/agents/extraction_agent.py b/app/agents/extraction_agent.py index 6a04362..89e64c9 100644 --- a/app/agents/extraction_agent.py +++ b/app/agents/extraction_agent.py @@ -10,6 +10,7 @@ from app.config import get_settings from app.database import update_application +from app.agents.personas import EXTRACTION_PERSONA, build_structured_prompt from app.observability import traced from app.state import ApplicationState from app.tools.ollama import generate_json_response @@ -21,6 +22,36 @@ MAX_INPUT_CHARS = 32000 +def _strip_markdown_json_fences(text: str) -> str: + """Strip wrapping markdown JSON fences from model output. + + Args: + text: Raw model output that may include fenced JSON. + + Returns: + The unfenced JSON string when standard markdown fences are present, + otherwise the original trimmed text. + + Example: + _strip_markdown_json_fences("```json\\n{\\"name\\": \\"A\\"}\\n```") + '{"name": "A"}' + """ + stripped = text.strip() + if not stripped.startswith("```"): + return stripped + + lines = stripped.splitlines() + if not lines: + return stripped + if lines[-1].strip() != "```": + return stripped + + body = lines[1:-1] + if lines[0].strip().lower() in {"```json", "```"}: + return "\n".join(body).strip() + return stripped + + def _read_input(file_path: str) -> str: extension = Path(file_path).suffix.lower() if extension == ".pdf": @@ -32,15 +63,32 @@ def _read_input(file_path: str) -> str: raise ValueError("Unsupported file type. Use PDF, TXT, MD, or JSON") -def _build_prompt(raw_text: str, correction_error: str | None = None) -> str: - instruction = ( - "Extract applicant details as strict JSON with keys: " - "name, email, phone, skills (array), experience, education. " - "Return JSON only and include all keys." +def _build_extraction_prompt(raw_text: str, correction_error: str | None = None) -> str: + task = ( + "Extract applicant details from the provided text into the exact structured JSON schema." ) if correction_error: - instruction = f"{instruction}\nPrevious response failed validation: {correction_error}" - return f"{instruction}\n\nApplication:\n{raw_text[:MAX_INPUT_CHARS]}" + task = f"{task}\nPrevious response failed validation: {correction_error}" + context = f"document_text:\n{raw_text[:MAX_INPUT_CHARS]}" + output = ( + "Return JSON only with exactly these keys:\n" + '{\n' + ' "name": string or null,\n' + ' "email": string or null,\n' + ' "phone": string or null,\n' + ' "website": string or null,\n' + ' "skills": [string, ...],\n' + ' "experience": [{"title": string or null, "company": string or null, "duration": string or null}, ...],\n' + ' "education": [{"degree": string or null, "institution": string or null, "year": string or null}, ...],\n' + ' "other_details": [string, ...]\n' + '}' + ) + return build_structured_prompt( + persona=EXTRACTION_PERSONA, + task=task, + context=context, + output=output, + ) def _extract_with_retry( @@ -52,12 +100,13 @@ def _extract_with_retry( response_text = generate_json_response( base_url=base_url, model=model, - prompt=_build_prompt(raw_text, correction_error=error), + prompt=_build_extraction_prompt(raw_text, correction_error=error), temperature=0.0, + top_p=0.1, timeout_seconds=timeout_seconds, ) try: - payload = json.loads(response_text) + payload = json.loads(_strip_markdown_json_fences(response_text)) validated = CandidateExtraction.model_validate(payload) return validated.model_dump() except (json.JSONDecodeError, ValidationError) as exc: diff --git a/app/agents/personas.py b/app/agents/personas.py new file mode 100644 index 0000000..989c03d --- /dev/null +++ b/app/agents/personas.py @@ -0,0 +1,70 @@ +"""Persona specifications and prompt helpers for extraction agent.""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class PersonaSpec: + """Structured persona contract used to build system prompts.""" + + role_identity: str + scope_boundaries: tuple[str, ...] + hard_constraints: tuple[str, ...] + output_contract: tuple[str, ...] + + def system_section(self) -> str: + """Render persona details as a structured system section.""" + boundaries = "\n".join(f"- {item}" for item in self.scope_boundaries) + constraints = "\n".join(f"- {item}" for item in self.hard_constraints) + outputs = "\n".join(f"- {item}" for item in self.output_contract) + return ( + f"ROLE IDENTITY:\n{self.role_identity}\n\n" + f"SCOPE BOUNDARIES:\n{boundaries}\n\n" + f"HARD CONSTRAINTS:\n{constraints}\n\n" + f"OUTPUT CONTRACT:\n{outputs}" + ) + + +GLOBAL_GUARDRAILS: tuple[str, ...] = ( + "no hallucinated fields", + "no secret/API key leakage", + "no overwriting other agents' owned state", +) + + +EXTRACTION_PERSONA = PersonaSpec( + role_identity=( + "You are the Extraction Agent. You convert raw applicant documents into structured JSON." + ), + scope_boundaries=( + "Only extract applicant facts from the provided document text.", + "Do not score, decide, report, or notify.", + "Return only extraction-owned data fields.", + ), + hard_constraints=( + *GLOBAL_GUARDRAILS, + "Use null for unknown scalar fields and [] for unknown list fields.", + ), + output_contract=( + "Return one valid JSON object matching the extraction schema exactly.", + "No markdown fences, preambles, or explanations.", + ), +) + + +def build_structured_prompt( + *, persona: PersonaSpec, task: str, context: str, output: str +) -> str: + """Build the standard prompt layout with explicit sections.""" + return ( + "SYSTEM SECTION:\n" + f"{persona.system_section()}\n\n" + "TASK SECTION:\n" + f"{task}\n\n" + "CONTEXT SECTION:\n" + f"{context}\n\n" + "OUTPUT SECTION:\n" + f"{output}" + ) diff --git a/app/tools/ollama.py b/app/tools/ollama.py index 30b14c6..ca37582 100644 --- a/app/tools/ollama.py +++ b/app/tools/ollama.py @@ -17,6 +17,8 @@ def generate_json_response( model: str, prompt: str, temperature: float = 0.0, + top_p: float = 0.1, + stop: list[str] | None = None, timeout_seconds: float = 30.0, ) -> str: """Generate a JSON completion response from an Ollama model. @@ -26,6 +28,8 @@ def generate_json_response( model: Model name to query. prompt: Prompt text sent to the model. temperature: Sampling temperature for generation. + top_p: Nucleus sampling parameter. + stop: Optional stop tokens. timeout_seconds: HTTP timeout in seconds. Returns: @@ -43,12 +47,16 @@ def generate_json_response( ) """ endpoint = f"{base_url.rstrip('/')}/api/generate" + options: dict[str, object] = {"temperature": temperature, "top_p": top_p} + if stop: + options["stop"] = stop + payload = { "model": model, "prompt": prompt, "stream": False, "format": "json", - "options": {"temperature": temperature}, + "options": options, } try: diff --git a/app/tools/parse_pdf.py b/app/tools/parse_pdf.py index e0376aa..ca5aafa 100644 --- a/app/tools/parse_pdf.py +++ b/app/tools/parse_pdf.py @@ -4,8 +4,8 @@ from pathlib import Path -import fitz from langchain.tools import tool +import pymupdf4llm @tool @@ -29,8 +29,7 @@ def parse_pdf_tool(path: str) -> str: if not file_path.exists(): raise FileNotFoundError(f"File does not exist: {path}") - with fitz.open(path) as document: - text = "\n".join(page.get_text("text") for page in document) + text = str(pymupdf4llm.to_markdown(path)) cleaned = text.strip() if not cleaned: diff --git a/app/tools/validate_extraction.py b/app/tools/validate_extraction.py index 5bf6156..4a203f2 100644 --- a/app/tools/validate_extraction.py +++ b/app/tools/validate_extraction.py @@ -2,15 +2,39 @@ from __future__ import annotations -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field + + +class ExperienceEntry(BaseModel): + """Structured experience entry extracted from a candidate profile.""" + + model_config = ConfigDict(extra="forbid") + + title: str | None = Field(default=None) + company: str | None = Field(default=None) + duration: str | None = Field(default=None) + + +class EducationEntry(BaseModel): + """Structured education entry extracted from a candidate profile.""" + + model_config = ConfigDict(extra="forbid") + + degree: str | None = Field(default=None) + institution: str | None = Field(default=None) + year: str | None = Field(default=None) class CandidateExtraction(BaseModel): """Structured extraction output from extraction agent.""" + model_config = ConfigDict(extra="forbid") + name: str | None = Field(default=None) email: str | None = Field(default=None) phone: str | None = Field(default=None) + website: str | None = Field(default=None) skills: list[str] = Field(default_factory=list) - experience: str | None = Field(default=None) - education: str | None = Field(default=None) + experience: list[ExperienceEntry] = Field(default_factory=list) + education: list[EducationEntry] = Field(default_factory=list) + other_details: list[str] = Field(default_factory=list) diff --git a/pyproject.toml b/pyproject.toml index 5349c60..969df35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "python-dotenv>=1.0.1", "httpx>=0.27.0", "pymupdf>=1.24.0", + "pymupdf4llm>=1.27.2.1", ] [project.optional-dependencies] diff --git a/tests/test_extraction_agent.py b/tests/test_extraction_agent.py index de6323f..aa1c124 100644 --- a/tests/test_extraction_agent.py +++ b/tests/test_extraction_agent.py @@ -29,9 +29,15 @@ def test_extraction_success(monkeypatch: pytest.MonkeyPatch, base_state: dict[st "name": "Jane Doe", "email": "jane@example.com", "phone": "+1-123-456-7890", + "website": None, "skills": ["Python", "SQL"], - "experience": "3 years", - "education": "BSc Computer Science", + "experience": [ + {"title": "Backend Engineer", "company": "Tech Corp", "duration": "3 years"} + ], + "education": [ + {"degree": "BSc Computer Science", "institution": "State U", "year": "2020"} + ], + "other_details": ["AWS Certified"], } ), ) @@ -53,9 +59,11 @@ def test_extraction_retry_once(monkeypatch: pytest.MonkeyPatch, base_state: dict "name": "Jane Doe", "email": "jane@example.com", "phone": None, + "website": None, "skills": ["Python"], - "experience": "3 years", - "education": "BSc", + "experience": [{"title": "Engineer", "company": None, "duration": "3 years"}], + "education": [], + "other_details": [], } ), ]) @@ -115,5 +123,92 @@ def test_missing_optional_fields_default_to_null( extracted = result["extracted_json"] assert extracted["email"] is None assert extracted["phone"] is None - assert extracted["experience"] is None - assert extracted["education"] is None + assert extracted["website"] is None + assert extracted["experience"] == [] + assert extracted["education"] == [] + assert extracted["other_details"] == [] + + +def test_extraction_accepts_fenced_json( + monkeypatch: pytest.MonkeyPatch, base_state: dict[str, object] +) -> None: + monkeypatch.setattr("app.agents.extraction_agent.update_application", lambda *_args, **_kwargs: None) + call_count = {"count": 0} + valid_payload = { + "name": "Jane Doe", + "email": "jane@example.com", + "phone": None, + "website": None, + "skills": ["Python"], + "experience": [{"title": "Engineer", "company": None, "duration": "3 years"}], + "education": [], + "other_details": [], + } + + def mock_generate_fenced(**kwargs: object) -> str: + call_count["count"] += 1 + return f"```json\n{json.dumps(valid_payload)}\n```" + + monkeypatch.setattr("app.agents.extraction_agent.generate_json_response", mock_generate_fenced) + + result = extraction_agent(base_state) + + assert result["status"] == "extracted" + assert result["extracted_json"]["email"] == "jane@example.com" + assert call_count["count"] == 1 + + +def test_extraction_retries_when_extra_fields_present( + monkeypatch: pytest.MonkeyPatch, base_state: dict[str, object] +) -> None: + monkeypatch.setattr("app.agents.extraction_agent.update_application", lambda *_args, **_kwargs: None) + call_count = {"count": 0} + responses = iter([ + json.dumps( + { + "name": "Jane Doe", + "email": "jane@example.com", + "phone": None, + "website": None, + "skills": ["Python"], + "experience": [ + { + "title": "Engineer", + "company": "Acme", + "duration": "3 years", + "extra_nested_key": "unexpected", + } + ], + "education": [], + "other_details": [], + "extra_key": "unexpected", + } + ), + json.dumps( + { + "name": "Jane Doe", + "email": "jane@example.com", + "phone": None, + "website": None, + "skills": ["Python"], + "experience": [{"title": "Engineer", "company": "Acme", "duration": "3 years"}], + "education": [], + "other_details": [], + } + ), + ]) + + def mock_generate_with_retry(**kwargs: object) -> str: + call_count["count"] += 1 + return next(responses) + + monkeypatch.setattr( + "app.agents.extraction_agent.generate_json_response", + mock_generate_with_retry, + ) + + result = extraction_agent(base_state) + + assert result["status"] == "extracted" + assert result["extracted_json"]["name"] == "Jane Doe" + assert call_count["count"] == 2 diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 3d9053e..76a0317 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -17,9 +17,11 @@ def test_workflow_runs_with_stubbed_agents(monkeypatch, tmp_path: Path) -> None: "name": "Test Candidate", "email": "test@example.com", "phone": None, + "website": None, "skills": ["Python"], - "experience": "2 years", - "education": "BSc", + "experience": [{"title": "Engineer", "company": "Acme", "duration": "2 years"}], + "education": [{"degree": "BSc", "institution": "Uni", "year": "2021"}], + "other_details": [], } ), )