Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ audit logs, and generates internal/applicant reports.
- extraction
- evaluation
- decision
- (optional) human review
- report generation
- notification
- Persist results in SQLite (`applications`, `audit_log`)
Expand All @@ -27,7 +26,7 @@ Decision routing:

- `PASS` -> `report` -> `notify`
- `FAIL` -> `report` -> `notify`
- `REVIEW` -> `human_review` -> `report` -> `notify`
- `REVIEW` -> `report` -> `notify`

---

Expand Down Expand Up @@ -62,10 +61,6 @@ Settings are loaded from environment variables and `.env` (if present).
| `MAX_UPLOAD_SIZE_BYTES` | `10485760` | Max upload size (10 MB) |
| `OLLAMA_BASE_URL` | `http://localhost:11434` | Local Ollama base URL |
| `EXTRACTION_MODEL` | `smollm:360m` | Extraction model label |
| `EVALUATION_MODEL` | `gemma3:1b-it-q4_K_M` | Evaluation model label |
| `DECISION_MODEL` | `phi4-mini:3.8b-q4_K_M` | Decision model label |
| `REPORT_MODEL` | `gemma3:1b-it-q4_K_M` | Report model label |
| `NOTIFICATION_MODEL` | `smollm:360m` | Notification model label |
| `RESEND_API_KEY` | empty | Resend API key (optional) |
| `RESEND_FROM_EMAIL` | `noreply@example.com` | Sender email |
| `RETRY_ATTEMPTS` | `2` | Retries per workflow node |
Expand Down
67 changes: 58 additions & 9 deletions app/agents/extraction_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from app.config import get_settings
from app.database import update_application
from app.agents.personas import EXTRACTION_PERSONA, build_structured_prompt
from app.observability import traced
from app.state import ApplicationState
from app.tools.ollama import generate_json_response
Expand All @@ -21,6 +22,36 @@
MAX_INPUT_CHARS = 32000


def _strip_markdown_json_fences(text: str) -> str:
"""Strip wrapping markdown JSON fences from model output.

Args:
text: Raw model output that may include fenced JSON.

Returns:
The unfenced JSON string when standard markdown fences are present,
otherwise the original trimmed text.

Example:
_strip_markdown_json_fences("```json\\n{\\"name\\": \\"A\\"}\\n```")
'{"name": "A"}'
"""
stripped = text.strip()
if not stripped.startswith("```"):
return stripped

lines = stripped.splitlines()
if not lines:
return stripped
if lines[-1].strip() != "```":
return stripped

body = lines[1:-1]
if lines[0].strip().lower() in {"```json", "```"}:
return "\n".join(body).strip()
return stripped


def _read_input(file_path: str) -> str:
extension = Path(file_path).suffix.lower()
if extension == ".pdf":
Expand All @@ -32,15 +63,32 @@ def _read_input(file_path: str) -> str:
raise ValueError("Unsupported file type. Use PDF, TXT, MD, or JSON")


def _build_prompt(raw_text: str, correction_error: str | None = None) -> str:
instruction = (
"Extract applicant details as strict JSON with keys: "
"name, email, phone, skills (array), experience, education. "
"Return JSON only and include all keys."
def _build_extraction_prompt(raw_text: str, correction_error: str | None = None) -> str:
    """Build the persona-framed extraction prompt for the LLM.

    Args:
        raw_text: Raw document text to extract applicant details from;
            truncated to MAX_INPUT_CHARS before being sent to the model.
        correction_error: Validation error message from a previous attempt,
            fed back to the model on retries; omitted when None.

    Returns:
        The full structured prompt string with system, task, context, and
        output sections.
    """
    task = (
        "Extract applicant details from the provided text into the exact structured JSON schema."
    )
    if correction_error:
        # Feed the prior validation failure back so the retry can self-correct.
        task = f"{task}\nPrevious response failed validation: {correction_error}"
    context = f"document_text:\n{raw_text[:MAX_INPUT_CHARS]}"
    output = (
        "Return JSON only with exactly these keys:\n"
        '{\n'
        '  "name": string or null,\n'
        '  "email": string or null,\n'
        '  "phone": string or null,\n'
        '  "website": string or null,\n'
        '  "skills": [string, ...],\n'
        '  "experience": [{"title": string or null, "company": string or null, "duration": string or null}, ...],\n'
        '  "education": [{"degree": string or null, "institution": string or null, "year": string or null}, ...],\n'
        '  "other_details": [string, ...]\n'
        '}'
    )
    return build_structured_prompt(
        persona=EXTRACTION_PERSONA,
        task=task,
        context=context,
        output=output,
    )


def _extract_with_retry(
Expand All @@ -52,12 +100,13 @@ def _extract_with_retry(
response_text = generate_json_response(
base_url=base_url,
model=model,
prompt=_build_prompt(raw_text, correction_error=error),
prompt=_build_extraction_prompt(raw_text, correction_error=error),
temperature=0.0,
top_p=0.1,
timeout_seconds=timeout_seconds,
Comment on lines +103 to 106
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Passing stop=["```"] can truncate responses that contain fenced JSON (e.g., starting with ```json), producing an empty/partial string that will always fail json.loads and consume retries. Consider removing the stop token and instead stripping markdown fences in post-processing, or choosing stop tokens that only match trailing fences so the JSON body is preserved.

Copilot uses AI. Check for mistakes.
)
try:
payload = json.loads(response_text)
payload = json.loads(_strip_markdown_json_fences(response_text))
validated = CandidateExtraction.model_validate(payload)
return validated.model_dump()
except (json.JSONDecodeError, ValidationError) as exc:
Expand Down
70 changes: 70 additions & 0 deletions app/agents/personas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Persona specifications and prompt helpers for extraction agent."""

from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class PersonaSpec:
    """Immutable persona contract rendered into system-prompt sections."""

    # Short description of who the agent is.
    role_identity: str
    # What the agent may and may not do.
    scope_boundaries: tuple[str, ...]
    # Non-negotiable rules (guardrails) the agent must obey.
    hard_constraints: tuple[str, ...]
    # Requirements on the shape of the agent's output.
    output_contract: tuple[str, ...]

    def system_section(self) -> str:
        """Render the persona as a labelled, bullet-pointed system section."""

        def as_bullets(items: tuple[str, ...]) -> str:
            return "\n".join(f"- {entry}" for entry in items)

        parts = (
            ("ROLE IDENTITY", self.role_identity),
            ("SCOPE BOUNDARIES", as_bullets(self.scope_boundaries)),
            ("HARD CONSTRAINTS", as_bullets(self.hard_constraints)),
            ("OUTPUT CONTRACT", as_bullets(self.output_contract)),
        )
        return "\n\n".join(f"{label}:\n{body}" for label, body in parts)


# Guardrails shared by every agent persona; spliced into each persona's
# hard constraints so all agents obey the same baseline rules.
GLOBAL_GUARDRAILS: tuple[str, ...] = (
    "no hallucinated fields",
    "no secret/API key leakage",
    "no overwriting other agents' owned state",
)


# Persona for the extraction agent: converts raw applicant documents into
# the structured extraction JSON schema, and nothing else (no scoring,
# decisions, reports, or notifications).
EXTRACTION_PERSONA = PersonaSpec(
    role_identity=(
        "You are the Extraction Agent. You convert raw applicant documents into structured JSON."
    ),
    scope_boundaries=(
        "Only extract applicant facts from the provided document text.",
        "Do not score, decide, report, or notify.",
        "Return only extraction-owned data fields.",
    ),
    hard_constraints=(
        # Baseline guardrails first, then extraction-specific rules.
        *GLOBAL_GUARDRAILS,
        "Use null for unknown scalar fields and [] for unknown list fields.",
    ),
    output_contract=(
        "Return one valid JSON object matching the extraction schema exactly.",
        "No markdown fences, preambles, or explanations.",
    ),
)


def build_structured_prompt(
    *, persona: PersonaSpec, task: str, context: str, output: str
) -> str:
    """Assemble the standard four-section prompt layout.

    Args:
        persona: Persona whose rendered contract forms the system section.
        task: Instruction describing what the agent must do.
        context: Input data the agent should work from.
        output: Required output-format description.

    Returns:
        The sections joined in fixed order, each introduced by its label.
    """
    sections = (
        ("SYSTEM SECTION", persona.system_section()),
        ("TASK SECTION", task),
        ("CONTEXT SECTION", context),
        ("OUTPUT SECTION", output),
    )
    return "\n\n".join(f"{label}:\n{body}" for label, body in sections)
10 changes: 9 additions & 1 deletion app/tools/ollama.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ def generate_json_response(
model: str,
prompt: str,
temperature: float = 0.0,
top_p: float = 0.1,
stop: list[str] | None = None,
timeout_seconds: float = 30.0,
) -> str:
"""Generate a JSON completion response from an Ollama model.
Expand All @@ -26,6 +28,8 @@ def generate_json_response(
model: Model name to query.
prompt: Prompt text sent to the model.
temperature: Sampling temperature for generation.
top_p: Nucleus sampling parameter.
stop: Optional stop tokens.
timeout_seconds: HTTP timeout in seconds.

Returns:
Expand All @@ -43,12 +47,16 @@ def generate_json_response(
)
"""
endpoint = f"{base_url.rstrip('/')}/api/generate"
options: dict[str, object] = {"temperature": temperature, "top_p": top_p}
if stop:
options["stop"] = stop

payload = {
"model": model,
"prompt": prompt,
"stream": False,
"format": "json",
"options": {"temperature": temperature},
"options": options,
}

try:
Expand Down
5 changes: 2 additions & 3 deletions app/tools/parse_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

from pathlib import Path

import fitz
from langchain.tools import tool
import pymupdf4llm


@tool
Expand All @@ -29,8 +29,7 @@ def parse_pdf_tool(path: str) -> str:
if not file_path.exists():
raise FileNotFoundError(f"File does not exist: {path}")

with fitz.open(path) as document:
text = "\n".join(page.get_text("text") for page in document)
text = str(pymupdf4llm.to_markdown(path))

cleaned = text.strip()
if not cleaned:
Expand Down
30 changes: 27 additions & 3 deletions app/tools/validate_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,39 @@

from __future__ import annotations

from pydantic import BaseModel, Field
from pydantic import BaseModel, ConfigDict, Field


class ExperienceEntry(BaseModel):
    """One work-experience item extracted from a candidate profile."""

    # Reject unexpected keys so hallucinated fields fail validation
    # instead of being silently dropped.
    model_config = ConfigDict(extra="forbid")

    # All fields are optional: None means "not found in the document".
    title: str | None = None
    company: str | None = None
    duration: str | None = None


class EducationEntry(BaseModel):
    """One education item extracted from a candidate profile."""

    # Reject unexpected keys so hallucinated fields fail validation
    # instead of being silently dropped.
    model_config = ConfigDict(extra="forbid")

    # All fields are optional: None means "not found in the document".
    degree: str | None = None
    institution: str | None = None
    year: str | None = None


class CandidateExtraction(BaseModel):
    """Structured extraction output from the extraction agent.

    Scalar fields default to None ("not found"); list fields default to
    empty lists. Unknown keys are rejected so hallucinated model output
    fails validation instead of being silently dropped.
    """

    model_config = ConfigDict(extra="forbid")

    name: str | None = Field(default=None)
    email: str | None = Field(default=None)
    phone: str | None = Field(default=None)
    website: str | None = Field(default=None)
    skills: list[str] = Field(default_factory=list)
    experience: list[ExperienceEntry] = Field(default_factory=list)
    education: list[EducationEntry] = Field(default_factory=list)
    other_details: list[str] = Field(default_factory=list)
Comment on lines 28 to +40
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CandidateExtraction currently accepts and silently ignores unexpected/hallucinated keys because no Pydantic extra policy is set. Since the prompt/persona contract says the JSON must match the schema exactly, consider forbidding extra keys (and similarly for nested ExperienceEntry/EducationEntry) so validation fails when the model returns additional fields instead of dropping them unnoticed.

Copilot uses AI. Check for mistakes.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies = [
"python-dotenv>=1.0.1",
"httpx>=0.27.0",
"pymupdf>=1.24.0",
"pymupdf4llm>=1.27.2.1",
]

[project.optional-dependencies]
Expand Down
Loading
Loading