-
Notifications
You must be signed in to change notification settings - Fork 0
Align extraction agent with structured Modelfile JSON contract #5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
fb428b1
07b1241
e36c798
6ba7a00
c688745
9bef4c9
4b740c1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| """Persona specifications and prompt helpers for extraction agent.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from dataclasses import dataclass | ||
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
| class PersonaSpec: | ||
| """Structured persona contract used to build system prompts.""" | ||
|
|
||
| role_identity: str | ||
| scope_boundaries: tuple[str, ...] | ||
| hard_constraints: tuple[str, ...] | ||
| output_contract: tuple[str, ...] | ||
|
|
||
| def system_section(self) -> str: | ||
| """Render persona details as a structured system section.""" | ||
| boundaries = "\n".join(f"- {item}" for item in self.scope_boundaries) | ||
| constraints = "\n".join(f"- {item}" for item in self.hard_constraints) | ||
| outputs = "\n".join(f"- {item}" for item in self.output_contract) | ||
| return ( | ||
| f"ROLE IDENTITY:\n{self.role_identity}\n\n" | ||
| f"SCOPE BOUNDARIES:\n{boundaries}\n\n" | ||
| f"HARD CONSTRAINTS:\n{constraints}\n\n" | ||
| f"OUTPUT CONTRACT:\n{outputs}" | ||
| ) | ||
|
|
||
|
|
||
| GLOBAL_GUARDRAILS: tuple[str, ...] = ( | ||
| "no hallucinated fields", | ||
| "no secret/API key leakage", | ||
| "no overwriting other agents' owned state", | ||
| ) | ||
|
|
||
|
|
||
| EXTRACTION_PERSONA = PersonaSpec( | ||
| role_identity=( | ||
| "You are the Extraction Agent. You convert raw applicant documents into structured JSON." | ||
| ), | ||
| scope_boundaries=( | ||
| "Only extract applicant facts from the provided document text.", | ||
| "Do not score, decide, report, or notify.", | ||
| "Return only extraction-owned data fields.", | ||
| ), | ||
| hard_constraints=( | ||
| *GLOBAL_GUARDRAILS, | ||
| "Use null for unknown scalar fields and [] for unknown list fields.", | ||
| ), | ||
| output_contract=( | ||
| "Return one valid JSON object matching the extraction schema exactly.", | ||
| "No markdown fences, preambles, or explanations.", | ||
| ), | ||
| ) | ||
|
|
||
|
|
||
| def build_structured_prompt( | ||
| *, persona: PersonaSpec, task: str, context: str, output: str | ||
| ) -> str: | ||
| """Build the standard prompt layout with explicit sections.""" | ||
| return ( | ||
| "SYSTEM SECTION:\n" | ||
| f"{persona.system_section()}\n\n" | ||
| "TASK SECTION:\n" | ||
| f"{task}\n\n" | ||
| "CONTEXT SECTION:\n" | ||
| f"{context}\n\n" | ||
| "OUTPUT SECTION:\n" | ||
| f"{output}" | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,15 +2,39 @@ | |
|
|
||
| from __future__ import annotations | ||
|
|
||
| from pydantic import BaseModel, Field | ||
| from pydantic import BaseModel, ConfigDict, Field | ||
|
|
||
|
|
||
| class ExperienceEntry(BaseModel): | ||
| """Structured experience entry extracted from a candidate profile.""" | ||
|
|
||
| model_config = ConfigDict(extra="forbid") | ||
|
|
||
| title: str | None = Field(default=None) | ||
| company: str | None = Field(default=None) | ||
| duration: str | None = Field(default=None) | ||
|
|
||
|
|
||
| class EducationEntry(BaseModel): | ||
| """Structured education entry extracted from a candidate profile.""" | ||
|
|
||
| model_config = ConfigDict(extra="forbid") | ||
|
|
||
| degree: str | None = Field(default=None) | ||
| institution: str | None = Field(default=None) | ||
| year: str | None = Field(default=None) | ||
|
|
||
|
|
||
| class CandidateExtraction(BaseModel): | ||
| """Structured extraction output from extraction agent.""" | ||
|
|
||
| model_config = ConfigDict(extra="forbid") | ||
|
|
||
| name: str | None = Field(default=None) | ||
| email: str | None = Field(default=None) | ||
| phone: str | None = Field(default=None) | ||
| website: str | None = Field(default=None) | ||
| skills: list[str] = Field(default_factory=list) | ||
| experience: str | None = Field(default=None) | ||
| education: str | None = Field(default=None) | ||
| experience: list[ExperienceEntry] = Field(default_factory=list) | ||
| education: list[EducationEntry] = Field(default_factory=list) | ||
| other_details: list[str] = Field(default_factory=list) | ||
|
Comment on lines 28 to +40
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Passing stop=["```"] can truncate responses that contain fenced JSON (e.g., starting with ```json), producing an empty/partial string that will always fail json.loads and consume retries. Consider removing the stop token and instead stripping markdown fences in post-processing, or choosing stop tokens that only match trailing fences so the JSON body is preserved.