From e621f15fd14a03bb80dcd8f0d97467876c745262 Mon Sep 17 00:00:00 2001 From: Chris Krough <461869+ckrough@users.noreply.github.com> Date: Fri, 29 May 2026 07:45:39 -0400 Subject: [PATCH] fix: derive eval subjects from document content (prof-cgu) Why: Encode the synthetic-eval subject convention in code so ground-truth rows describe document content rather than the document-type form the file's header prohibits. Changes: - Derive each row's subject via a model call against generated text, with a structural forbidden= guard that rejects the f"{category} {doctype}" form - Add scripts/backfill_eval_subjects.py to apply the same derivation in place across existing PDFs, preserving comments, key order, and entity fields - Validate --ai-model against MODEL_PRICING at startup; collect-then-apply on cost-cap abort; share an atomic-write helper with the generator - Add tests for sanitizer normalization, forbidden-form retry, empty-text raise, and fallback behavior - Rewrite the 80 subjects in eval/ground_truth/synthetic.jsonl to content-based form Issues: prof-cgu --- eval/ground_truth/synthetic.jsonl | 160 ++++++++--------- scripts/backfill_eval_subjects.py | 247 +++++++++++++++++++++++++++ scripts/generate_eval_samples.py | 132 ++++++++++++-- tests/test_generate_eval_subjects.py | 128 ++++++++++++++ 4 files changed, 572 insertions(+), 95 deletions(-) create mode 100644 scripts/backfill_eval_subjects.py create mode 100644 tests/test_generate_eval_subjects.py diff --git a/eval/ground_truth/synthetic.jsonl b/eval/ground_truth/synthetic.jsonl index bda398e..a7852fb 100644 --- a/eval/ground_truth/synthetic.jsonl +++ b/eval/ground_truth/synthetic.jsonl @@ -11,83 +11,83 @@ # # Usage (filenames below resolve under --documents-dir): # drover evaluate --ground-truth eval/ground_truth/synthetic.jsonl --documents-dir eval/samples/synthetic -{"filename": "medical_bill.pdf", "domain": "medical", "category": "expense", "doctype": "invoices", "vendor": "Northern Virginia Medical Center", "date": "20241101", "subject": "medical services", "notes": "Itemized medical services bill", "entity": ""} -{"filename": "receipt.pdf", "domain": "pets", "category": "expense", "doctype": "receipts", "vendor": "PetSmart", "date": "20241208", "subject": "pet supplies", "notes": "Retail receipt for pet supplies", "entity": ""} -{"filename": "user_manual.pdf", "domain": "household", "category": "documentation", "doctype": "manuals", "vendor": "smarthub", "date": "20240901", "subject": "smarthub 3000", "notes": "Device manual for SmartHub 3000"} -{"filename": "summit-academy_agreement_2025-07-23.pdf", "domain": "education", "category": "financial_aid", "doctype": "agreements", "vendor": "Summit Academy", "date": "20250723", "subject": "reference agreement", "notes": "synthetic, generated 2026-04-28; round4-remap: reference -> financial_aid", "entity": ""} -{"filename": "summit-academy_report_2025-12-09.pdf", "domain": "education", "category": "transcript", "doctype": "reports", "vendor": "Summit Academy", "date": "20251209", "subject": "transcript report", "notes": "synthetic, generated 2026-04-28", "entity": ""} -{"filename": "riverbend-medical-center_invoice_2026-02-16.pdf", "domain": "medical", "category": "specialist", "doctype": "invoices", "vendor": "Riverbend Medical Center", "date": "20260216", "subject": "specialist invoice", "notes": "synthetic, generated 2026-04-28", "entity": "Margaret L. Thornton"} -{"filename": "highland-hoa_receipt_2025-08-13.pdf", "domain": "property", "category": "hoa", "doctype": "receipts", "vendor": "Highland HOA", "date": "20250813", "subject": "agreement receipt", "notes": "synthetic, generated 2026-04-28; round4-remap: agreement -> hoa"} -{"filename": "coastline-adventures_itinerary_2026-01-17.pdf", "domain": "lifestyle", "category": "travel", "doctype": "itineraries", "vendor": "Coastline Adventures", "date": "20260117", "subject": "travel itinerary", "notes": "synthetic, generated 2026-04-28", "entity": ""} -{"filename": "cascade-power_invoice_2026-03-12.pdf", "domain": "utilities", "category": "electric", "doctype": "invoices", "vendor": "Cascade Power", "date": "20260312", "subject": "electric invoice", "notes": "synthetic, generated 2026-04-28"} -{"filename": "maple-lane-grocers_recipe_2025-05-04.pdf", "domain": "food", "category": "meal_plan", "doctype": "recipes", "vendor": "Maple Lane Grocers", "date": "20250504", "subject": "recipe recipe", "notes": "synthetic, generated 2026-04-28; round4-remap: recipe -> meal_plan"} -{"filename": "cedar-pine-home_guide_2026-01-06.pdf", "domain": "household", "category": "documentation", "doctype": "guides", "vendor": "Cedar & Pine Home", "date": "20260106", "subject": "documentation guide", "notes": "synthetic, generated 2026-04-28"} -{"filename": "keystone-coverage_policy_2026-03-02.pdf", "domain": "insurance", "category": "auto", "doctype": "policies", "vendor": "Keystone Coverage", "date": "20260302", "subject": "auto policy", "notes": "synthetic, generated 2026-04-28"} -{"filename": "brightford-notary_will_2025-09-24.pdf", "domain": "legal", "category": "estate", "doctype": "wills", "vendor": "Brightford Notary", "date": "20250924", "subject": "estate will", "notes": "synthetic, generated 2026-04-28"} -{"filename": "westhaven-rentals_lease_2025-12-07.pdf", "domain": "housing", "category": "property", "doctype": "leases", "vendor": "Westhaven Rentals", "date": "20251207", "subject": "property lease", "notes": "synthetic, generated 2026-04-28"} -{"filename": "riverbend-medical-center_invoice_2025-10-16.pdf", "domain": "medical", "category": "expense", "doctype": "invoices", "vendor": "Riverbend Medical Center", "date": "20251016", "subject": "expense invoice", "notes": "synthetic, generated 2026-04-28", "entity": "Margaret L. Holloway"} -{"filename": "hillcrest-college_agreement_2025-12-07.pdf", "domain": "education", "category": "financial_aid", "doctype": "agreements", "vendor": "Hillcrest College", "date": "20251207", "subject": "financial aid agreement", "notes": "synthetic, generated 2026-04-28", "entity": ""} -{"filename": "state-tax-authority_letter_2025-11-03.pdf", "domain": "government", "category": "state", "doctype": "letters", "vendor": "State Tax Authority", "date": "20251103", "subject": "correspondence letter", "notes": "synthetic, generated 2026-04-28; round4-remap: correspondence -> state"} -{"filename": "meridian-software_report_2025-11-07.pdf", "domain": "career", "category": "employer", "doctype": "reports", "vendor": "Meridian Software", "date": "20251107", "subject": "employer report", "notes": "synthetic, generated 2026-04-28"} -{"filename": "hillcrest-college_report_2026-04-06.pdf", "domain": "education", "category": "transcript", "doctype": "reports", "vendor": "Hillcrest College", "date": "20260406", "subject": "agreement report", "notes": "synthetic, generated 2026-04-28; round4-remap: agreement -> transcript", "entity": ""} -{"filename": "greenwood-family-practice_invoice_2026-03-19.pdf", "domain": "medical", "category": "primary_care", "doctype": "invoices", "vendor": "Greenwood Family Practice", "date": "20260319", "subject": "record invoice", "notes": "synthetic, generated 2026-04-28; round4-remap: record -> primary_care", "entity": "Patricia M. Holloway"} -{"filename": "greenwood-family-practice_referral_2025-06-11.pdf", "domain": "medical", "category": "prescription", "doctype": "referrals", "vendor": "Greenwood Family Practice", "date": "20250611", "subject": "prescription referral", "notes": "synthetic, generated 2026-04-28", "entity": "Harold T. Benson"} -{"filename": "civic-library-associati_certificate_2025-05-25.pdf", "domain": "lifestyle", "category": "membership", "doctype": "certificates", "vendor": "Civic Library Association", "date": "20250525", "subject": "membership certificate", "notes": "synthetic, generated 2026-04-28; relabeled prof-b7h personal->lifestyle"} -{"filename": "brightway-hardware_manual_2025-05-06.pdf", "domain": "household", "category": "documentation", "doctype": "manuals", "vendor": "Brightway Hardware", "date": "20250506", "subject": "documentation manual", "notes": "synthetic, generated 2026-04-28"} -{"filename": "northwind-studios_paystub_2025-06-07.pdf", "domain": "career", "category": "employer", "doctype": "paystubs", "vendor": "Northwind Studios", "date": "20250607", "subject": "employer paystub", "notes": "synthetic, generated 2026-04-28"} -{"filename": "willowbrook-market_manual_2025-10-16.pdf", "domain": "food", "category": "meal_plan", "doctype": "manuals", "vendor": "Willowbrook Market", "date": "20251016", "subject": "recipe manual", "notes": "synthetic, generated 2026-04-28; round4-remap: recipe -> meal_plan"} -{"filename": "sterling-property-group_agreement_2026-01-20.pdf", "domain": "property", "category": "rental", "doctype": "agreements", "vendor": "Sterling Property Group", "date": "20260120", "subject": "agreement agreement", "notes": "synthetic, generated 2026-04-28; round4-remap: agreement -> rental"} -{"filename": "bluefield-reference-library_manual_2025-07-28.pdf", "domain": "reference", "category": "documentation", "doctype": "manuals", "vendor": "Bluefield Reference Library", "date": "20250728", "subject": "manual manual", "notes": "synthetic, generated 2026-04-28; round4-remap: manual -> documentation"} -{"filename": "cedar-pine-home_warranty_2026-02-04.pdf", "domain": "household", "category": "documentation", "doctype": "warranties", "vendor": "Cedar & Pine Home", "date": "20260204", "subject": "documentation warranty", "notes": "synthetic, generated 2026-04-28"} -{"filename": "oakridge-realty_lease_2026-03-19.pdf", "domain": "housing", "category": "rental", "doctype": "leases", "vendor": "Oakridge Realty", "date": "20260319", "subject": "rental lease", "notes": "synthetic, generated 2026-04-28"} -{"filename": "greenleaf-tours_itinerary_2026-01-11.pdf", "domain": "lifestyle", "category": "expense", "doctype": "itineraries", "vendor": "Greenleaf Tours", "date": "20260111", "subject": "expense itinerary", "notes": "synthetic, generated 2026-04-28", "entity": ""} -{"filename": "wagging-tails-boarding_receipt_2026-03-23.pdf", "domain": "pets", "category": "expense", "doctype": "receipts", "vendor": "Wagging Tails Boarding", "date": "20260323", "subject": "expense receipt", "notes": "synthetic, generated 2026-04-28", "entity": "Biscuit"} -{"filename": "crestline-federal-credit-uni_report_2026-01-31.pdf", "domain": "financial", "category": "payment", "doctype": "reports", "vendor": "Crestline Federal Credit Union", "date": "20260131", "subject": "payment report", "notes": "synthetic, generated 2026-04-28"} -{"filename": "heritage-trust_statement_2025-05-04.pdf", "domain": "financial", "category": "investment", "doctype": "statements", "vendor": "Heritage Trust", "date": "20250504", "subject": "investment statement", "notes": "synthetic, generated 2026-04-28"} -{"filename": "meridian-software_letter_2025-09-05.pdf", "domain": "career", "category": "application", "doctype": "letters", "vendor": "Meridian Software", "date": "20250905", "subject": "application letter", "notes": "synthetic, generated 2026-04-29"} -{"filename": "pawsworth-supply_receipt_2025-06-27.pdf", "domain": "pets", "category": "registration", "doctype": "receipts", "vendor": "Pawsworth Supply", "date": "20250627", "subject": "registration receipt", "notes": "synthetic, generated 2026-04-29", "entity": ""} -{"filename": "willowbrook-market_recipe_2025-12-04.pdf", "domain": "food", "category": "meal_plan", "doctype": "recipes", "vendor": "Willowbrook Market", "date": "20251204", "subject": "reference recipe", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> meal_plan"} -{"filename": "harbor-athletic-club_identification_2026-04-20.pdf", "domain": "lifestyle", "category": "membership", "doctype": "identifications", "vendor": "Harbor Athletic Club", "date": "20260420", "subject": "athletic club id", "notes": "synthetic, generated 2026-04-29; relabeled prof-b7h personal/identity->lifestyle/membership"} -{"filename": "bridgeport-telecom_invoice_2025-07-23.pdf", "domain": "utilities", "category": "internet", "doctype": "invoices", "vendor": "Bridgeport Telecom", "date": "20250723", "subject": "internet invoice", "notes": "synthetic, generated 2026-04-29"} -{"filename": "riverstone-maintenance_receipt_2025-07-08.pdf", "domain": "property", "category": "maintenance", "doctype": "receipts", "vendor": "Riverstone Maintenance", "date": "20250708", "subject": "agreement receipt", "notes": "synthetic, generated 2026-04-29; round4-remap: agreement -> maintenance"} -{"filename": "wayfarer-travel_confirmation_2026-02-25.pdf", "domain": "lifestyle", "category": "travel", "doctype": "confirmations", "vendor": "Wayfarer Travel", "date": "20260225", "subject": "travel confirmation", "notes": "synthetic, generated 2026-04-29", "entity": ""} -{"filename": "harbor-athletic-club_certificate_2025-08-19.pdf", "domain": "lifestyle", "category": "membership", "doctype": "certificates", "vendor": "Harbor Athletic Club", "date": "20250819", "subject": "athletic club membership", "notes": "synthetic, generated 2026-04-29; relabeled prof-b7h personal/identity->lifestyle/membership"} -{"filename": "furry-friends-veterinary_agreement_2026-03-11.pdf", "domain": "pets", "category": "registration", "doctype": "agreements", "vendor": "Furry Friends Veterinary", "date": "20260311", "subject": "registration agreement", "notes": "synthetic, generated 2026-04-29", "entity": ""} -{"filename": "summit-water-district_statement_2026-01-29.pdf", "domain": "utilities", "category": "electric", "doctype": "statements", "vendor": "Summit Water District", "date": "20260129", "subject": "electric statement", "notes": "synthetic, generated 2026-04-29"} -{"filename": "oakridge-realty_agreement_2025-06-28.pdf", "domain": "housing", "category": "property", "doctype": "agreements", "vendor": "Oakridge Realty", "date": "20250628", "subject": "property agreement", "notes": "synthetic, generated 2026-04-29"} -{"filename": "brightford-notary_will_2025-05-07.pdf", "domain": "legal", "category": "estate", "doctype": "wills", "vendor": "Brightford Notary", "date": "20250507", "subject": "estate will", "notes": "synthetic, generated 2026-04-29"} -{"filename": "summit-water-district_statement_2026-03-28.pdf", "domain": "utilities", "category": "internet", "doctype": "statements", "vendor": "Summit Water District", "date": "20260328", "subject": "internet statement", "notes": "synthetic, generated 2026-04-29"} -{"filename": "open-knowledge-press_reference_2026-01-24.pdf", "domain": "reference", "category": "topic", "doctype": "references", "vendor": "Open Knowledge Press", "date": "20260124", "subject": "topic reference", "notes": "synthetic, generated 2026-04-29"} -{"filename": "northern-capital-bank_report_2025-11-29.pdf", "domain": "financial", "category": "investment", "doctype": "reports", "vendor": "Northern Capital Bank", "date": "20251129", "subject": "investment report", "notes": "synthetic, generated 2026-04-29"} -{"filename": "highland-hoa_estimate_2025-07-04.pdf", "domain": "property", "category": "hoa", "doctype": "estimates", "vendor": "Highland HOA", "date": "20250704", "subject": "reference estimate", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> hoa"} -{"filename": "brightford-notary_trust_2025-11-11.pdf", "domain": "legal", "category": "estate", "doctype": "trusts", "vendor": "Brightford Notary", "date": "20251111", "subject": "reference trust", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> estate"} -{"filename": "wagging-tails-boarding_receipt_2025-05-24.pdf", "domain": "pets", "category": "expense", "doctype": "receipts", "vendor": "Wagging Tails Boarding", "date": "20250524", "subject": "expense receipt", "notes": "synthetic, generated 2026-04-29", "entity": "Biscuit"} -{"filename": "oakridge-realty_lease_2025-07-09.pdf", "domain": "housing", "category": "rental", "doctype": "leases", "vendor": "Oakridge Realty", "date": "20250709", "subject": "rental lease", "notes": "synthetic, generated 2026-04-29"} -{"filename": "hillcrest-college_agreement_2026-02-03.pdf", "domain": "education", "category": "financial_aid", "doctype": "agreements", "vendor": "Hillcrest College", "date": "20260203", "subject": "financial aid agreement", "notes": "synthetic, generated 2026-04-29", "entity": ""} -{"filename": "marlowe-stone-llp_will_2025-08-05.pdf", "domain": "legal", "category": "estate", "doctype": "wills", "vendor": "Marlowe & Stone LLP", "date": "20250805", "subject": "estate will", "notes": "synthetic, generated 2026-04-29"} -{"filename": "open-knowledge-press_reference_2025-11-17.pdf", "domain": "reference", "category": "topic", "doctype": "references", "vendor": "Open Knowledge Press", "date": "20251117", "subject": "topic reference", "notes": "synthetic, generated 2026-04-29"} -{"filename": "highland-hoa_agreement_2025-06-15.pdf", "domain": "property", "category": "hoa", "doctype": "agreements", "vendor": "Highland HOA", "date": "20250615", "subject": "agreement agreement", "notes": "synthetic, generated 2026-04-29; round4-remap: agreement -> hoa"} -{"filename": "wayfarer-travel_confirmation_2025-07-21.pdf", "domain": "lifestyle", "category": "expense", "doctype": "confirmations", "vendor": "Wayfarer Travel", "date": "20250721", "subject": "expense confirmation", "notes": "synthetic, generated 2026-04-29", "entity": ""} -{"filename": "county-permits-office_form_2025-12-13.pdf", "domain": "government", "category": "federal", "doctype": "forms", "vendor": "County Permits Office", "date": "20251213", "subject": "federal form", "notes": "synthetic, generated 2026-04-29"} -{"filename": "state-tax-authority_form_2026-04-08.pdf", "domain": "government", "category": "local", "doctype": "forms", "vendor": "State Tax Authority", "date": "20260408", "subject": "local form", "notes": "synthetic, generated 2026-04-29"} -{"filename": "crestline-federal-credit_statement_2025-10-26.pdf", "domain": "financial", "category": "investment", "doctype": "statements", "vendor": "Crestline Federal Credit Union", "date": "20251026", "subject": "investment statement", "notes": "synthetic, generated 2026-04-29"} -{"filename": "open-knowledge-press_manual_2025-05-20.pdf", "domain": "reference", "category": "documentation", "doctype": "manuals", "vendor": "Open Knowledge Press", "date": "20250520", "subject": "manual manual", "notes": "synthetic, generated 2026-04-29; round4-remap: manual -> documentation"} -{"filename": "sentinel-mutual_letter_2025-06-25.pdf", "domain": "insurance", "category": "home", "doctype": "letters", "vendor": "Sentinel Mutual", "date": "20250625", "subject": "home letter", "notes": "synthetic, generated 2026-04-29"} -{"filename": "allport-insurance-group_policy_2025-11-28.pdf", "domain": "insurance", "category": "home", "doctype": "policies", "vendor": "Allport Insurance Group", "date": "20251128", "subject": "home policy", "notes": "synthetic, generated 2026-04-29"} -{"filename": "bridgeport-telecom_invoice_2025-07-29.pdf", "domain": "utilities", "category": "electric", "doctype": "invoices", "vendor": "Bridgeport Telecom", "date": "20250729", "subject": "electric invoice", "notes": "synthetic, generated 2026-04-29"} -{"filename": "ironwood-supply_manual_2026-03-16.pdf", "domain": "household", "category": "documentation", "doctype": "manuals", "vendor": "Ironwood Supply", "date": "20260316", "subject": "reference manual", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> documentation"} -{"filename": "northwind-studios_report_2025-07-03.pdf", "domain": "career", "category": "employer", "doctype": "reports", "vendor": "Northwind Studios", "date": "20250703", "subject": "employer report", "notes": "synthetic, generated 2026-04-29"} -{"filename": "allport-insurance-group_policy_2025-10-28.pdf", "domain": "insurance", "category": "home", "doctype": "policies", "vendor": "Allport Insurance Group", "date": "20251028", "subject": "home policy", "notes": "synthetic, generated 2026-04-29"} -{"filename": "lighthouse-volunteers_certificate_2026-02-28.pdf", "domain": "lifestyle", "category": "volunteering", "doctype": "certificates", "vendor": "Lighthouse Volunteers", "date": "20260228", "subject": "volunteer certificate", "notes": "synthetic, generated 2026-04-29; relabeled prof-b7h personal/identity->lifestyle/volunteering"} -{"filename": "northwind-studios_letter_2025-12-18.pdf", "domain": "career", "category": "application", "doctype": "letters", "vendor": "Northwind Studios", "date": "20251218", "subject": "application letter", "notes": "synthetic, generated 2026-04-29"} -{"filename": "keystone-coverage_policy_2025-11-23.pdf", "domain": "insurance", "category": "auto", "doctype": "policies", "vendor": "Keystone Coverage", "date": "20251123", "subject": "auto policy", "notes": "synthetic, generated 2026-04-29"} -{"filename": "northern-capital-bank_statement_2026-04-28.pdf", "domain": "financial", "category": "investment", "doctype": "statements", "vendor": "Northern Capital Bank", "date": "20260428", "subject": "investment statement", "notes": "synthetic, generated 2026-04-29"} -{"filename": "marlowe-stone-llp_will_2026-02-24.pdf", "domain": "legal", "category": "estate", "doctype": "wills", "vendor": "Marlowe & Stone LLP", "date": "20260224", "subject": "estate will", "notes": "synthetic, generated 2026-04-29"} -{"filename": "wayfarer-travel_confirmation_2025-04-30.pdf", "domain": "lifestyle", "category": "expense", "doctype": "confirmations", "vendor": "Wayfarer Travel", "date": "20250430", "subject": "expense confirmation", "notes": "synthetic, generated 2026-04-29", "entity": ""} -{"filename": "maple-lane-grocers_manual_2025-10-04.pdf", "domain": "food", "category": "meal_plan", "doctype": "manuals", "vendor": "Maple Lane Grocers", "date": "20251004", "subject": "reference manual", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> meal_plan"} -{"filename": "harvest-table-co-op_recipe_2025-11-13.pdf", "domain": "food", "category": "meal_plan", "doctype": "recipes", "vendor": "Harvest Table Co-op", "date": "20251113", "subject": "recipe recipe", "notes": "synthetic, generated 2026-04-29; round4-remap: recipe -> meal_plan"} -{"filename": "bluefield-reference-library_manual_2025-11-06.pdf", "domain": "reference", "category": "documentation", "doctype": "manuals", "vendor": "Bluefield Reference Library", "date": "20251106", "subject": "manual manual", "notes": "synthetic, generated 2026-04-29; round4-remap: manual -> documentation"} -{"filename": "department-of-public-records_form_2026-01-14.pdf", "domain": "government", "category": "federal", "doctype": "forms", "vendor": "Department of Public Records", "date": "20260114", "subject": "federal form", "notes": "synthetic, generated 2026-04-29"} -{"filename": "lighthouse-volunteer_identification_2026-04-03.pdf", "domain": "lifestyle", "category": "volunteering", "doctype": "identifications", "vendor": "Lighthouse Volunteers", "date": "20260403", "subject": "volunteer id", "notes": "synthetic, generated 2026-04-29; relabeled prof-b7h personal/identity->lifestyle/volunteering"} -{"filename": "county-permits-office_form_2025-07-09.pdf", "domain": "government", "category": "federal", "doctype": "forms", "vendor": "County Permits Office", "date": "20250709", "subject": "federal form", "notes": "synthetic, generated 2026-04-29"} -{"filename": "oakridge-realty_lease_2025-06-29.pdf", "domain": "housing", "category": "rental", "doctype": "leases", "vendor": "Oakridge Realty", "date": "20250629", "subject": "rental lease", "notes": "synthetic, generated 2026-04-29"} +{"filename": "medical_bill.pdf", "domain": "medical", "category": "expense", "doctype": "invoices", "vendor": "Northern Virginia Medical Center", "date": "20241101", "subject": "october 2024 medical charges", "notes": "Itemized medical services bill", "entity": ""} +{"filename": "receipt.pdf", "domain": "pets", "category": "expense", "doctype": "receipts", "vendor": "PetSmart", "date": "20241208", "subject": "pet supply purchase", "notes": "Retail receipt for pet supplies", "entity": ""} +{"filename": "user_manual.pdf", "domain": "household", "category": "documentation", "doctype": "manuals", "vendor": "smarthub", "date": "20240901", "subject": "smarthub 3000 setup", "notes": "Device manual for SmartHub 3000"} +{"filename": "summit-academy_agreement_2025-07-23.pdf", "domain": "education", "category": "financial_aid", "doctype": "agreements", "vendor": "Summit Academy", "date": "20250723", "subject": "summit academy tutoring enrollment", "notes": "synthetic, generated 2026-04-28; round4-remap: reference -> financial_aid", "entity": ""} +{"filename": "summit-academy_report_2025-12-09.pdf", "domain": "education", "category": "transcript", "doctype": "reports", "vendor": "Summit Academy", "date": "20251209", "subject": "student fall 2025 grades", "notes": "synthetic, generated 2026-04-28", "entity": ""} +{"filename": "riverbend-medical-center_invoice_2026-02-16.pdf", "domain": "medical", "category": "specialist", "doctype": "invoices", "vendor": "Riverbend Medical Center", "date": "20260216", "subject": "margaret thornton neurology services", "notes": "synthetic, generated 2026-04-28", "entity": "Margaret L. Thornton"} +{"filename": "highland-hoa_receipt_2025-08-13.pdf", "domain": "property", "category": "hoa", "doctype": "receipts", "vendor": "Highland HOA", "date": "20250813", "subject": "unit 47 hoa dues", "notes": "synthetic, generated 2026-04-28; round4-remap: agreement -> hoa"} +{"filename": "coastline-adventures_itinerary_2026-01-17.pdf", "domain": "lifestyle", "category": "travel", "doctype": "itineraries", "vendor": "Coastline Adventures", "date": "20260117", "subject": "margaret fontaine pelican bay", "notes": "synthetic, generated 2026-04-28", "entity": ""} +{"filename": "cascade-power_invoice_2026-03-12.pdf", "domain": "utilities", "category": "electric", "doctype": "invoices", "vendor": "Cascade Power", "date": "20260312", "subject": "thornton february electricity usage", "notes": "synthetic, generated 2026-04-28"} +{"filename": "maple-lane-grocers_recipe_2025-05-04.pdf", "domain": "food", "category": "meal_plan", "doctype": "recipes", "vendor": "Maple Lane Grocers", "date": "20250504", "subject": "honey glazed chicken thighs", "notes": "synthetic, generated 2026-04-28; round4-remap: recipe -> meal_plan"} +{"filename": "cedar-pine-home_guide_2026-01-06.pdf", "domain": "household", "category": "documentation", "doctype": "guides", "vendor": "Cedar & Pine Home", "date": "20260106", "subject": "upholstered furniture cleaning", "notes": "synthetic, generated 2026-04-28"} +{"filename": "keystone-coverage_policy_2026-03-02.pdf", "domain": "insurance", "category": "auto", "doctype": "policies", "vendor": "Keystone Coverage", "date": "20260302", "subject": "gerald whitmore auto coverage", "notes": "synthetic, generated 2026-04-28"} +{"filename": "brightford-notary_will_2025-09-24.pdf", "domain": "legal", "category": "estate", "doctype": "wills", "vendor": "Brightford Notary", "date": "20250924", "subject": "margaret elaine voss estate", "notes": "synthetic, generated 2026-04-28"} +{"filename": "westhaven-rentals_lease_2025-12-07.pdf", "domain": "housing", "category": "property", "doctype": "leases", "vendor": "Westhaven Rentals", "date": "20251207", "subject": "17 maplewood court tenancy", "notes": "synthetic, generated 2026-04-28"} +{"filename": "riverbend-medical-center_invoice_2025-10-16.pdf", "domain": "medical", "category": "expense", "doctype": "invoices", "vendor": "Riverbend Medical Center", "date": "20251016", "subject": "margaret holloway medical services", "notes": "synthetic, generated 2026-04-28", "entity": "Margaret L. Holloway"} +{"filename": "hillcrest-college_agreement_2025-12-07.pdf", "domain": "education", "category": "financial_aid", "doctype": "agreements", "vendor": "Hillcrest College", "date": "20251207", "subject": "hillcrest college spring aid", "notes": "synthetic, generated 2026-04-28", "entity": ""} +{"filename": "state-tax-authority_letter_2025-11-03.pdf", "domain": "government", "category": "state", "doctype": "letters", "vendor": "State Tax Authority", "date": "20251103", "subject": "forsythe 2024 tax refund", "notes": "synthetic, generated 2026-04-28; round4-remap: correspondence -> state"} +{"filename": "meridian-software_report_2025-11-07.pdf", "domain": "career", "category": "employer", "doctype": "reports", "vendor": "Meridian Software", "date": "20251107", "subject": "meridian software q3 productivity", "notes": "synthetic, generated 2026-04-28"} +{"filename": "hillcrest-college_report_2026-04-06.pdf", "domain": "education", "category": "transcript", "doctype": "reports", "vendor": "Hillcrest College", "date": "20260406", "subject": "hillcrest meridian learning contract", "notes": "synthetic, generated 2026-04-28; round4-remap: agreement -> transcript", "entity": ""} +{"filename": "greenwood-family-practice_invoice_2026-03-19.pdf", "domain": "medical", "category": "primary_care", "doctype": "invoices", "vendor": "Greenwood Family Practice", "date": "20260319", "subject": "patricia holloway medical charges", "notes": "synthetic, generated 2026-04-28; round4-remap: record -> primary_care", "entity": "Patricia M. Holloway"} +{"filename": "greenwood-family-practice_referral_2025-06-11.pdf", "domain": "medical", "category": "prescription", "doctype": "referrals", "vendor": "Greenwood Family Practice", "date": "20250611", "subject": "harold benson endocrinology referral", "notes": "synthetic, generated 2026-04-28", "entity": "Harold T. Benson"} +{"filename": "civic-library-associati_certificate_2025-05-25.pdf", "domain": "lifestyle", "category": "membership", "doctype": "certificates", "vendor": "Civic Library Association", "date": "20250525", "subject": "margaret fontaine library membership", "notes": "synthetic, generated 2026-04-28; relabeled prof-b7h personal->lifestyle"} +{"filename": "brightway-hardware_manual_2025-05-06.pdf", "domain": "household", "category": "documentation", "doctype": "manuals", "vendor": "Brightway Hardware", "date": "20250506", "subject": "brightway proclean 3500 setup", "notes": "synthetic, generated 2026-04-28"} +{"filename": "northwind-studios_paystub_2025-06-07.pdf", "domain": "career", "category": "employer", "doctype": "paystubs", "vendor": "Northwind Studios", "date": "20250607", "subject": "marcus delacroix may earnings", "notes": "synthetic, generated 2026-04-28"} +{"filename": "willowbrook-market_manual_2025-10-16.pdf", "domain": "food", "category": "meal_plan", "doctype": "manuals", "vendor": "Willowbrook Market", "date": "20251016", "subject": "harvest grain bowl kit", "notes": "synthetic, generated 2026-04-28; round4-remap: recipe -> meal_plan"} +{"filename": "sterling-property-group_agreement_2026-01-20.pdf", "domain": "property", "category": "rental", "doctype": "agreements", "vendor": "Sterling Property Group", "date": "20260120", "subject": "sterling property group services", "notes": "synthetic, generated 2026-04-28; round4-remap: agreement -> rental"} +{"filename": "bluefield-reference-library_manual_2025-07-28.pdf", "domain": "reference", "category": "documentation", "doctype": "manuals", "vendor": "Bluefield Reference Library", "date": "20250728", "subject": "library kiosk setup manual", "notes": "synthetic, generated 2026-04-28; round4-remap: manual -> documentation"} +{"filename": "cedar-pine-home_warranty_2026-02-04.pdf", "domain": "household", "category": "documentation", "doctype": "warranties", "vendor": "Cedar & Pine Home", "date": "20260204", "subject": "artisan console table warranty", "notes": "synthetic, generated 2026-04-28"} +{"filename": "oakridge-realty_lease_2026-03-19.pdf", "domain": "housing", "category": "rental", "doctype": "leases", "vendor": "Oakridge Realty", "date": "20260319", "subject": "jordan calloway rental unit", "notes": "synthetic, generated 2026-04-28"} +{"filename": "greenleaf-tours_itinerary_2026-01-11.pdf", "domain": "lifestyle", "category": "expense", "doctype": "itineraries", "vendor": "Greenleaf Tours", "date": "20260111", "subject": "margaret fontaine ashford trip", "notes": "synthetic, generated 2026-04-28", "entity": ""} +{"filename": "wagging-tails-boarding_receipt_2026-03-23.pdf", "domain": "pets", "category": "expense", "doctype": "receipts", "vendor": "Wagging Tails Boarding", "date": "20260323", "subject": "biscuit clover pet boarding", "notes": "synthetic, generated 2026-04-28", "entity": "Biscuit"} +{"filename": "crestline-federal-credit-uni_report_2026-01-31.pdf", "domain": "financial", "category": "payment", "doctype": "reports", "vendor": "Crestline Federal Credit Union", "date": "20260131", "subject": "credit union payment discrepancies", "notes": "synthetic, generated 2026-04-28"} +{"filename": "heritage-trust_statement_2025-05-04.pdf", "domain": "financial", "category": "investment", "doctype": "statements", "vendor": "Heritage Trust", "date": "20250504", "subject": "forsythe april investment account", "notes": "synthetic, generated 2026-04-28"} +{"filename": "meridian-software_letter_2025-09-05.pdf", "domain": "career", "category": "application", "doctype": "letters", "vendor": "Meridian Software", "date": "20250905", "subject": "senior software engineer application", "notes": "synthetic, generated 2026-04-29"} +{"filename": "pawsworth-supply_receipt_2025-06-27.pdf", "domain": "pets", "category": "registration", "doctype": "receipts", "vendor": "Pawsworth Supply", "date": "20250627", "subject": "dog registration purchase", "notes": "synthetic, generated 2026-04-29", "entity": ""} +{"filename": "willowbrook-market_recipe_2025-12-04.pdf", "domain": "food", "category": "meal_plan", "doctype": "recipes", "vendor": "Willowbrook Market", "date": "20251204", "subject": "beef bourguignon recipe", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> meal_plan"} +{"filename": "harbor-athletic-club_identification_2026-04-20.pdf", "domain": "lifestyle", "category": "membership", "doctype": "identifications", "vendor": "Harbor Athletic Club", "date": "20260420", "subject": "marlowe hendricks gym membership", "notes": "synthetic, generated 2026-04-29; relabeled prof-b7h personal/identity->lifestyle/membership"} +{"filename": "bridgeport-telecom_invoice_2025-07-23.pdf", "domain": "utilities", "category": "internet", "doctype": "invoices", "vendor": "Bridgeport Telecom", "date": "20250723", "subject": "kowalski july internet service", "notes": "synthetic, generated 2026-04-29"} +{"filename": "riverstone-maintenance_receipt_2025-07-08.pdf", "domain": "property", "category": "maintenance", "doctype": "receipts", "vendor": "Riverstone Maintenance", "date": "20250708", "subject": "delmont property maintenance services", "notes": "synthetic, generated 2026-04-29; round4-remap: agreement -> maintenance"} +{"filename": "wayfarer-travel_confirmation_2026-02-25.pdf", "domain": "lifestyle", "category": "travel", "doctype": "confirmations", "vendor": "Wayfarer Travel", "date": "20260225", "subject": "ostrowski lisbon trip booking", "notes": "synthetic, generated 2026-04-29", "entity": ""} +{"filename": "harbor-athletic-club_certificate_2025-08-19.pdf", "domain": "lifestyle", "category": "membership", "doctype": "certificates", "vendor": "Harbor Athletic Club", "date": "20250819", "subject": "jordan calloway membership", "notes": "synthetic, generated 2026-04-29; relabeled prof-b7h personal/identity->lifestyle/membership"} +{"filename": "furry-friends-veterinary_agreement_2026-03-11.pdf", "domain": "pets", "category": "registration", "doctype": "agreements", "vendor": "Furry Friends Veterinary", "date": "20260311", "subject": "pet wellness registration plan", "notes": "synthetic, generated 2026-04-29", "entity": ""} +{"filename": "summit-water-district_statement_2026-01-29.pdf", "domain": "utilities", "category": "electric", "doctype": "statements", "vendor": "Summit Water District", "date": "20260129", "subject": "patricia fenwick january electricity", "notes": "synthetic, generated 2026-04-29"} +{"filename": "oakridge-realty_agreement_2025-06-28.pdf", "domain": "housing", "category": "property", "doctype": "agreements", "vendor": "Oakridge Realty", "date": "20250628", "subject": "47 birchwood court tenancy", "notes": "synthetic, generated 2026-04-29"} +{"filename": "brightford-notary_will_2025-05-07.pdf", "domain": "legal", "category": "estate", "doctype": "wills", "vendor": "Brightford Notary", "date": "20250507", "subject": "margaret dunford estate wishes", "notes": "synthetic, generated 2026-04-29"} +{"filename": "summit-water-district_statement_2026-03-28.pdf", "domain": "utilities", "category": "internet", "doctype": "statements", "vendor": "Summit Water District", "date": "20260328", "subject": "margaret thornton march water", "notes": "synthetic, generated 2026-04-29"} +{"filename": "open-knowledge-press_reference_2026-01-24.pdf", "domain": "reference", "category": "topic", "doctype": "references", "vendor": "Open Knowledge Press", "date": "20260124", "subject": "open access publishing tiers", "notes": "synthetic, generated 2026-04-29"} +{"filename": "northern-capital-bank_report_2025-11-29.pdf", "domain": "financial", "category": "investment", "doctype": "reports", "vendor": "Northern Capital Bank", "date": "20251129", "subject": "client portfolio q3 performance", "notes": "synthetic, generated 2026-04-29"} +{"filename": "highland-hoa_estimate_2025-07-04.pdf", "domain": "property", "category": "hoa", "doctype": "estimates", "vendor": "Highland HOA", "date": "20250704", "subject": "highland estates property maintenance", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> hoa"} +{"filename": "brightford-notary_trust_2025-11-11.pdf", "domain": "legal", "category": "estate", "doctype": "trusts", "vendor": "Brightford Notary", "date": "20251111", "subject": "hollowell granddaughter education trust", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> estate"} +{"filename": "wagging-tails-boarding_receipt_2025-05-24.pdf", "domain": "pets", "category": "expense", "doctype": "receipts", "vendor": "Wagging Tails Boarding", "date": "20250524", "subject": "biscuit dog boarding", "notes": "synthetic, generated 2026-04-29", "entity": "Biscuit"} +{"filename": "oakridge-realty_lease_2025-07-09.pdf", "domain": "housing", "category": "rental", "doctype": "leases", "vendor": "Oakridge Realty", "date": "20250709", "subject": "oakridge realty apartment lease", "notes": "synthetic, generated 2026-04-29"} +{"filename": "hillcrest-college_agreement_2026-02-03.pdf", "domain": "education", "category": "financial_aid", "doctype": "agreements", "vendor": "Hillcrest College", "date": "20260203", "subject": "hillcrest college spring aid", "notes": "synthetic, generated 2026-04-29", "entity": ""} +{"filename": "marlowe-stone-llp_will_2025-08-05.pdf", "domain": "legal", "category": "estate", "doctype": "wills", "vendor": "Marlowe & Stone LLP", "date": "20250805", "subject": "dorothea elaine voss estate", "notes": "synthetic, generated 2026-04-29"} +{"filename": "open-knowledge-press_reference_2025-11-17.pdf", "domain": "reference", "category": "topic", "doctype": "references", "vendor": "Open Knowledge Press", "date": "20251117", "subject": "microeconomic concepts overview", "notes": "synthetic, generated 2026-04-29"} +{"filename": "highland-hoa_agreement_2025-06-15.pdf", "domain": "property", "category": "hoa", "doctype": "agreements", "vendor": "Highland HOA", "date": "20250615", "subject": "highland hoa grounds maintenance", "notes": "synthetic, generated 2026-04-29; round4-remap: agreement -> hoa"} +{"filename": "wayfarer-travel_confirmation_2025-07-21.pdf", "domain": "lifestyle", "category": "expense", "doctype": "confirmations", "vendor": "Wayfarer Travel", "date": "20250721", "subject": "fontaine lisbon travel expenses", "notes": "synthetic, generated 2026-04-29", "entity": ""} +{"filename": "county-permits-office_form_2025-12-13.pdf", "domain": "government", "category": "federal", "doctype": "forms", "vendor": "County Permits Office", "date": "20251213", "subject": "minor works permit application", "notes": "synthetic, generated 2026-04-29"} +{"filename": "state-tax-authority_form_2026-04-08.pdf", "domain": "government", "category": "local", "doctype": "forms", "vendor": "State Tax Authority", "date": "20260408", "subject": "local business tax update", "notes": "synthetic, generated 2026-04-29"} +{"filename": "crestline-federal-credit_statement_2025-10-26.pdf", "domain": "financial", "category": "investment", "doctype": "statements", "vendor": "Crestline Federal Credit Union", "date": "20251026", "subject": "fontaine investment account october", "notes": "synthetic, generated 2026-04-29"} +{"filename": "open-knowledge-press_manual_2025-05-20.pdf", "domain": "reference", "category": "documentation", "doctype": "manuals", "vendor": "Open Knowledge Press", "date": "20250520", "subject": "brewmaster bm 200 coffee", "notes": "synthetic, generated 2026-04-29; round4-remap: manual -> documentation"} +{"filename": "sentinel-mutual_letter_2025-06-25.pdf", "domain": "insurance", "category": "home", "doctype": "letters", "vendor": "Sentinel Mutual", "date": "20250625", "subject": "fontaine kitchen water damage", "notes": "synthetic, generated 2026-04-29"} +{"filename": "allport-insurance-group_policy_2025-11-28.pdf", "domain": "insurance", "category": "home", "doctype": "policies", "vendor": "Allport Insurance Group", "date": "20251128", "subject": "thornton homeowners insurance coverage", "notes": "synthetic, generated 2026-04-29"} +{"filename": "bridgeport-telecom_invoice_2025-07-29.pdf", "domain": "utilities", "category": "electric", "doctype": "invoices", "vendor": "Bridgeport Telecom", "date": "20250729", "subject": "carleton marsh july electricity", "notes": "synthetic, generated 2026-04-29"} +{"filename": "ironwood-supply_manual_2026-03-16.pdf", "domain": "household", "category": "documentation", "doctype": "manuals", "vendor": "Ironwood Supply", "date": "20260316", "subject": "iw 7 air purifier", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> documentation"} +{"filename": "northwind-studios_report_2025-07-03.pdf", "domain": "career", "category": "employer", "doctype": "reports", "vendor": "Northwind Studios", "date": "20250703", "subject": "northwind studios q2 workforce", "notes": "synthetic, generated 2026-04-29"} +{"filename": "allport-insurance-group_policy_2025-10-28.pdf", "domain": "insurance", "category": "home", "doctype": "policies", "vendor": "Allport Insurance Group", "date": "20251028", "subject": "patricia harmon home insurance", "notes": "synthetic, generated 2026-04-29"} +{"filename": "lighthouse-volunteers_certificate_2026-02-28.pdf", "domain": "lifestyle", "category": "volunteering", "doctype": "certificates", "vendor": "Lighthouse Volunteers", "date": "20260228", "subject": "jordan calloway volunteer verification", "notes": "synthetic, generated 2026-04-29; relabeled prof-b7h personal/identity->lifestyle/volunteering"} +{"filename": "northwind-studios_letter_2025-12-18.pdf", "domain": "career", "category": "application", "doctype": "letters", "vendor": "Northwind Studios", "date": "20251218", "subject": "northwind studios designer application", "notes": "synthetic, generated 2026-04-29"} +{"filename": "keystone-coverage_policy_2025-11-23.pdf", "domain": "insurance", "category": "auto", "doctype": "policies", "vendor": "Keystone Coverage", "date": "20251123", "subject": "marcus delgado auto insurance", "notes": "synthetic, generated 2026-04-29"} +{"filename": "northern-capital-bank_statement_2026-04-28.pdf", "domain": "financial", "category": "investment", "doctype": "statements", "vendor": "Northern Capital Bank", "date": "20260428", "subject": "forsythe investment account april", "notes": "synthetic, generated 2026-04-29"} +{"filename": "marlowe-stone-llp_will_2026-02-24.pdf", "domain": "legal", "category": "estate", "doctype": "wills", "vendor": "Marlowe & Stone LLP", "date": "20260224", "subject": "dorothy carmichael estate wishes", "notes": "synthetic, generated 2026-04-29"} +{"filename": "wayfarer-travel_confirmation_2025-04-30.pdf", "domain": "lifestyle", "category": "expense", "doctype": "confirmations", "vendor": "Wayfarer Travel", "date": "20250430", "subject": "ostrowski portland travel package", "notes": "synthetic, generated 2026-04-29", "entity": ""} +{"filename": "maple-lane-grocers_manual_2025-10-04.pdf", "domain": "food", "category": "meal_plan", "doctype": "manuals", "vendor": "Maple Lane Grocers", "date": "20251004", "subject": "mlg 7 food warmer", "notes": "synthetic, generated 2026-04-29; round4-remap: reference -> meal_plan"} +{"filename": "harvest-table-co-op_recipe_2025-11-13.pdf", "domain": "food", "category": "meal_plan", "doctype": "recipes", "vendor": "Harvest Table Co-op", "date": "20251113", "subject": "butternut squash mushroom risotto", "notes": "synthetic, generated 2026-04-29; round4-remap: recipe -> meal_plan"} +{"filename": "bluefield-reference-library_manual_2025-11-06.pdf", "domain": "reference", "category": "documentation", "doctype": "manuals", "vendor": "Bluefield Reference Library", "date": "20251106", "subject": "library kiosk terminal setup", "notes": "synthetic, generated 2026-04-29; round4-remap: manual -> documentation"} +{"filename": "department-of-public-records_form_2026-01-14.pdf", "domain": "government", "category": "federal", "doctype": "forms", "vendor": "Department of Public Records", "date": "20260114", "subject": "federal records access request", "notes": "synthetic, generated 2026-04-29"} +{"filename": "lighthouse-volunteer_identification_2026-04-03.pdf", "domain": "lifestyle", "category": "volunteering", "doctype": "identifications", "vendor": "Lighthouse Volunteers", "date": "20260403", "subject": "marguerite trowbridge volunteer membership", "notes": "synthetic, generated 2026-04-29; relabeled prof-b7h personal/identity->lifestyle/volunteering"} +{"filename": "county-permits-office_form_2025-07-09.pdf", "domain": "government", "category": "federal", "doctype": "forms", "vendor": "County Permits Office", "date": "20250709", "subject": "minor construction permit application", "notes": "synthetic, generated 2026-04-29"} +{"filename": "oakridge-realty_lease_2025-06-29.pdf", "domain": "housing", "category": "rental", "doctype": "leases", "vendor": "Oakridge Realty", "date": "20250629", "subject": "oakridge realty residential tenancy", "notes": "synthetic, generated 2026-04-29"} diff --git a/scripts/backfill_eval_subjects.py b/scripts/backfill_eval_subjects.py new file mode 100644 index 0000000..84fcb06 --- /dev/null +++ b/scripts/backfill_eval_subjects.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +"""Backfill content-based subjects into the synthetic eval ground truth. + +prof-cgu: rows in eval/ground_truth/synthetic.jsonl historically carried the +document-type subject form (e.g. ``estate will``) that the file's own header +prohibits. This one-shot tool reads each document's PDF, asks the model for a +2-4 word content subject via ``generate_eval_samples._subject_from_text``, and +rewrites that row's ``subject`` in place. + +Each row already carries a ``subject`` key, so the value is rewritten in +place; comment lines and every other field (e.g. ``entity``) are preserved +unchanged. The PDFs, filenames, dates, and vendors are untouched. + +Usage: + uv run --with pypdf python scripts/backfill_eval_subjects.py --dry-run + uv run --with pypdf python scripts/backfill_eval_subjects.py --max-cost-usd 3 +""" + +from __future__ import annotations + +import asyncio +import contextlib +import json +import sys +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import click + +if TYPE_CHECKING: + from collections import Counter + +sys.path.insert(0, str(Path(__file__).parent)) +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from generate_eval_samples import ( + MODEL_PRICING, + EmptyDocumentTextError, + _anthropic_client, + _estimate_cost, + _subject_from_text, + _write_text_atomic, +) + + +def _extract_text(pdf_path: Path) -> str: + """Return the concatenated text layer of a PDF (no OCR). + + The eval PDFs are reportlab-rendered from markdown and carry a clean text + layer, so a plain text-layer read is faster and lighter than Docling's + full-page-OCR loader and is sufficient to derive a short content subject. + """ + from pypdf import PdfReader + + reader = PdfReader(str(pdf_path)) + return "\n".join(page.extract_text() or "" for page in reader.pages) + + +def _data_rows(lines: list[str]) -> list[tuple[int, dict[str, Any]]]: + """Return (line_index, parsed_dict) for each JSONL data row.""" + rows: list[tuple[int, dict[str, Any]]] = [] + for i, line in enumerate(lines): + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + try: + obj = json.loads(stripped) + except json.JSONDecodeError: + continue + if isinstance(obj, dict) and "filename" in obj: + rows.append((i, obj)) + return rows + + +def _forbidden_form(obj: dict[str, Any]) -> str: + """The ``f'{category} {doctype}'`` form prof-cgu set out to eliminate.""" + cat = str(obj.get("category", "")).replace("_", " ") + dt = str(obj.get("doctype", "")).replace("_", " ") + return f"{cat} {dt}".strip() + + +async def _drain(tasks: list[asyncio.Task[Any]]) -> None: + """Cancel and await any tasks still pending, swallowing CancelledError. + + The in-flight HTTP calls Anthropic has already received will still bill + (asyncio cannot un-send a request), but the loop exits without + 'Task was destroyed but it is pending' warnings or unclean socket + teardown. + """ + for t in tasks: + if not t.done(): + t.cancel() + for t in tasks: + with contextlib.suppress(BaseException): + await t + + +async def _backfill( + *, + ground_truth: Path, + lines: list[str], + rows: list[tuple[int, dict[str, Any]]], + documents_dir: Path, + model: str, + concurrency: int, + max_cost_usd: float, +) -> int: + client = _anthropic_client(model, max_retries=3, timeout_seconds=120) + sem = asyncio.Semaphore(concurrency) + + async def _one(idx: int, obj: dict[str, Any]) -> tuple[int, str, Counter[str]]: + async with sem: + pdf = documents_dir / obj["filename"] + text = await asyncio.to_thread(_extract_text, pdf) + subject, usage = await _subject_from_text( + client, text, forbidden=_forbidden_form(obj) + ) + return idx, subject, usage + + tasks: list[asyncio.Task[Any]] = [ + asyncio.create_task(_one(i, obj)) for i, obj in rows + ] + + click.echo("# Subject backfill") + click.echo(f"# rows={len(rows)} model={model}") + click.echo("| filename | old subject | new subject | cost_usd |") + click.echo("|----------|-------------|-------------|----------|") + + # Collect first, apply at the end — separating the two means an abort + # cannot leave `lines` half-rewritten and a future refactor cannot ship + # a partial-write regression. + results: dict[int, str] = {} + cost = 0.0 + aborted = False + by_index = dict(rows) + + for coro in asyncio.as_completed(tasks): + try: + idx, subject, usage = await coro + except EmptyDocumentTextError as exc: + click.echo(f"ERROR: {exc}", err=True) + await _drain(tasks) + return 2 + except Exception as exc: + click.echo(f"ERROR: per-row task failed: {exc!r}", err=True) + await _drain(tasks) + return 3 + results[idx] = subject + call_cost = _estimate_cost(usage, model) + cost += call_cost + obj = by_index[idx] + old = obj.get("subject", "") + click.echo(f"| {obj['filename']} | {old} | {subject} | {call_cost:.5f} |") + if cost > max_cost_usd: + click.echo( + f"\nABORT: cumulative cost ${cost:.2f} exceeds " + f"--max-cost-usd ${max_cost_usd:.2f}; no file written.", + err=True, + ) + aborted = True + break + + await _drain(tasks) + if aborted: + return 1 + + for idx, subject in results.items(): + obj = by_index[idx] + obj["subject"] = subject # reassignment preserves key position + newline = "\n" if lines[idx].endswith("\n") else "" + lines[idx] = json.dumps(obj) + newline + _write_text_atomic(ground_truth, "".join(lines)) + + click.echo(f"\nsummary: rewritten={len(results)} cost_usd={cost:.4f}") + return 0 + + +@click.command() +@click.option("--ai-model", default="claude-sonnet-4-6", show_default=True) +@click.option("--concurrency", type=int, default=5, show_default=True) +@click.option("--max-cost-usd", type=float, default=6.00, show_default=True) +@click.option( + "--ground-truth", + type=click.Path(dir_okay=False, path_type=Path), + default=Path("eval/ground_truth/synthetic.jsonl"), + show_default=True, +) +@click.option( + "--documents-dir", + type=click.Path(file_okay=False, path_type=Path), + default=Path("eval/samples/synthetic"), + show_default=True, +) +@click.option("--dry-run", is_flag=True, default=False) +def main( + ai_model: str, + concurrency: int, + max_cost_usd: float, + ground_truth: Path, + documents_dir: Path, + dry_run: bool, +) -> None: + """Rewrite the `subject` of every synthetic eval row from its content.""" + if not dry_run and ai_model not in MODEL_PRICING: + click.echo( + f"ERROR: --ai-model {ai_model!r} has no pricing entry in " + f"MODEL_PRICING ({sorted(MODEL_PRICING)}); cost cap would be " + "silently disabled. Add pricing or pick a priced model.", + err=True, + ) + raise SystemExit(2) + + lines = ground_truth.read_text(encoding="utf-8").splitlines(keepends=True) + rows = _data_rows(lines) + missing = [ + obj["filename"] + for _, obj in rows + if not (documents_dir / obj["filename"]).exists() + ] + if missing: + click.echo(f"ERROR: {len(missing)} PDFs missing: {missing[:5]}", err=True) + raise SystemExit(2) + + if dry_run: + click.echo(f"# dry-run: {len(rows)} rows; no API calls or writes.") + click.echo("| filename | current subject |") + click.echo("|----------|-----------------|") + for _, obj in rows: + click.echo(f"| {obj['filename']} | {obj.get('subject', '')} |") + return + + exit_code = asyncio.run( + _backfill( + ground_truth=ground_truth, + lines=lines, + rows=rows, + documents_dir=documents_dir, + model=ai_model, + concurrency=concurrency, + max_cost_usd=max_cost_usd, + ) + ) + raise SystemExit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/scripts/generate_eval_samples.py b/scripts/generate_eval_samples.py index 1407839..1d149c3 100644 --- a/scripts/generate_eval_samples.py +++ b/scripts/generate_eval_samples.py @@ -21,6 +21,7 @@ import re import sys import tempfile +from collections import Counter from datetime import UTC, date, datetime, timedelta from pathlib import Path from typing import TYPE_CHECKING, Any, NamedTuple @@ -927,6 +928,19 @@ def flush_list() -> None: ) +def _write_text_atomic(path: Path, payload: str) -> None: + """Atomically write `payload` to `path` via a temp sibling + rename.""" + path.parent.mkdir(parents=True, exist_ok=True) + fd, tmp_path = tempfile.mkstemp(prefix=path.name + ".", dir=str(path.parent)) + try: + with os.fdopen(fd, "w", encoding="utf-8") as f: + f.write(payload) + Path(tmp_path).replace(path) + except Exception: + Path(tmp_path).unlink(missing_ok=True) + raise + + def _append_jsonl_atomic(row: GroundTruthRow, jsonl_path: Path) -> None: """Append one row by writing the full new file to a temp sibling and renaming.""" existing = jsonl_path.read_text() if jsonl_path.exists() else "" @@ -936,18 +950,7 @@ def _append_jsonl_atomic(row: GroundTruthRow, jsonl_path: Path) -> None: if existing.endswith("\n") or not existing else existing + "\n" + new_line ) - - jsonl_path.parent.mkdir(parents=True, exist_ok=True) - fd, tmp_path = tempfile.mkstemp( - prefix=jsonl_path.name + ".", dir=str(jsonl_path.parent) - ) - try: - with os.fdopen(fd, "w", encoding="utf-8") as f: - f.write(payload) - Path(tmp_path).replace(jsonl_path) - except Exception: - Path(tmp_path).unlink(missing_ok=True) - raise + _write_text_atomic(jsonl_path, payload) def _existing_filenames(output_dir: Path, jsonl_path: Path) -> set[str]: @@ -1097,8 +1100,92 @@ def _record(t: Triple) -> None: # -- Generation orchestration ------------------------------------------------ -def _subject_for(triple: Triple) -> str: - return f"{triple.category} {triple.doctype}".replace("_", " ") +SUBJECT_SYSTEM = ( + "You read a document and name its specific subject in 2-4 lowercase " + "words. The subject describes WHAT THE DOCUMENT IS ABOUT (its content, " + "parties, or topic), never the document type. For an auto insurance " + "policy, answer 'sedan collision coverage', not 'auto policy'. For a " + "utility bill, answer something like 'march electricity usage', not " + "'electric invoice'. Output ONLY the 2-4 word subject: no label, no " + "quotes, no punctuation, no explanation." +) + + +def _sanitize_subject(raw: str) -> str | None: + """Normalize a model reply into a 2-4 word lowercase content subject. + + Format only; the prompt owns the "content, not document type" semantics. + Returns None when the reply cannot satisfy the convention (empty, or a + single word), signaling the caller to retry. + """ + text = re.sub(r"^\s*subject\s*:\s*", "", raw.strip(), flags=re.IGNORECASE) + text = text.strip().strip("\"'") + text = re.sub(r"[^a-z0-9]+", " ", text.lower()).strip() + words = text.split() + if len(words) < 2: + return None + return " ".join(words[:4]) + + +_STOPWORDS = frozenset( + {"the", "and", "for", "this", "that", "with", "from", "your", "are", "was"} +) + + +def _fallback_subject(text: str) -> str | None: + """First content words of a document, or None if the text is too empty. + + Returns None instead of a literal placeholder when the document yields + fewer than two distinct content words; the caller decides how to surface + that signal rather than silently writing 'document content' into ground + truth (prof-cgu regression guard). + """ + picked = [w for w in re.findall(r"[a-z]{3,}", text.lower()) if w not in _STOPWORDS][ + :3 + ] + if len(picked) < 2: + return None + return " ".join(picked) + + +class EmptyDocumentTextError(ValueError): + """Raised when subject derivation has no document text to work with.""" + + +async def _subject_from_text( + client: ChatAnthropic, text: str, forbidden: str | None = None +) -> tuple[str, Counter[str]]: + """Derive a convention-compliant content subject from document text. + + Makes up to three model calls, sanitizing each reply. Replies matching + ``forbidden`` (the row's ``f"{category} {doctype}"`` form) trigger a retry + — the prof-cgu invariant has a structural guard, not just a prompt. On + retry exhaustion, falls back to the document's first content words; if + even that is empty, raises ``EmptyDocumentTextError`` so the caller can + surface a real signal instead of silently writing a placeholder. + Returns (subject, cumulative_usage). + """ + excerpt = text.strip()[:2000] + total: Counter[str] = Counter() + if not excerpt: + fallback = _fallback_subject(text) + if fallback is None: + raise EmptyDocumentTextError("document text is empty") + return fallback, total + user = f"Document:\n\n{excerpt}\n\nSubject:" + for _ in range(3): + reply, usage = await _call_anthropic_async(client, SUBJECT_SYSTEM, user) + total.update(usage) + subject = _sanitize_subject(reply) + if subject is None: + continue + if forbidden is not None and subject == forbidden: + continue + return subject, total + fallback = _fallback_subject(text) + if fallback is None: + raise EmptyDocumentTextError("document text yields no content subject") + return fallback, total def _length_target(triple: Triple) -> int: @@ -1148,6 +1235,21 @@ async def _generate_one( logger.error("sanitize_exhausted", filename=filename) return None + # Derive subject from generated content BEFORE writing the PDF + # (prof-cgu): otherwise an exception here would orphan the PDF on disk + # and skip_filenames would poison it as "done" on the next run. The + # subject call's tokens are folded into this row's usage so the cost + # cap and per-row cost column reflect real spend. + forbidden = f"{triple.category} {triple.doctype}".replace("_", " ") + try: + subject, subject_usage = await _subject_from_text( + client, text, forbidden=forbidden + ) + except EmptyDocumentTextError: + logger.error("subject_empty_text", filename=filename) + return None + usage = dict(Counter(usage) + subject_usage) + out_path = output_dir / filename output_dir.mkdir(parents=True, exist_ok=True) _render_pdf(text, out_path) @@ -1159,7 +1261,7 @@ async def _generate_one( doctype=triple.doctype, vendor=vendor, date=doc_date.strftime("%Y%m%d"), - subject=_subject_for(triple), + subject=subject, notes=f"synthetic, generated {datetime.now(UTC).date().isoformat()}", ) diff --git a/tests/test_generate_eval_subjects.py b/tests/test_generate_eval_subjects.py new file mode 100644 index 0000000..d3f424d --- /dev/null +++ b/tests/test_generate_eval_subjects.py @@ -0,0 +1,128 @@ +"""Tests for content-subject derivation in scripts/generate_eval_samples.py. + +Covers the pure sanitizer that turns a model's free-text reply into a +convention-compliant ``subject`` (2-4 lowercase words describing content). +Guards prof-cgu: the generator must never again emit the document-type form +``f"{category} {doctype}"``. +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +generate_eval_samples = pytest.importorskip("generate_eval_samples") +_sanitize_subject = generate_eval_samples._sanitize_subject +_fallback_subject = generate_eval_samples._fallback_subject +_subject_from_text = generate_eval_samples._subject_from_text +EmptyDocumentTextError = generate_eval_samples.EmptyDocumentTextError + + +def test_lowercases_and_trims() -> None: + assert ( + _sanitize_subject(" Mortgage Refinance Offer ") == "mortgage refinance offer" + ) + + +def test_strips_surrounding_quotes_and_trailing_period() -> None: + assert _sanitize_subject('"account summary."') == "account summary" + + +def test_strips_period_glued_to_last_word() -> None: + assert _sanitize_subject("renewal policy.") == "renewal policy" + + +def test_drops_leading_subject_label() -> None: + assert _sanitize_subject("Subject: property tax notice") == "property tax notice" + + +def test_truncates_to_four_words() -> None: + assert ( + _sanitize_subject("annual home insurance renewal policy declaration") + == "annual home insurance renewal" + ) + + +def test_rejects_single_word() -> None: + # One word cannot satisfy the 2-4 word convention; signal a retry. + assert _sanitize_subject("invoice") is None + + +def test_rejects_empty() -> None: + assert _sanitize_subject(" ") is None + + +def test_collapses_internal_whitespace_and_punctuation() -> None: + assert _sanitize_subject("vehicle loan, agreement") == "vehicle loan agreement" + + +def test_fallback_returns_none_on_empty_text() -> None: + # No content words -> caller decides how to surface the signal; the + # fallback never silently writes a placeholder. + assert _fallback_subject("") is None + assert _fallback_subject("the and for this that") is None + + +def test_fallback_returns_first_content_words() -> None: + assert _fallback_subject("Margaret Voss estate documents") == "margaret voss estate" + + +class _FakeClient: + """Minimal stub for ``_call_anthropic_async``'s client argument.""" + + def __init__(self, replies: list[str]) -> None: + self._replies = list(replies) + + def take(self) -> str: + return self._replies.pop(0) + + +async def _fake_call( + client: _FakeClient, system: str, user: str +) -> tuple[str, dict[str, int]]: + return client.take(), { + "input_tokens": 10, + "output_tokens": 3, + "cache_read_tokens": 0, + "cache_write_tokens": 0, + } + + +@pytest.mark.asyncio +async def test_subject_from_text_retries_when_reply_matches_forbidden( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(generate_eval_samples, "_call_anthropic_async", _fake_call) + client = _FakeClient(["auto policy", "sedan collision coverage"]) + subject, _ = await _subject_from_text( + client, # type: ignore[arg-type] + "Auto policy declarations and coverage details for a 2019 sedan.", + forbidden="auto policy", + ) + # First reply matches the forbidden category+doctype form, so the loop + # must retry and accept the second reply. + assert subject == "sedan collision coverage" + + +@pytest.mark.asyncio +async def test_subject_from_text_raises_on_empty_document( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(generate_eval_samples, "_call_anthropic_async", _fake_call) + client = _FakeClient([]) # no calls should happen + with pytest.raises(EmptyDocumentTextError): + await _subject_from_text(client, " \n ") # type: ignore[arg-type] + + +def _forbidden_form(obj: dict[str, Any]) -> str: + """Mirror of backfill_eval_subjects._forbidden_form (avoid import dep).""" + cat = str(obj.get("category", "")).replace("_", " ") + dt = str(obj.get("doctype", "")).replace("_", " ") + return f"{cat} {dt}".strip() + + +def test_forbidden_form_normalizes_underscores() -> None: + assert _forbidden_form({"category": "financial_aid", "doctype": "agreements"}) == ( + "financial aid agreements" + )