vineethcv · vineethcv · Apr 8, 2026 · Apr 6, 2026 · Apr 6, 2026 · Apr 6, 2026
diff --git a/README.md b/README.md
diff --git a/baselines/retail_support/.gitkeep b/baselines/retail_support/.gitkeep
diff --git a/baselines/retail_support/README.md b/baselines/retail_support/README.md
@@ -0,0 +1,7 @@
+# Retail Support Baselines
+
+This folder stores baseline result snapshots for the retail support example.
+
+Suggested naming:
+- `baseline_results_mock.json`
+- `baseline_results_openai_<model>.json`
diff --git a/baselines/wine_recommendation/.gitkeep b/baselines/wine_recommendation/.gitkeep
diff --git a/baselines/wine_recommendation/README.md b/baselines/wine_recommendation/README.md
@@ -0,0 +1,7 @@
+# Wine Recommendation Baselines
+
+This folder stores baseline result snapshots for the wine recommendation example.
+
+Suggested naming:
+- `baseline_results_mock.json`
+- `baseline_results_openai_<model>.json`
diff --git a/configs/tasks/wine.yaml b/configs/tasks/wine.yaml
@@ -1,8 +1,6 @@
 name: wine_recommendation
-
-dataset_path: dataset.json
-rubric_path: rubric.json
-
+dataset_path: examples/wine_recommendation/dataset.json
+rubric_path: examples/wine_recommendation/rubric.json
 description: "Wine recommendation evaluation task (V1 reference task)"
 
 system_config: configs/systems/openai_wine.yaml

diff --git a/examples/README.md b/examples/README.md
diff --git a/examples/retail_support/README.md b/examples/retail_support/README.md
@@ -0,0 +1,37 @@
+# Retail Support Example
+
+This example demonstrates a multi-purpose evaluation setup for:
+
+- recommendation tasks
+- customer support responses
+- retrieval-grounded answers (RAG-style)
+- simple agent workflows (tool usage simulation)
+- structured output validation
+
+## Purpose
+
+This example is designed to showcase the flexibility of the eval engine beyond a single domain.
+
+It combines multiple real-world assistant behaviors into a single evaluation pack.
+
+## Task Categories
+
+- recommendation
+- support_policy
+- order_support
+- retrieval_grounded
+- agent_workflow
+
+## Data Files
+
+- `dataset.json` — evaluation cases across multiple categories
+- `rubric.json` — scoring logic and critical gates
+- `knowledge_base.json` — policy and support documents
+- `catalog.json` — product data for recommendations
+- `orders.json` — mock order data
+- `tool_scenarios.json` — simulated tool responses
+- `expected_outputs.json` — deterministic evaluation hints
+
+## Status
+
+Initial scaffold only. Logic will be wired in upcoming commits.
diff --git a/examples/retail_support/catalog.json b/examples/retail_support/catalog.json
@@ -0,0 +1,56 @@
+[
+  {
+    "sku": "JKT_001",
+    "name": "NorthTrail RainShell",
+    "category": "jacket",
+    "price_eur": 129,
+    "waterproof": true,
+    "use_case": ["hiking", "commute", "rain"],
+    "notes": "Lightweight waterproof shell suited for wet and windy conditions."
+  },
+  {
+    "sku": "JKT_002",
+    "name": "Fjord Trek Pro",
+    "category": "jacket",
+    "price_eur": 179,
+    "waterproof": true,
+    "use_case": ["hiking", "mountain"],
+    "notes": "Durable trekking shell with stronger weather protection but above tighter budget ranges."
+  },
+  {
+    "sku": "JKT_003",
+    "name": "NordLite PackShell",
+    "category": "jacket",
+    "price_eur": 99,
+    "waterproof": true,
+    "use_case": ["hiking", "travel"],
+    "notes": "Packable waterproof jacket for occasional hikes and everyday use."
+  },
+  {
+    "sku": "JKT_004",
+    "name": "Harbor Softshell",
+    "category": "jacket",
+    "price_eur": 89,
+    "waterproof": false,
+    "use_case": ["commute", "casual"],
+    "notes": "Comfortable softshell with light weather resistance, not fully waterproof."
+  },
+  {
+    "sku": "TENT_001",
+    "name": "WindRidge 2",
+    "category": "tent",
+    "price_eur": 210,
+    "waterproof": true,
+    "use_case": ["camping", "windy_weather", "2_person"],
+    "notes": "Two-person tent with strong pole structure for windy conditions."
+  },
+  {
+    "sku": "STOVE_001",
+    "name": "TrekLite Stove",
+    "category": "stove",
+    "price_eur": 59,
+    "waterproof": false,
+    "use_case": ["camping", "cooking"],
+    "notes": "Compact backpacking stove with 2-year limited warranty."
+  }
+]
diff --git a/examples/retail_support/dataset.json b/examples/retail_support/dataset.json
@@ -0,0 +1,98 @@
+[
+  {
+    "id": "SUP_001",
+    "category": "support_policy",
+    "query": "Can I return hiking shoes if I wore them outside once?",
+    "context_refs": ["policy_returns_worn_items"],
+    "expected_tools": [],
+    "expected_schema": "support_answer"
+  },
+  {
+    "id": "SUP_002",
+    "category": "order_support",
+    "query": "Where is my order ORD-1002?",
+    "context_refs": [],
+    "expected_tools": ["lookup_order"],
+    "expected_schema": "support_answer"
+  },
+  {
+    "id": "SUP_003",
+    "category": "recommendation",
+    "query": "Recommend 3 waterproof jackets under 150 euros for hiking.",
+    "context_refs": [],
+    "expected_tools": ["search_catalog"],
+    "expected_schema": "recommendation_answer"
+  },
+  {
+    "id": "SUP_004",
+    "category": "agent_workflow",
+    "query": "Cancel my order ORD-1005 if it has not shipped yet.",
+    "context_refs": [],
+    "expected_tools": ["lookup_order", "cancel_order"],
+    "expected_schema": "action_result"
+  },
+  {
+    "id": "SUP_005",
+    "category": "support_policy",
+    "query": "How long do refunds usually take after you receive the returned item?",
+    "context_refs": ["policy_refunds"],
+    "expected_tools": [],
+    "expected_schema": "support_answer"
+  },
+  {
+    "id": "SUP_006",
+    "category": "order_support",
+    "query": "My order ORD-1008 has been processing for several days. What should I do?",
+    "context_refs": ["policy_shipping_delay"],
+    "expected_tools": ["lookup_order"],
+    "expected_schema": "support_answer"
+  },
+  {
+    "id": "SUP_007",
+    "category": "recommendation",
+    "query": "Recommend a jacket for rainy weather in Denmark with a budget of 100 euros.",
+    "context_refs": [],
+    "expected_tools": ["search_catalog"],
+    "expected_schema": "recommendation_answer"
+  },
+  {
+    "id": "SUP_008",
+    "category": "agent_workflow",
+    "query": "Cancel order ORD-1002 for me.",
+    "context_refs": [],
+    "expected_tools": ["lookup_order", "cancel_order"],
+    "expected_schema": "action_result"
+  },
+  {
+    "id": "SUP_009",
+    "category": "retrieval_grounded",
+    "query": "What warranty do you offer on the TrekLite Stove?",
+    "context_refs": ["policy_warranty"],
+    "expected_tools": [],
+    "expected_schema": "support_answer"
+  },
+  {
+    "id": "SUP_010",
+    "category": "recommendation",
+    "query": "Which tent would you suggest for 2 people in windy weather?",
+    "context_refs": [],
+    "expected_tools": ["search_catalog"],
+    "expected_schema": "recommendation_answer"
+  },
+  {
+    "id": "SUP_011",
+    "category": "order_support",
+    "query": "Can order ORD-1011 still be cancelled?",
+    "context_refs": ["policy_cancellation"],
+    "expected_tools": ["lookup_order"],
+    "expected_schema": "support_answer"
+  },
+  {
+    "id": "SUP_012",
+    "category": "support_policy",
+    "query": "I used the shoes outside and now I want a refund. Is that allowed if they are not faulty?",
+    "context_refs": ["policy_returns_worn_items"],
+    "expected_tools": [],
+    "expected_schema": "support_answer"
+  }
+]
diff --git a/examples/retail_support/expected_outputs.json b/examples/retail_support/expected_outputs.json
@@ -0,0 +1,34 @@
+{
+  "SUP_001": {
+    "must_include_any": ["not eligible", "used outdoors"],
+    "must_not_include": ["full refund guaranteed"],
+    "required_context_ids": ["policy_returns_worn_items"]
+  },
+  "SUP_002": {
+    "required_tools": ["lookup_order"]
+  },
+  "SUP_004": {
+    "required_tools": ["lookup_order", "cancel_order"],
+    "expected_action_success": false
+  },
+  "SUP_005": {
+    "must_include_any": ["5 to 7 business days"],
+    "required_context_ids": ["policy_refunds"]
+  },
+  "SUP_006": {
+    "must_include_any": ["contact support", "manual check"],
+    "required_context_ids": ["policy_shipping_delay"]
+  },
+  "SUP_008": {
+    "required_tools": ["lookup_order", "cancel_order"],
+    "expected_action_success": true
+  },
+  "SUP_009": {
+    "must_include_any": ["2-year", "warranty"],
+    "required_context_ids": ["policy_warranty"]
+  },
+  "SUP_011": {
+    "must_include_any": ["cannot be cancelled"],
+    "required_context_ids": ["policy_cancellation"]
+  }
+}
diff --git a/examples/retail_support/knowledge_base.json b/examples/retail_support/knowledge_base.json
@@ -0,0 +1,27 @@
+[
+  {
+    "id": "policy_returns_worn_items",
+    "title": "Returns for worn footwear",
+    "text": "Footwear used outdoors is not eligible for return unless the item is faulty."
+  },
+  {
+    "id": "policy_cancellation",
+    "title": "Order cancellation policy",
+    "text": "Orders can be cancelled only before the shipment status changes to shipped."
+  },
+  {
+    "id": "policy_refunds",
+    "title": "Refund processing policy",
+    "text": "Approved refunds are processed within 5 to 7 business days after the returned item is received and inspected."
+  },
+  {
+    "id": "policy_shipping_delay",
+    "title": "Shipping delay guidance",
+    "text": "If an order remains in processing for more than 3 business days, the customer should be advised to contact support for a manual check."
+  },
+  {
+    "id": "policy_warranty",
+    "title": "Warranty policy",
+    "text": "Outdoor stoves and technical gear include a 2-year limited warranty covering manufacturing defects only."
+  }
+]
diff --git a/examples/retail_support/orders.json b/examples/retail_support/orders.json
@@ -0,0 +1,30 @@
+[
+  {
+    "order_id": "ORD-1002",
+    "status": "processing",
+    "items": ["JKT_001"],
+    "can_cancel": true,
+    "days_in_status": 2
+  },
+  {
+    "order_id": "ORD-1005",
+    "status": "shipped",
+    "items": ["JKT_003"],
+    "can_cancel": false,
+    "days_in_status": 1
+  },
+  {
+    "order_id": "ORD-1008",
+    "status": "processing",
+    "items": ["STOVE_001"],
+    "can_cancel": true,
+    "days_in_status": 5
+  },
+  {
+    "order_id": "ORD-1011",
+    "status": "delivered",
+    "items": ["TENT_001"],
+    "can_cancel": false,
+    "days_in_status": 0
+  }
+]
diff --git a/examples/retail_support/rubric.json b/examples/retail_support/rubric.json
@@ -0,0 +1,21 @@
+{
+  "version": "v1.1",
+  "weights": {
+    "instruction_match": 0.25,
+    "grounding_accuracy": 0.25,
+    "tool_use_correctness": 0.20,
+    "resolution_helpfulness": 0.20,
+    "tone_clarity": 0.10
+  },
+  "thresholds": {
+    "pass": 4.0,
+    "warn": 3.0
+  },
+  "critical_gates": [
+    "no_policy_hallucination",
+    "no_false_action_claim",
+    "respect_constraints",
+    "valid_output_schema",
+    "no_invented_order_status"
+  ]
+}
diff --git a/examples/retail_support/task_config.yaml b/examples/retail_support/task_config.yaml
@@ -0,0 +1,30 @@
+task_name: retail_support
+task_type: mixed
+description: "Retail support evaluation task covering recommendation, support, retrieval, and simple workflow cases"
+
+dataset_path: examples/retail_support/dataset.json
+rubric_path: examples/retail_support/rubric.json
+
+knowledge_base_path: examples/retail_support/knowledge_base.json
+catalog_path: examples/retail_support/catalog.json
+orders_path: examples/retail_support/orders.json
+tool_scenarios_path: examples/retail_support/tool_scenarios.json
+expected_outputs_path: examples/retail_support/expected_outputs.json
+
+system_config: configs/systems/openai_wine.yaml
+judge_config: configs/judges/default_ensemble.yaml
+
+mock_response_profile:
+  type: retail_support
+
+evaluation:
+  rubric:
+    source: task_local
+  thresholds:
+    source: rubric
+  judge_ensemble:
+    source: task_judge_config
+
+output_schema_mode: disabled
+tool_simulation_mode: disabled
+judge_mode: disabled