42 changes: 42 additions & 0 deletions environments/careqa_mcq/README.md
@@ -0,0 +1,42 @@
# careqa_mcq

Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) multiple-choice dataset.

### Overview
- **Environment ID**: `careqa_mcq`
- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the MCQs only.
- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, single-turn

### Datasets
- **Primary dataset(s)**:
- `CareQA_en` – multiple-choice clinical questions with 4 options and correct answer labels.
- **Source links**:
- [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA)
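Each row carries the question text, four options in the fields `op1`–`op4`, and a 1-indexed correct-option field `cop`. A minimal sketch of the gold-letter mapping this environment applies to each row:

```python
def gold_letter(cop: int) -> str:
    """Map CareQA's 1-indexed `cop` field to an option letter (A-D)."""
    return ["A", "B", "C", "D"][cop - 1]
```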

### Task
- **Type**: single-turn
- **Parser**: custom prompt mapping (no structured markup)
- **Rubric overview**:
**MCQ (`closed_mcq`)**: `vf.Rubric()` measuring **accuracy** (letter match).
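The accuracy criterion amounts to comparing the first standalone option letter in the model's reply against the gold letter. A minimal sketch (the actual reward function lives in `careqa_mcq.py`):

```python
import re

def mcq_accuracy(completion: str, answer: str) -> float:
    """1.0 if the first standalone A-D letter in the reply matches the gold letter."""
    match = re.search(r"\b([A-D])\b", completion.upper())
    return 1.0 if match and match.group(1) == answer.upper() else 0.0
```

The word-boundary match tolerates replies like "The answer is C." without being fooled by the "A" inside "ANSWER".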

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval careqa_mcq
```

Configure model and sampling:

```bash
uv run vf-eval careqa_mcq --model gpt-4.1-mini --num-examples 3 -s
```

### Metrics

| Metric | Meaning |
|---------------|---------|
| `reward` | Main scalar reward (weighted sum of rubric criteria) |
| `accuracy` | Exact match on target MCQ answer (letter A–D) |


78 changes: 78 additions & 0 deletions environments/careqa_mcq/careqa_mcq.py
@@ -0,0 +1,78 @@
from __future__ import annotations

import re
from typing import Any, Optional

from datasets import load_dataset
import verifiers as vf


# Helper Functions

def _get_text_from_completion(completion: Any) -> str:
    """Extract plain text from a string or chat-message-list completion."""
    if isinstance(completion, str):
        return completion.strip()
    if isinstance(completion, list) and completion:
        last = completion[-1]
        if isinstance(last, dict):
            return str(last.get("content", "")).strip()
        return str(last).strip()
    return str(completion).strip()


def _first_letter(text: str) -> Optional[str]:
    """Extract the first standalone option letter (A-D).

    A word-boundary match avoids false positives such as the "A" in
    "ANSWER" when a model replies with more than the bare letter.
    """
    match = re.search(r"\b([A-D])\b", (text or "").upper())
    return match.group(1) if match else None

# Prompt Construction

def _build_prompt(question: str, options: dict[str, str]) -> str:
"""Create an MCQ prompt."""
formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items())
letters = ", ".join(options.keys())
return (
"You are a board-certified clinician taking a medical reasoning test.\n"
"Read the following question carefully and choose the most appropriate answer.\n\n"
f"Question:\n{question.strip()}\n\n"
f"Options:\n{formatted_opts}\n\n"
f"Respond with only the option letter ({letters}), nothing else."
)

# Main Environment

def load_environment(split: str = "test") -> vf.Environment:
"""
CareQA multiple-choice evaluation environment.
Uses vf.SingleTurnEnv + MCQ accuracy rubric.
"""
    ds = load_dataset("HPAI-BSC/CareQA", "CareQA_en", split=split)

def _map(ex):
options = {"A": ex["op1"], "B": ex["op2"], "C": ex["op3"], "D": ex["op4"]}
gold_letter = ["A", "B", "C", "D"][ex["cop"] - 1]
        # Format the prompt as a chat message list ({role, content} dicts)
return {
"prompt": [
{
"role": "user",
"content": _build_prompt(ex["question"], options)
}
],
"answer": gold_letter,
}

mapped = ds.map(_map, remove_columns=ds.column_names)

def mcq_accuracy(completion, answer):
pred = _first_letter(_get_text_from_completion(completion))
return 1.0 if pred == str(answer).upper() else 0.0

rubric = vf.Rubric(funcs=[mcq_accuracy], weights=[1.0])

return vf.SingleTurnEnv(
dataset=mapped,
eval_dataset=mapped,
rubric=rubric,
system_prompt=None,
)
22 changes: 22 additions & 0 deletions environments/careqa_mcq/pyproject.toml
@@ -0,0 +1,22 @@
[project]
name = "careqa_mcq"
description = "Evaluation environment for the HPAI-BSC/CareQA MCQ dataset"
tags = ["healthcare", "medical-qa", "mcq", "clinical", "single-turn"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"verifiers>=0.1.4",
"datasets>=2.13.0"
]

[tool.prime.environment]
loader = "careqa_mcq:load_environment"
display_name = "CareQA MCQ"
visibility = "PUBLIC"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["careqa_mcq.py"]
42 changes: 42 additions & 0 deletions environments/careqa_openended/README.md
@@ -0,0 +1,42 @@
# careqa_openended

Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) open-ended dataset.

### Overview
- **Environment ID**: `careqa_openended`
- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the open-ended questions only.
- **Tags**: healthcare, medical QA, clinical reasoning, single-turn

### Datasets
- **Primary dataset(s)**:
- `CareQA_en_open` – open-ended clinical questions with reference answers.
- **Source links**:
- [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA)

### Task
- **Type**: single-turn
- **Parser**: custom prompt mapping (no structured markup)
- **Rubric overview**:
**Open-ended (`open_clinical`)**: `vf.JudgeRubric()` using an LLM-as-judge to score free-text answers for correctness and clinical reasoning.
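The exact judge prompt and score parsing are defined by `vf.JudgeRubric`; the flow can be illustrated with a hypothetical prompt builder (the field names and wording below are assumptions for illustration only, not the verifiers API):

```python
def build_judge_prompt(question: str, reference: str, candidate: str) -> str:
    """Illustrative LLM-as-judge prompt: ask a grader model to score a
    free-text answer against the reference (hypothetical sketch)."""
    return (
        "You are grading a clinical QA answer.\n"
        f"Question: {question}\n"
        f"Reference answer: {reference}\n"
        f"Candidate answer: {candidate}\n"
        "Reply with a single score between 0 and 1 for correctness "
        "and clinical reasoning."
    )
```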

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval careqa_openended
```

Configure model and sampling:

```bash
uv run vf-eval careqa_openended --model gpt-4.1-mini --num-examples 3 -s
```

### Metrics

| Metric | Meaning |
|---------------|---------|
| `reward` | Main scalar reward (weighted sum of rubric criteria) |
| `judge_score` | For open-ended questions, LLM-assigned score evaluating answer quality, correctness, and clinical reasoning |


36 changes: 36 additions & 0 deletions environments/careqa_openended/careqa_openended.py
@@ -0,0 +1,36 @@
from __future__ import annotations
from datasets import load_dataset
import verifiers as vf

# Load Open-Ended Environment

def load_environment(split: str = "test") -> vf.SingleTurnEnv:
    ds = load_dataset("HPAI-BSC/CareQA", "CareQA_en_open", split=split)

def _map(ex):
system_content = "You are an expert clinician answering medical questions."

user_content = (
"Read the following question carefully and provide a detailed, concise answer.\n\n"
f"Question:\n{ex['question'].strip()}\n\n"
"Answer:"
)

return {
"prompt": [
{"role": "system", "content": system_content},
{"role": "user", "content": user_content},
],
"answer": ex.get("answer_explanation", ex.get("answer", "")),
}

mapped = ds.map(_map, remove_columns=ds.column_names)

rubric = vf.JudgeRubric()

return vf.SingleTurnEnv(
dataset=mapped,
eval_dataset=mapped,
rubric=rubric,
system_prompt=None,
)
22 changes: 22 additions & 0 deletions environments/careqa_openended/pyproject.toml
@@ -0,0 +1,22 @@
[project]
name = "careqa_openended"
description = "Evaluation environment for the HPAI-BSC/CareQA open-ended dataset"
tags = ["healthcare", "medical-qa", "open-ended", "clinical", "single-turn"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"verifiers>=0.1.4",
"datasets>=2.13.0"
]

[tool.prime.environment]
loader = "careqa_openended:load_environment"
display_name = "CareQA Open-Ended"
visibility = "PUBLIC"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["careqa_openended.py"]