XortexAI · ved015 · May 29, 2026 · May 29, 2026 · May 29, 2026
@@ -56,7 +56,16 @@ tests/
 !tests/
 !tests/**/*.py
 benchmarks/
+!benchmarks/
+!benchmarks/README.md
 LongMemEval/
+!benchmarks/longmemeval/
+!benchmarks/longmemeval/**
+benchmarks/longmemeval/**/__pycache__/
+benchmarks/longmemeval/**/*.pyc
+benchmarks/longmemeval/data/
+benchmarks/longmemeval/results/
+benchmarks/longmemeval/outputs/
 backboard/
 rust/
 

@@ -0,0 +1,9 @@
+# XMem Benchmarks
+
+This directory contains benchmark harnesses for XMem.
+
+- `longmemeval/`: Python-only LongMemEval benchmark runner targeting the XMem HTTP API.
+
+Benchmark runs can create large dataset and result artifacts. Keep those files under
+`benchmarks/longmemeval/data`, `benchmarks/longmemeval/results`, or
+`benchmarks/longmemeval/outputs`; those paths are intentionally ignored by git.
@@ -0,0 +1,166 @@
+# LongMemEval Benchmark for XMem Python
+
+This harness benchmarks the Python XMem service only. It targets the deployed
+Python API at `https://api.xmem.in` by default and does not run or compare the
+Go implementation.
+
+LongMemEval evaluates long-term conversational memory across multi-session
+recall, temporal reasoning, single-session recall, knowledge updates, and
+preference tracking. The harness follows the same broad structure used by
+open-source memory-layer benchmarks: load dataset records, ingest the haystack
+conversation history into an isolated user namespace, retrieve an answer for
+the benchmark question, write predictions, and compute lightweight local
+metrics for quick iteration.
+
+## Files
+
+- `dataset.py`: Loads JSON/JSONL LongMemEval records and converts sessions to
+  XMem conversation-turn ingest payloads.
+- `client.py`: Async HTTP client for the Python XMem API.
+- `runner.py`: Benchmark orchestration, batching, polling, resume support, and
+  output writing.
+- `metrics.py`: Local exact-match, contains, and token-F1 metrics plus summary
+  aggregation.
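+- `run.py`: CLI entrypoint.
+- `run_all_categories.py`: Launcher that runs each `question_type` category as a
+  child process and merges the predictions into one file.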
+
+## Secrets
+
+Do not commit API keys or provider credentials.
+
+To generate XMem predictions, set an XMem API key:
+
+```bash
+export XMEM_API_KEY="..."
+```
+
+Use `--api-key-env` if your local environment uses a different variable name.
+
+To score predictions with the official LongMemEval LLM-as-judge evaluator, set
+an OpenAI API key before running the evaluator:
+
+```bash
+export OPENAI_API_KEY="..."
+```
+
+## Run a Smoke Check
+
+Validate dataset parsing and payload construction without calling the service:
+
+```bash
+python -m benchmarks.longmemeval.run \
+  --download \
+  --dry-run \
+  --limit 2
+```
+
+Validate all six official categories without requiring an API key:
+
+```bash
+python -m benchmarks.longmemeval.run_all_categories \
+  --download \
+  --dry-run
+```
+
+If the dataset is already available locally:
+
+```bash
+python -m benchmarks.longmemeval.run \
+  --dataset-path benchmarks/longmemeval/data/longmemeval_s_cleaned.json \
+  --dry-run \
+  --limit 2
+```
+
+## Run Against the Python API
+
+```bash
+export XMEM_API_KEY="..."
+
+python -m benchmarks.longmemeval.run \
+  --download \
+  --api-base-url https://api.xmem.in \
+  --limit 10 \
+  --batch-size 25 \
+  --output-dir benchmarks/longmemeval/results/run-001
+```
+
+The runner writes:
+
+- `results.jsonl`: Full per-example benchmark records.
+- `predictions.jsonl`: Official prediction file with only `question_id` and
+  `hypothesis`.
+- `summary.json`: Aggregate local metrics and latency.
+
+The local metrics are intended for fast development feedback. For
+publication-quality reporting, run the generated `predictions.jsonl` through the
+official LongMemEval evaluation flow or an agreed LLM-as-judge rubric, using the
+same model and settings across systems.
+
+The benchmark runner itself only needs `XMEM_API_KEY` because it generates XMem
+answers. The official (or an equivalent) evaluator is a separate scoring step and
+needs `OPENAI_API_KEY` when using an OpenAI judge model.
+
+## Run All Official Categories
+
+The dataset has six `question_type` categories. Each example has a unique
+`question_id` and its own haystack sessions, and this runner isolates each
+question into a separate XMem user namespace. Running categories in parallel
+therefore cannot leak stored memories from one question into another; the only
+practical constraint is API throughput and rate limiting.
+
+```bash
+export XMEM_API_KEY="..."
+
+python -m benchmarks.longmemeval.run_all_categories \
+  --dataset-path benchmarks/longmemeval/data/longmemeval_s_cleaned.json \
+  --api-base-url https://api.xmem.in \
+  --output-root benchmarks/longmemeval/results/full-six-categories \
+  --max-parallel-categories 6
+```
+
+The all-category runner prints live processed/left/ETA status and writes one
+official merged prediction file at:
+
+```text
+benchmarks/longmemeval/results/full-six-categories/predictions.jsonl
+```
+
+Each category also gets a `runner.log` file under its output directory. If a
+category process fails, the launcher prints the failing category, exit code, log
+path, and the most recent child-process output.
+
+## Useful Options
+
+- `--limit N`: Run a small subset first.
+- `--offset N`: Skip the first N selected examples.
+- `--question-type TYPE`: Filter to one LongMemEval category.
+- `--skip-ingest`: Reuse already-ingested user namespaces and only retrieve.
+- `--no-resume`: Re-run examples even if they already exist in `results.jsonl`.
+- `--ingest-api-version v1`: Use synchronous batch ingestion instead of the
+  default durable `/v2/memory/batch-ingest` path.
+- `--effort-level high`: Use high-effort XMem ingestion for long records.
+- `--dry-run`: Validate dataset/category setup without API calls.
+- `--verbose`: Print child runner output while the all-category launcher runs.
+
+## Expected Failures
+
+The harness raises these errors deliberately; each one points to a fix:
+
+- `Dataset file not found`: run with `--download`, or pass `--dataset-path`.
+- `Missing API key`: set `XMEM_API_KEY`, or pass `--api-key-env` for a custom
+  variable name.
+- Official evaluator authentication errors: set `OPENAI_API_KEY` before running
+  the LongMemEval scoring step.
+- `Failed to download the LongMemEval dataset`: check network access, then retry
+  or download the dataset manually.
+- `<category> failed with exit code ...`: inspect that category's `runner.log`.
+
+## Isolation Model
+
+Each example is ingested into a user id derived from:
+
+```text
+<user-prefix>-<question-id>
+```
+
+This prevents facts from one benchmark question from leaking into another. Use a
+new `--user-prefix` for fully fresh runs.
@@ -0,0 +1 @@
+"""LongMemEval benchmark harness for the Python XMem API."""
@@ -0,0 +1,123 @@
+"""HTTP client for the Python XMem API used by the benchmark."""
+
+from __future__ import annotations
+
+import asyncio
+import time
+from dataclasses import dataclass
+from typing import Any
+
+import httpx
+
+
+# Job statuses at which `XMemApiClient.poll_job` stops polling and returns the
+# last status payload. Both values are returned the same way (no exception), so
+# callers have to inspect the status themselves.
+# NOTE(review): "dead_letter" presumably marks a permanently failed job — confirm
+# against the API, and confirm no other terminal status (e.g. "failed") exists.
+TERMINAL_JOB_STATUSES = {"succeeded", "dead_letter"}
+
+
+@dataclass(frozen=True)
+class ApiCallResult:
+    """Immutable outcome of one XMem API call: response payload plus timing."""
+
+    # The "data" field of the JSON response body, normalized to a dict by
+    # `XMemApiClient._request` (missing -> {}, non-dict -> {"value": ...}).
+    data: dict[str, Any]
+    # Wall-clock duration of the call in milliseconds, rounded to two decimals.
+    # Measured across all attempts, so it includes retries and backoff sleeps.
+    elapsed_ms: float
+
+
+class XMemApiClient:
+    """Small async client around the deployed Python XMem API.
+
+    Wraps a single `httpx.AsyncClient` that carries the bearer token and JSON
+    headers. Every call goes through `_request`, which retries failed attempts
+    and unwraps the response envelope into an `ApiCallResult`.
+
+    Usable as an async context manager; leaving the context closes the
+    underlying HTTP client.
+    """
+
+    def __init__(
+        self,
+        *,
+        base_url: str,
+        api_key: str,
+        timeout_seconds: float = 120.0,
+        max_retries: int = 3,
+        retry_backoff_seconds: float = 2.0,
+    ) -> None:
+        """Create the client and its underlying `httpx.AsyncClient`.
+
+        Args:
+            base_url: API root; trailing slashes are stripped.
+            api_key: Sent as `Authorization: Bearer <api_key>` on every request.
+            timeout_seconds: Passed to `httpx.Timeout` for every request.
+            max_retries: Extra attempts after the first one, so a request is
+                tried at most `max_retries + 1` times.
+            retry_backoff_seconds: Base delay between attempts; the delay grows
+                linearly (base * 1, base * 2, ...).
+        """
+        self.base_url = base_url.rstrip("/")
+        self.max_retries = max_retries
+        self.retry_backoff_seconds = retry_backoff_seconds
+        self._client = httpx.AsyncClient(
+            base_url=self.base_url,
+            timeout=httpx.Timeout(timeout_seconds),
+            headers={
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+                "User-Agent": "xmem-longmemeval-benchmark/1.0",
+            },
+        )
+
+    async def __aenter__(self) -> "XMemApiClient":
+        """Return the client itself; no extra setup happens on entry."""
+        return self
+
+    async def __aexit__(self, exc_type, exc, tb) -> None:
+        """Close the HTTP client when the `async with` block exits."""
+        await self.close()
+
+    async def close(self) -> None:
+        """Close the underlying `httpx.AsyncClient`."""
+        await self._client.aclose()
+
+    async def ingest(self, payload: dict[str, Any]) -> ApiCallResult:
+        """POST `payload` to `/v1/memory/ingest`."""
+        return await self._post("/v1/memory/ingest", payload)
+
+    async def batch_ingest_v1(self, items: list[dict[str, Any]]) -> ApiCallResult:
+        """POST `{"items": items}` to `/v1/memory/batch-ingest`."""
+        return await self._post("/v1/memory/batch-ingest", {"items": items})
+
+    async def batch_ingest_v2(self, items: list[dict[str, Any]]) -> ApiCallResult:
+        """POST `{"items": items}` to `/v2/memory/batch-ingest`."""
+        return await self._post("/v2/memory/batch-ingest", {"items": items})
+
+    async def retrieve(self, payload: dict[str, Any]) -> ApiCallResult:
+        """POST `payload` to `/v1/memory/retrieve`."""
+        return await self._post("/v1/memory/retrieve", payload)
+
+    async def job_status(self, status_url: str) -> ApiCallResult:
+        """GET the job status document at `status_url`."""
+        return await self._get(status_url)
+
+    async def poll_job(
+        self,
+        status_url: str,
+        *,
+        interval_seconds: float,
+        timeout_seconds: float,
+    ) -> ApiCallResult:
+        """Poll `status_url` until the job reaches a terminal status.
+
+        Args:
+            status_url: Path passed to `job_status` on every poll.
+            interval_seconds: Sleep between two polls.
+            timeout_seconds: Overall polling budget, measured on the monotonic
+                clock.
+
+        Returns:
+            The status result whose lower-cased "status" is in
+            `TERMINAL_JOB_STATUSES`. The two terminal statuses are not
+            distinguished here; the caller must check which one it got.
+
+        Raises:
+            TimeoutError: If no terminal status was seen before the deadline.
+        """
+        deadline = time.monotonic() + timeout_seconds
+        last_result: ApiCallResult | None = None
+        # The deadline is checked before each poll, so a non-positive timeout
+        # never polls at all and reports last status "unknown".
+        # NOTE(review): the sleep below is not capped at the remaining budget,
+        # so the loop can overshoot the deadline by up to one interval.
+        while time.monotonic() < deadline:
+            last_result = await self.job_status(status_url)
+            # A missing or None status becomes "" and is treated as non-terminal.
+            status = str(last_result.data.get("status") or "").lower()
+            if status in TERMINAL_JOB_STATUSES:
+                return last_result
+            await asyncio.sleep(interval_seconds)
+        status = last_result.data.get("status") if last_result else "unknown"
+        raise TimeoutError(f"Timed out polling job {status_url}; last status={status}")
+
+    async def _get(self, path: str) -> ApiCallResult:
+        """Issue a GET request for `path` through `_request`."""
+        return await self._request("GET", path)
+
+    async def _post(self, path: str, payload: dict[str, Any]) -> ApiCallResult:
+        """Issue a POST request for `path` with `payload` as the JSON body."""
+        return await self._request("POST", path, json=payload)
+
+    async def _request(self, method: str, path: str, **kwargs: Any) -> ApiCallResult:
+        """Send one request with retries and unwrap the response envelope.
+
+        An attempt is retried when it raises `httpx.HTTPError` or returns a
+        5xx or 429 status. Any other status ends the loop immediately.
+
+        Args:
+            method: HTTP method name.
+            path: Request path; a leading "/" is added when missing.
+            **kwargs: Forwarded unchanged to `httpx.AsyncClient.request`.
+
+        Returns:
+            `ApiCallResult` holding the body's "data" field (normalized to a
+            dict) and the total elapsed time in milliseconds.
+
+        Raises:
+            httpx.HTTPError: Re-raised when the final attempt fails at the
+                transport level, or raised by `raise_for_status()` for an
+                error status.
+            RuntimeError: If no attempt produced a response, or the body
+                reports `"status": "error"`.
+        """
+        # NOTE(review): an absolute URL (e.g. "https://...") would also get a
+        # "/" prepended here — confirm job status URLs are always relative paths.
+        request_path = path if path.startswith("/") else f"/{path}"
+        # Timer starts before the first attempt, so retries and backoff sleeps
+        # are part of the reported latency.
+        start = time.perf_counter()
+        response: httpx.Response | None = None
+        for attempt in range(self.max_retries + 1):
+            try:
+                response = await self._client.request(method, request_path, **kwargs)
+                # Only server errors and rate limiting are worth retrying.
+                if response.status_code < 500 and response.status_code != 429:
+                    break
+            except httpx.HTTPError:
+                # Out of attempts: surface the transport error to the caller.
+                if attempt >= self.max_retries:
+                    raise
+            if attempt < self.max_retries:
+                # Linear backoff: base delay times the 1-based attempt number.
+                await asyncio.sleep(self.retry_backoff_seconds * (attempt + 1))
+
+        # Only reachable when the loop body never ran (negative max_retries);
+        # otherwise the last attempt either set `response` or raised.
+        if response is None:
+            raise RuntimeError(f"No response from {method} {request_path}")
+        elapsed_ms = round((time.perf_counter() - start) * 1000, 2)
+        # Runs before the body is read, so for a 4xx/5xx response any error
+        # detail in the body is not surfaced.
+        response.raise_for_status()
+        # NOTE(review): assumes the body is a JSON object; a JSON array or
+        # scalar would fail on `.get` below — confirm the API always returns one.
+        body = response.json()
+        if body.get("status") == "error":
+            error = body.get("error") or f"XMem API error from {request_path}"
+            raise RuntimeError(error)
+        data = body.get("data")
+        if data is None:
+            data = {}
+        # Wrap non-dict payloads so `ApiCallResult.data` is always a dict.
+        if not isinstance(data, dict):
+            data = {"value": data}
+        return ApiCallResult(data=data, elapsed_ms=elapsed_ms)
@@ -0,0 +1,64 @@
+"""Configuration helpers for the LongMemEval benchmark."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+
+# Default API root and the default name of the environment variable that holds
+# the API key; both are the field defaults of `BenchmarkConfig`.
+DEFAULT_API_BASE_URL = "https://api.xmem.in"
+DEFAULT_API_KEY_ENV = "XMEM_API_KEY"
+# Name of the default dataset variant; it is one of the keys of
+# DEFAULT_DATASET_URLS below.
+DEFAULT_DATASET_VARIANT = "longmemeval_s_cleaned"
+# Download URL for each dataset variant, keyed by variant name. All three files
+# live in the same Hugging Face dataset repository.
+DEFAULT_DATASET_URLS = {
+    "longmemeval_s_cleaned": (
+        "https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/"
+        "resolve/main/longmemeval_s_cleaned.json"
+    ),
+    "longmemeval_m_cleaned": (
+        "https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/"
+        "resolve/main/longmemeval_m_cleaned.json"
+    ),
+    "longmemeval_oracle": (
+        "https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/"
+        "resolve/main/longmemeval_oracle.json"
+    ),
+}
+
+
+@dataclass(frozen=True)
+class BenchmarkConfig:
+    """Runtime settings for a LongMemEval benchmark run.
+
+    Instances are immutable. The API key itself is never stored: only the name
+    of the environment variable is kept, and the key is read from the
+    environment each time `api_key` is accessed.
+    """
+
+    # Dataset file to load and the directory that receives the run's outputs.
+    dataset_path: Path
+    output_dir: Path
+    # API root and the name of the environment variable holding the API key.
+    api_base_url: str = DEFAULT_API_BASE_URL
+    api_key_env: str = DEFAULT_API_KEY_ENV
+    # HTTP timeout and retry settings. The defaults match those of the API
+    # client; presumably the runner forwards them to it — verify in the runner.
+    api_timeout_seconds: float = 120.0
+    max_retries: int = 3
+    retry_backoff_seconds: float = 2.0
+    # Ingestion settings: items per batch request and which batch-ingest API
+    # version to call ("v2" by default).
+    # NOTE(review): accepted values are not validated here — confirm in the CLI.
+    batch_size: int = 25
+    ingest_api_version: str = "v2"
+    # Polling cadence and overall budget, in seconds; presumably used when
+    # waiting on ingest jobs — verify in the runner.
+    poll_interval_seconds: float = 2.0
+    poll_timeout_seconds: float = 1800.0
+    # Retrieval and ingestion tuning; exact semantics are defined by the API.
+    top_k: int = 10
+    effort_level: str = "low"
+    # Prefix of the per-question user id ("<user-prefix>-<question-id>").
+    user_prefix: str = "longmemeval"
+    # Example selection: optional cap, number of examples to skip, and an
+    # optional filter on the question type. None means "no cap" / "no filter"
+    # (presumably — confirm in the runner).
+    limit: int | None = None
+    offset: int = 0
+    question_type: str | None = None
+    # Run-mode switches: reuse already-ingested namespaces, skip examples that
+    # already have results, and validate the setup without API calls.
+    skip_ingest: bool = False
+    resume: bool = True
+    dry_run: bool = False
+
+    @property
+    def api_key(self) -> str:
+        """Return the API key from the environment, stripped of whitespace.
+
+        Returns an empty string when the variable is unset or blank.
+        """
+        return os.getenv(self.api_key_env, "").strip()
+
+    def require_api_key(self) -> str:
+        """Return the API key, failing loudly when it is missing.
+
+        Raises:
+            RuntimeError: If the configured environment variable is unset or
+                contains only whitespace.
+        """
+        api_key = self.api_key
+        if not api_key:
+            raise RuntimeError(
+                f"Missing API key. Set {self.api_key_env} before running the benchmark."
+            )
+        return api_key
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""LongMemEval benchmark harness for the Python XMem API."""