diff --git a/evaluation/README.md b/evaluation/README.md index a5a4f32ca..8c1896943 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -64,6 +64,12 @@ First prepare the dataset `longmemeval_s` from https://huggingface.co/datasets/x ./scripts/run_lme_eval.sh ``` +#### Question date and `reference_time` + +LongMemEval gives each question a **question date**; evaluation should use that as the reference “now”, not the time when you run the script. The LongMemEval search script passes `question_date` as **`reference_time`** where the backend supports it. + +**MemOS Cloud** does not currently support supplying the question date at search time in the same way, so LongMemEval scores there may differ from a spec-faithful run. **Prefer evaluating LongMemEval against the open-source MemOS server** when you need comparable numbers. + ### PrefEval Evaluation Downloading benchmark_dataset/filtered_inter_turns.json from https://github.com/amazon-science/PrefEval/blob/main/benchmark_dataset/filtered_inter_turns.json and save it as `./data/prefeval/filtered_inter_turns.json`. 
To evaluate the **Prefeval** dataset — run the following [script](./scripts/run_prefeval_eval.sh): diff --git a/evaluation/scripts/longmemeval/lme_search.py b/evaluation/scripts/longmemeval/lme_search.py index 8e0e3c5c2..1eea8cd37 100644 --- a/evaluation/scripts/longmemeval/lme_search.py +++ b/evaluation/scripts/longmemeval/lme_search.py @@ -41,9 +41,11 @@ def mem0_search(client, query, user_id, top_k): return context, duration_ms -def memos_search(client, query, user_id, top_k): +def memos_search(client, query, user_id, top_k, reference_time=None): start = time() - results = client.search(query=query, user_id=user_id, top_k=top_k) + results = client.search( + query=query, user_id=user_id, top_k=top_k, reference_time=reference_time + ) context = ( "\n".join([i["memory"] for i in results["text_mem"][0]["memories"]]) + f"\n{results.get('pref_string', '')}" @@ -122,12 +124,16 @@ def process_user(lme_df, conv_idx, frame, version, top_k=20): from utils.client import MemosApiClient client = MemosApiClient() - context, duration_ms = memos_search(client, question, user_id, top_k) + context, duration_ms = memos_search( + client, question, user_id, top_k, reference_time=question_date + ) elif frame == "memos-api-online": from utils.client import MemosApiOnlineClient client = MemosApiOnlineClient() - context, duration_ms = memos_search(client, question, user_id, top_k) + context, duration_ms = memos_search( + client, question, user_id, top_k, reference_time=question_date + ) elif frame == "memu": from utils.client import MemuClient