
Commit

update README
JinjieNi committed Nov 9, 2024
1 parent 5bfd1bc commit 6dffb0e
Showing 5 changed files with 18 additions and 66 deletions.
6 changes: 5 additions & 1 deletion README.md
@@ -22,6 +22,8 @@
<br>

# ⚡ News
[2024-11-09] Our evaluation suite now supports local model parsers/judges! Check [here](docs/evaluate_instructions.md#use-local-model-parser) for details!

[2024-10-20] MixEval-X is released! Check out the [project page](https://mixeval-x.github.io/), [paper](https://arxiv.org/abs/2410.13754), and [github repo](https://github.com/Psycoy/MixEval-X/) to learn more about this real-world any-to-any benchmark!🌟

[2024-09-27] MixEval is accepted at NeurIPS 2024.
@@ -77,7 +79,7 @@ bash setup.sh
```
MODEL_PARSER_API=<your openai api key>
```
> The values in [Leaderboard](https://mixeval.github.io/#leaderboard) use `GPT-3.5-Turbo-0125` as the default model parser. Open-source model parsers will also be supported.
> The values in [Leaderboard](https://mixeval.github.io/#leaderboard) use `GPT-3.5-Turbo-0125` as the default model parser. Open-source model parsers are also supported; check [here](docs/evaluate_instructions.md#use-local-model-parser) for details.
> If you are using Azure or other APIs for the model parser, check [here](docs/evaluate_instructions.md#use-other-apis-for-model-parser).
@@ -175,6 +177,8 @@ If you want to separately compute the scores, you should
> If you are using Azure or other APIs for the model parser, check [here](docs/evaluate_instructions.md#use-other-apis-for-model-parser).
> Open-source model parsers are also supported; check [here](docs/evaluate_instructions.md#use-local-model-parser) for details.
> If you are parsing base models' responses, set the `--extract_base_model_response` flag to retain only the meaningful part of the model's response, which yields more stable parsing results.
> If you finished the model parsing some time ago and now want to display the model results again, add the `--compute_score_from_judged_file` flag to avoid calling the model parser API again and save your budget. Make sure the parsed files named `judge_results_ff_model_judge_gpt-3.5-turbo-0125` and `judge_results_mp_model_judge_gpt-3.5-turbo-0125` exist under the target model response folder, where `gpt-3.5-turbo-0125` is the model parser name, `ff` denotes free-form, and `mp` denotes multiple-choice.
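
As a rough illustration, the two flags above could be combined like this when re-scoring a base model from existing judge files. This is a hedged sketch: only the two flags are documented here; the invocation path and the placeholder arguments are assumptions about your setup.

```
# Hedged sketch: only --extract_base_model_response and --compute_score_from_judged_file
# are documented above; the script path and the remaining arguments are placeholders for
# however you normally run compute_metrics.py.
python compute_metrics.py \
    --extract_base_model_response \
    --compute_score_from_judged_file \
    <other arguments for your model and benchmark setup>
```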
8 changes: 7 additions & 1 deletion docs/evaluate_instructions.md
@@ -58,4 +58,10 @@ OPENAI_API_VERSION=2023-07-01-preview
❗ If you are using Azure, there shouldn't be a `MODEL_PARSER_API` entry in `.env`; otherwise it will still use the OpenAI API.
### Other APIs
Specify `--api_base_url` if you wish to use another API, such as a llama.cpp server.
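
For example, a minimal sketch assuming a llama.cpp server exposing an OpenAI-compatible endpoint on localhost; the URL and the remaining arguments are placeholders, not values required by this repo.

```
# Minimal sketch: assumes a llama.cpp server serving an OpenAI-compatible API at
# http://localhost:8080/v1; everything except --api_base_url is a placeholder.
python evaluate.py \
    --api_base_url http://localhost:8080/v1 \
    <other arguments for your evaluation run>
```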
## Use Local Model Parser
You can now use any language model on Hugging Face as the model parser. Set `--judge_model_id` when running `evaluate.py` or `compute_metrics.py` to specify the Hugging Face model id or the path to a local Hugging Face checkpoint.
> You can configure the `OSJudgeCloseendFreeform` and `OSJudgeCloseendMultichoice` in `judge_freeform_parser.py` and `judge_multichoice_parser.py` to customize the judge model settings.
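
A rough sketch of setting `--judge_model_id`: the model id below is only an example of a valid Hugging Face id, and the remaining arguments are placeholders for your usual invocation.

```
# Hedged sketch: --judge_model_id takes a Hugging Face model id or a local checkpoint
# path; the example id and the other arguments are illustrative placeholders.
python evaluate.py \
    --judge_model_id meta-llama/Meta-Llama-3-8B-Instruct \
    <other arguments for your evaluation run>
```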
60 changes: 0 additions & 60 deletions mix_eval/data/mixeval-2024-08-11/mixeval/multiple-choice.json
@@ -3015,18 +3015,6 @@
],
"benchmark_name": "CommonsenseQA"
},
"197": {
"problem_type": "multiple-choice",
"context": null,
"prompt": "Write a function to re-arrange the elements of the given array so that all negative elements appear before positive ones.",
"options": [
"def re_arrange_array(arr, n):\r\n j=0\r\n for i in range(0, n):\r\n if (arr[i] < 0):\r\n temp = arr[i]\r\n arr[i] = arr[j]\r\n arr[j] = temp\r\n j = j + 1\r\n return arr"
],
"target": [
0
],
"benchmark_name": "MBPP"
},
"198": {
"problem_type": "multiple-choice",
"context": null,
@@ -4374,18 +4362,6 @@
],
"benchmark_name": "HellaSwag"
},
"287": {
"problem_type": "multiple-choice",
"context": null,
"prompt": "\n\ndef encode_shift(s: str):\n \"\"\"\n returns encoded string by shifting every character by 5 in the alphabet.\n \"\"\"\n return \"\".join([chr(((ord(ch) + 5 - ord(\"a\")) % 26) + ord(\"a\")) for ch in s])\n\n\ndef decode_shift(s: str):\n \"\"\"\n takes as input string encoded with encode_shift function. Returns decoded string.\n \"\"\"\n",
"options": [
" return \"\".join([chr(((ord(ch) - 5 - ord(\"a\")) % 26) + ord(\"a\")) for ch in s])\n"
],
"target": [
0
],
"benchmark_name": "HumanEval"
},
"288": {
"problem_type": "multiple-choice",
"context": null,
@@ -6525,18 +6501,6 @@
],
"benchmark_name": "MMLU"
},
"427": {
"problem_type": "multiple-choice",
"context": null,
"prompt": "Write a function to compute the sum of digits of each number of a given list.",
"options": [
"def sum_of_digits(nums):\r\n return sum(int(el) for n in nums for el in str(n) if el.isdigit())"
],
"target": [
0
],
"benchmark_name": "MBPP"
},
"428": {
"problem_type": "multiple-choice",
"context": "[header] How to hide your period supplies [title] Choose an opaque, boring-looking box. [step] Something made from plastic, metal, or cardboard will keep people from seeing what's inside. Its ordinary appearance will blend in with the background and lower the chances of people looking through it.",
@@ -18682,18 +18646,6 @@
],
"benchmark_name": "PIQA"
},
"1221": {
"problem_type": "multiple-choice",
"context": null,
"prompt": "Write a function that matches a string that has an a followed by two to three 'b'.",
"options": [
"import re\r\ndef text_match_two_three(text):\r\n patterns = 'ab{2,3}'\r\n if re.search(patterns, text):\r\n return 'Found a match!'\r\n else:\r\n return('Not matched!')"
],
"target": [
0
],
"benchmark_name": "MBPP"
},
"1222": {
"problem_type": "multiple-choice",
"context": "Addison ate their bread and drank a nice glass of water with the bread.",
@@ -23881,18 +23833,6 @@
],
"benchmark_name": "HellaSwag"
},
"1564": {
"problem_type": "multiple-choice",
"context": null,
"prompt": "\ndef is_bored(S):\n \"\"\"\n You'll be given a string of words, and your task is to count the number\n of boredoms. A boredom is a sentence that starts with the word \"I\".\n Sentences are delimited by '.', '?' or '!'.\n \n For example:\n >>> is_bored(\"Hello world\")\n 0\n >>> is_bored(\"The sky is blue. The sun is shining. I love this weather\")\n 1\n \"\"\"\n",
"options": [
" import re\n sentences = re.split(r'[.?!]\\s*', S)\n return sum(sentence[0:2] == 'I ' for sentence in sentences)\n"
],
"target": [
0
],
"benchmark_name": "HumanEval"
},
"1565": {
"problem_type": "multiple-choice",
"context": null,
5 changes: 3 additions & 2 deletions mix_eval/utils/judge_freeform_parser.py
@@ -275,6 +275,7 @@ def annotate_parallel(self, tasks):
                task['judge_response'] = completion
                results.append(task)

        if None in results:
            raise ValueError("Some entries are not annotated due to errors in annotate_p, please inspect and retry.")
        # for result in results:
        #     if result['judge_response'] is None:
        #         raise ValueError("Some entries are not annotated due to errors in annotate_p, please inspect and retry.")
        return results
5 changes: 3 additions & 2 deletions mix_eval/utils/judge_multichoice_parser.py
@@ -266,6 +266,7 @@ def annotate_parallel(self, tasks):
                task['judge_response'] = completion
                results.append(task)

        if None in results:
            raise ValueError("Some entries are not annotated due to errors in annotate_p, please inspect and retry.")
        # for result in results:
        #     if result['judge_response'] is None:
        #         raise ValueError("Some entries are not annotated due to errors in annotate_p, please inspect and retry.")
        return results
