Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions dataset_builder/humaneval_to_el.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
This script translates problems from the OpenAI HumanEval dataset into Emacs Lisp.

- Home: https://www.gnu.org/software/emacs/
- Reference manual: https://www.gnu.org/software/emacs/manual/elisp.html
- Test library: https://www.gnu.org/software/emacs/manual/ert.html
"""

import ast
from typing import List


class Translator:

USub = "-"

stop = ["\n(defun", "\n;", "\n("]

def file_ext(self):
return "el"

def translate_prompt(
self, name: str, args: List[ast.arg], _returns, description: str
) -> str:
self.entry_point = name
el_preamble = ";;; -*- lexical-binding: t; -*-"
el_args = " ".join(arg.arg for arg in args)
el_description = description.replace('"', '\\"')
return f'{el_preamble}\n(defun {name} ({el_args})\n "{el_description}"\n '

def test_suite_prefix_lines(self, entry_point) -> List[str]:
return [
f"(defalias #'candidate #'{entry_point})",
"(ert-deftest test-human-eval ()",
]

def test_suite_suffix_lines(self) -> List[str]:
return [")"]

def deep_equality(self, left: str, right: str) -> str:
return f" (should (equal {left} {right}))"

def gen_literal(self, c: bool | str | int | float):
if type(c) is bool:
return "t" if c else "nil"
elif type(c) is str:
return f'"{c}"'
elif c is None:
return "nil"
return repr(c)

def gen_var(self, variable: str) -> str:
return variable

def gen_list(self, list: List[str]) -> str:
return "(list " + " ".join(list) + ")"

def gen_tuple(self, tuple: List[str]) -> str:
return "(list " + " ".join(tuple) + ")"

def gen_dict(self, keys: List[str], values: List[str]) -> str:
pairs = " ".join(f"(cons {k} {v})" for k, v in zip(keys, values))
return "(list " + pairs + ")"

def gen_call(self, func: str, args: List[str]) -> str:
return "(" + func + " " + " ".join(args) + ")"
1 change: 1 addition & 0 deletions dataset_builder/libexperiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def path(self) -> Path:
"elixir",
"clj",
"ada",
"el",
]
MODELS = ["davinci", "incoder", "codegen"]

Expand Down
1 change: 1 addition & 0 deletions dataset_builder/terms.csv
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ Matlab,m,array,array,array,dictionary,<missing>,true,false
Haskell,hs,list,list,tuple,association list,Nothing,True,False
Clojure,clj,vector,list,vector,map,nil,true,false
Dart,dart,list,list,record,map,null,true,false
Emacs Lisp,el,list,list,list,alist,nil,t,nil
3 changes: 3 additions & 0 deletions evaluation/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ RUN apt-get update -yqq && apt-get install -yqq dart
# Lean
# RUN wget https://github.com/leanprover/lean4/releases/download/v4.6.0-rc1/lean-4.6.0-rc1-linux.zip -O /tmp/lean.zip && unzip /tmp/lean.zip -d /root/lean/ && ln -s /root/lean/bin/lean /bin/lean

# Emacs Lisp (no-X/GUI version)
# Use apt-get (stable scripting interface) and refresh the package index in the
# same layer so the install does not depend on a stale cached index.
RUN apt-get update -yqq && apt-get install -yqq emacs-nox

# install numpy for humanevalplus
RUN python3 -m pip install numpy

Expand Down
2 changes: 2 additions & 0 deletions evaluation/src/containerized_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import eval_v
import eval_lean
import eval_dart
import eval_el
import tempfile


Expand Down Expand Up @@ -65,6 +66,7 @@
"coq": (eval_v.eval_script, ".v"),
"lean": (eval_lean.eval_script, ".lean"),
"dart": (eval_dart.eval_script, ".dart"),
"el": (eval_el.eval_script, ".el"),
}

def eval_string_script(language, program):
Expand Down
33 changes: 33 additions & 0 deletions evaluation/src/eval_el.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
Evaluates a generated Emacs Lisp program (.el).
"""
from pathlib import Path
from safe_subprocess import run

def eval_script(path: Path):
    """Run the ERT tests in the Emacs Lisp file at *path* and report the outcome.

    Emacs is started in batch mode, loads ``ert`` and then the program, and
    calls ``ert-run-tests-batch-and-exit``.

    Returns a dict with ``status`` ("OK", "Timeout" or "Exception") together
    with the process ``exit_code``, ``stdout`` and ``stderr``.
    """
    command = ["emacs", "-batch"]
    command += ["-l", "ert"]
    command += ["-l", str(path)]
    command += ["-f", "ert-run-tests-batch-and-exit"]
    result = run(command)

    # Summary line looked for in stderr when exactly one test ran and passed.
    passed_marker = "\nRan 1 tests, 1 results as expected, 0 unexpected"

    if result.timeout:
        status = "Timeout"
    elif result.exit_code == 0 and passed_marker in result.stderr:
        status = "OK"
    else:
        # Non-zero exit, or a clean exit without the expected summary line.
        status = "Exception"

    outcome = {"status": status}
    outcome["exit_code"] = result.exit_code
    outcome["stdout"] = result.stdout
    outcome["stderr"] = result.stderr
    return outcome

if __name__ == "__main__":
    # No `main` helper is defined or imported in this module, so calling it
    # would raise NameError. Evaluate the file named on the command line
    # directly and print the result as JSON instead.
    import json
    import sys

    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} PROGRAM.el")
    print(json.dumps(eval_script(Path(sys.argv[1])), indent=2))
162 changes: 162 additions & 0 deletions results/humaneval-el-Qwen2.5_Coder_7B-0.2-reworded.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
problem,pass,total
HumanEval_0_has_close_elements,3,200
HumanEval_1_separate_paren_groups,0,200
HumanEval_10_make_palindrome,0,200
HumanEval_100_make_a_pile,4,200
HumanEval_101_words_string,0,200
HumanEval_102_choose_num,0,200
HumanEval_103_rounded_avg,0,200
HumanEval_104_unique_digits,0,200
HumanEval_105_by_length,14,200
HumanEval_106_f,0,200
HumanEval_107_even_odd_palindrome,0,200
HumanEval_108_count_nums,0,200
HumanEval_109_move_one_ball,0,200
HumanEval_11_string_xor,179,200
HumanEval_110_exchange,0,200
HumanEval_111_histogram,0,200
HumanEval_112_reverse_delete,0,200
HumanEval_113_odd_count,0,200
HumanEval_114_minSubArraySum,25,200
HumanEval_115_max_fill,1,200
HumanEval_116_sort_array,0,200
HumanEval_117_select_words,5,200
HumanEval_118_get_closest_vowel,0,200
HumanEval_119_match_parens,0,200
HumanEval_12_longest,188,200
HumanEval_120_maximum,2,200
HumanEval_121_solution,0,200
HumanEval_122_add_elements,61,200
HumanEval_123_get_odd_collatz,0,200
HumanEval_124_valid_date,19,200
HumanEval_125_split_words,0,200
HumanEval_126_is_sorted,0,200
HumanEval_127_intersection,0,200
HumanEval_128_prod_signs,145,200
HumanEval_129_minPath,0,200
HumanEval_13_greatest_common_divisor,200,200
HumanEval_130_tri,0,200
HumanEval_131_digits,0,200
HumanEval_132_is_nested,0,200
HumanEval_133_sum_squares,99,200
HumanEval_134_check_if_last_char_is_a_letter,0,200
HumanEval_135_can_arrange,11,200
HumanEval_136_largest_smallest_integers,74,200
HumanEval_137_compare_one,1,200
HumanEval_138_is_equal_to_sum_even,0,200
HumanEval_139_special_factorial,0,200
HumanEval_14_all_prefixes,8,200
HumanEval_140_fix_spaces,0,200
HumanEval_141_file_name_check,0,200
HumanEval_142_sum_squares,0,200
HumanEval_143_words_in_sentence,0,200
HumanEval_144_simplify,44,200
HumanEval_145_order_by_points,0,200
HumanEval_146_specialFilter,0,200
HumanEval_147_get_max_triples,17,200
HumanEval_148_bf,0,200
HumanEval_149_sorted_list_sum,0,200
HumanEval_15_string_sequence,200,200
HumanEval_150_x_or_y,0,200
HumanEval_151_double_the_difference,0,200
HumanEval_152_compare,61,200
HumanEval_153_Strongest_Extension,0,200
HumanEval_154_cycpattern_check,0,200
HumanEval_155_even_odd_count,0,200
HumanEval_156_int_to_mini_roman,71,200
HumanEval_157_right_angle_triangle,187,200
HumanEval_158_find_max,20,200
HumanEval_159_eat,175,200
HumanEval_16_count_distinct_characters,13,200
HumanEval_160_do_algebra,0,200
HumanEval_161_solve,0,200
HumanEval_162_string_to_md5,86,200
HumanEval_163_generate_integers,0,200
HumanEval_17_parse_music,0,200
HumanEval_18_how_many_times,0,200
HumanEval_19_sort_numbers,2,200
HumanEval_2_truncate_number,190,200
HumanEval_20_find_closest_elements,5,200
HumanEval_21_rescale_to_unit,200,200
HumanEval_22_filter_integers,179,200
HumanEval_23_strlen,200,200
HumanEval_24_largest_divisor,35,200
HumanEval_25_factorize,38,200
HumanEval_26_remove_duplicates,3,200
HumanEval_27_flip_case,0,200
HumanEval_28_concatenate,200,200
HumanEval_29_filter_by_prefix,200,200
HumanEval_3_below_zero,0,200
HumanEval_30_get_positive,137,200
HumanEval_31_is_prime,103,200
HumanEval_33_sort_third,0,200
HumanEval_34_unique,157,200
HumanEval_35_max_element,182,200
HumanEval_36_fizz_buzz,105,200
HumanEval_37_sort_even,0,200
HumanEval_39_prime_fib,0,200
HumanEval_4_mean_absolute_deviation,85,200
HumanEval_40_triples_sum_to_zero,0,200
HumanEval_41_car_race_collision,0,200
HumanEval_42_incr_list,200,200
HumanEval_43_pairs_sum_to_zero,21,200
HumanEval_44_change_base,199,200
HumanEval_45_triangle_area,142,200
HumanEval_46_fib4,12,200
HumanEval_47_median,0,200
HumanEval_48_is_palindrome,106,200
HumanEval_49_modp,12,200
HumanEval_5_intersperse,143,200
HumanEval_51_remove_vowels,65,200
HumanEval_52_below_threshold,0,200
HumanEval_53_add,200,200
HumanEval_54_same_chars,0,200
HumanEval_55_fib,180,200
HumanEval_56_correct_bracketing,0,200
HumanEval_57_monotonic,6,200
HumanEval_58_common,73,200
HumanEval_59_largest_prime_factor,19,200
HumanEval_6_parse_nested_parens,14,200
HumanEval_60_sum_to_n,200,200
HumanEval_61_correct_bracketing,0,200
HumanEval_62_derivative,98,200
HumanEval_63_fibfib,136,200
HumanEval_64_vowels_count,0,200
HumanEval_65_circular_shift,48,200
HumanEval_66_digitSum,0,200
HumanEval_67_fruit_distribution,6,200
HumanEval_68_pluck,0,200
HumanEval_69_search,187,200
HumanEval_7_filter_by_substring,135,200
HumanEval_70_strange_sort_list,15,200
HumanEval_71_triangle_area,0,200
HumanEval_72_will_it_fly,160,200
HumanEval_73_smallest_change,190,200
HumanEval_74_total_match,20,200
HumanEval_75_is_multiply_prime,9,200
HumanEval_76_is_simple_power,111,200
HumanEval_77_iscube,1,200
HumanEval_78_hex_key,22,200
HumanEval_79_decimal_to_binary,0,200
HumanEval_8_sum_product,200,200
HumanEval_80_is_happy,4,200
HumanEval_81_numerical_letter_grade,102,200
HumanEval_82_prime_length,0,200
HumanEval_83_starts_one_ends,0,200
HumanEval_84_solve,0,200
HumanEval_85_add,0,200
HumanEval_86_anti_shuffle,47,200
HumanEval_87_get_row,84,200
HumanEval_88_sort_array,0,200
HumanEval_89_encrypt,0,200
HumanEval_9_rolling_max,71,200
HumanEval_90_next_smallest,2,200
HumanEval_91_is_bored,0,200
HumanEval_92_any_int,195,200
HumanEval_93_encode,0,200
HumanEval_94_skjkasdkd,0,200
HumanEval_95_check_dict_case,0,200
HumanEval_96_count_up_to,0,200
HumanEval_97_multiply,200,200
HumanEval_98_count_upper,3,200
HumanEval_99_closest_integer,1,200
Loading