nuprl · rudolf-adamkovic · Mar 24, 2026
diff --git a/dataset_builder/humaneval_to_elm.py b/dataset_builder/humaneval_to_elm.py
@@ -0,0 +1,235 @@
+# This script translates problems from the OpenAI HumanEval dataset into Elm.
+import re
+import ast
+from typing import List
+
+
+class Translator:
+    """Translate HumanEval prompts and test values from Python into Elm.
+
+    translate_prompt renders a Python signature as an Elm module header and
+    type signature, the gen_* methods render test values as Elm expressions,
+    and the test_suite_* methods wrap the assertions in a Platform.worker.
+    """
+
+    # NOTE(review): presumably stop sequences for model completions -- confirm.
+    stop = ["\n\n", "\n--", "\ntype", "\nmodule"]
+
+    def __init__(self):
+        # [argument annotations, return annotation] of the current problem;
+        # set by translate_prompt, read by gen_call and finalize.
+        self.type = None
+
+    def translate_identifier(self, name: str) -> str:
+        """Convert a snake_case Python name to lower camelCase."""
+        head, *tail = name.lower().split("_")
+        return head + "".join(word.capitalize() for word in tail)
+
+    def file_ext(self):
+        """Return the file extension of generated programs."""
+        return "elm"
+
+    def _parenthesize(self, ty: str) -> str:
+        """Parenthesize a type containing a space so it is a single argument."""
+        return f"({ty})" if " " in ty else ty
+
+    def translate_type(self, t):
+        """Translate a Python annotation into Elm type syntax.
+
+        Args:
+            t: an ast annotation node, a type-name string, or None.
+
+        Raises:
+            Exception: if the annotation is missing or has no translation.
+        """
+        match t:
+            case ast.Subscript(ast.Name(generic), param, _ctx):
+                match generic:
+                    case "List":
+                        inner = self.translate_type(param)
+                        return f"List {self._parenthesize(inner)}"
+                    case "Tuple":
+                        match param:
+                            case ast.Tuple(elts, _ctx):
+                                tys = [self.translate_type(e) for e in elts]
+                                return "(" + ", ".join(tys) + ")"
+                            case _other:
+                                raise Exception(f"Bad tuple: {param}")
+                    case "Dict":
+                        match param:
+                            case ast.Tuple([k, v], _ctx):
+                                # Dict takes two type arguments, so compound
+                                # key/value types must be parenthesized.
+                                kt = self._parenthesize(self.translate_type(k))
+                                vt = self._parenthesize(self.translate_type(v))
+                                return f"Dict.Dict {kt} {vt}"
+                            case _other:
+                                raise Exception(f"Bad dict: {param}")
+                    case "Optional":
+                        inner = self.translate_type(param)
+                        return f"Maybe {self._parenthesize(inner)}"
+                    case "Union":
+                        raise Exception("Union is not supported")
+                    case other:
+                        raise Exception(f"Bad generic {other}")
+            case ast.Name("int") | "int":
+                return "Int"
+            case ast.Name("float") | "float":
+                return "Float"
+            case ast.Name("bool") | "bool":
+                return "Bool"
+            case ast.Name("str") | "str":
+                return "String"
+            case None:
+                raise Exception("implicitly untyped argument")
+            case ast.Name("Any"):
+                raise Exception("Any is not supported")
+            case ast.Name(x):
+                raise Exception(f"unknown name {x}")
+            case ast.Constant(value) if value is Ellipsis:
+                # A bare `Ellipsis` in a pattern is a capture name that would
+                # match every constant, hence the explicit guard.
+                raise Exception("no ellipsis")
+            case _other:
+                raise Exception(f"unknown annotation: {t}")
+
+    def translate_prompt(self, name: str, args: List[ast.arg], returns, description: str):
+        """Build the Elm prompt: module header, imports, comment, signature.
+
+        Records the Python annotations in self.type for later coercions.
+        Returns None (after printing the reason) when a type is unsupported.
+        """
+        self.type = [[arg.annotation for arg in args], returns]
+        elm_name = self.translate_identifier(name)
+        # Turn the description into `--` line comments, dropping indentation.
+        comment = "-- " + re.sub(r"\n(\s*)", "\n-- ", description.strip()) + "\n"
+        try:
+            arg_types = [self.translate_type(arg.annotation) for arg in args]
+            ret_type = self.translate_type(returns)
+        except Exception as e:
+            # Deliberate best effort: the caller gets None, not an exception.
+            print(e)
+            return None
+        type_parts = arg_types + [ret_type]
+        type_sig = elm_name + " : " + " -> ".join(type_parts)
+        arg_names = [arg.arg for arg in args]
+        func_decl = elm_name + " " + " ".join(arg_names) + " ="
+        imports = "import Platform\n"
+        # Dict is imported only when the signature mentions it.
+        if any("Dict.Dict" in part for part in type_parts):
+            imports += "import Dict\n"
+        return f"module Main exposing (..)\n\n{imports}\n{comment}{type_sig}\n{func_decl}\n"
+
+    def test_suite_prefix_lines(self, entry_point) -> List[str]:
+        """Return the lines that open the test program.
+
+        They define an `assert` helper and start a Platform.worker whose init
+        binds `candidate` to the translated entry point inside a `let`.
+        """
+        return [
+            "",
+            "assert : Bool -> ()",
+            "assert b = if b then () else Debug.todo \"assertion failed\"",
+            "",
+            "main : Program () () ()",
+            "main =",
+            "    Platform.worker",
+            "        { init = \\_ ->",
+            "            let",
+            f"                candidate = {self.translate_identifier(entry_point)}",
+        ]
+
+    def test_suite_suffix_lines(self) -> List[str]:
+        """Return the lines that close the `let` and the worker record."""
+        return [
+            "            in",
+            "            ((), Cmd.none)",
+            "        , update = \\_ _ -> ((), Cmd.none)",
+            "        , subscriptions = \\_ -> Sub.none",
+            "        }",
+        ]
+
+    def deep_equality(self, left: str, right: str) -> str:
+        """Return a `let` binding asserting that left equals right."""
+        return f"                _ = assert ({left} == {right})"
+
+    def gen_literal(self, c: bool | str | int | float | None):
+        """Render a Python constant as an Elm literal.
+
+        Negative ints and floats are parenthesized so that they stay a single
+        term inside a function application; None becomes Nothing.
+        """
+        # bool is tested before the numeric case: it is a subclass of int.
+        if isinstance(c, bool):
+            return str(c)
+        if isinstance(c, str):
+            escaped = (
+                c.replace("\\", "\\\\")
+                .replace('"', '\\"')
+                .replace("\n", "\\n")
+                .replace("\r", "\\r")
+                .replace("\t", "\\t")
+            )
+            return f'"{escaped}"'
+        if c is None:
+            return "Nothing"
+        text = repr(c)
+        if isinstance(c, (int, float)) and text.startswith("-"):
+            return f"({text})"
+        return text
+
+    def gen_var(self, v: str):
+        """Render a Python variable name as an Elm identifier."""
+        return self.translate_identifier(v)
+
+    def gen_list(self, l: List[str]):
+        """Render already-translated elements as an Elm list."""
+        return "[" + ", ".join(l) + "]"
+
+    def gen_tuple(self, t: List[str]):
+        """Render already-translated elements as an Elm tuple."""
+        return "(" + ", ".join(t) + ")"
+
+    def gen_dict(self, keys: List[str], values: List[str]):
+        """Render already-translated keys and values as a Dict.fromList call."""
+        pairs = ", ".join(f"({k}, {v})" for k, v in zip(keys, values))
+        return f"Dict.fromList [{pairs}]"
+
+    def gen_call(self, func: str, args: List[str]):
+        """Render a parenthesized function application.
+
+        Arguments of `candidate` are coerced to the recorded parameter types.
+        """
+        if func == "candidate":
+            args = [self._coerce(arg, self.type[0][i]) for i, arg in enumerate(args)]
+        return "(" + func + " " + " ".join(args) + ")"
+
+    def _coerce(self, expr: str, ann) -> str:
+        """Adapt an Elm expression to the Python annotation `ann`.
+
+        Optional values are wrapped in Just (Nothing is kept), and expressions
+        without a "." are passed through toFloat when a float is expected.
+        """
+        match expr, ann:
+            case expr, ast.Subscript(ast.Name("Optional"), _):
+                if expr == "Nothing":
+                    return expr
+                return f"(Just {expr})"
+            case expr, ast.Name("float") | "float" if "." not in expr and expr not in ("Nothing",):
+                return f"(toFloat {expr})"
+            case _:
+                return expr
+
+    def finalize(self, result, context) -> str:
+        """Post-process a translated expression for one side of an assertion.
+
+        The "rhs" side is coerced to the recorded return type; "lhs" is
+        returned unchanged. Any other context raises an Exception.
+        """
+        match context:
+            case "lhs":
+                return result
+            case "rhs":
+                return self._coerce(result, self.type[1])
+            case _other:
+                raise Exception("bad context to finalize")
diff --git a/dataset_builder/terms.csv b/dataset_builder/terms.csv
@@ -26,3 +26,4 @@ Matlab,m,array,array,array,dictionary,<missing>,true,false
 Haskell,hs,list,list,tuple,association list,Nothing,True,False
 Clojure,clj,vector,list,vector,map,nil,true,false
 Dart,dart,list,list,record,map,null,true,false
+Elm,elm,list,list,tuple,dictionary,Nothing,True,False
diff --git a/evaluation/Dockerfile b/evaluation/Dockerfile
@@ -106,6 +106,13 @@ RUN apt-get update -yqq && apt-get install -yqq dart
 # Lean
 # RUN wget https://github.com/leanprover/lean4/releases/download/v4.6.0-rc1/lean-4.6.0-rc1-linux.zip -O /tmp/lean.zip && unzip /tmp/lean.zip -d /root/lean/ && ln -s /root/lean/bin/lean /bin/lean
 
+# Elm: install the compiler and build a template project in /etc/elm (eval_elm.py copies it per run)
+RUN npm install -g elm
+WORKDIR /etc/elm
+RUN echo y | elm init > /dev/null
+RUN printf 'module Main exposing (..)\nimport Html\nmain = Html.text ""\n' > src/Main.elm
+RUN elm make src/Main.elm --output=/dev/null
+
 # install numpy for humanevalplus
 RUN python3 -m pip install numpy
 

diff --git a/evaluation/src/containerized_eval.py b/evaluation/src/containerized_eval.py
@@ -29,6 +29,7 @@
 import eval_v
 import eval_lean
 import eval_dart
+import eval_elm
 import tempfile
 
 
@@ -65,6 +66,7 @@
     "coq": (eval_v.eval_script, ".v"),
     "lean": (eval_lean.eval_script, ".lean"),
     "dart": (eval_dart.eval_script, ".dart"),
+    "elm": (eval_elm.eval_script, ".elm"),
 }
 
 def eval_string_script(language, program):

diff --git a/evaluation/src/eval_elm.py b/evaluation/src/eval_elm.py
@@ -0,0 +1,53 @@
+import shutil
+import tempfile
+from pathlib import Path
+from safe_subprocess import run
+
+def eval_script(path: Path):
+    """Compile and run one generated Elm program, returning its result dict.
+
+    The program is copied over src/Main.elm in a scratch copy of the
+    /etc/elm template project, compiled to JavaScript with `elm make`, and
+    executed under node.
+
+    Returns:
+        A dict with "status" ("OK", "Exception", "SyntaxError" or "Timeout"),
+        "exit_code", "stdout" and "stderr" of the last command that ran.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        project_dir = Path(tmpdir) / "elm"
+        shutil.copytree("/etc/elm", str(project_dir))
+        shutil.copy(str(path), str(project_dir / "src" / "Main.elm"))
+
+        output_js = project_dir / "main.js"
+        r = run(
+            ["elm", "make", "src/Main.elm", f"--output={output_js}"],
+            cwd=str(project_dir),
+            timeout_seconds=30,
+        )
+
+        if r.timeout:
+            # A compiler that ran out of time says nothing about the program's
+            # syntax, so it is not reported as a SyntaxError.
+            status = "Timeout"
+        elif r.exit_code != 0:
+            status = "SyntaxError"
+        else:
+            r = run(
+                ["node", "-e", "require('./main.js').Elm.Main.init()"],
+                cwd=str(project_dir),
+                timeout_seconds=15,
+            )
+            if r.timeout:
+                status = "Timeout"
+            elif r.exit_code == 0:
+                status = "OK"
+            else:
+                status = "Exception"
+
+        return {
+            "status": status,
+            "exit_code": r.exit_code,
+            "stdout": r.stdout,
+            "stderr": r.stderr,
+        }