diff --git a/SkyRL b/SkyRL new file mode 160000 index 0000000..fa866e7 --- /dev/null +++ b/SkyRL @@ -0,0 +1 @@ +Subproject commit fa866e767302846da1df779e80a19acbc3a4563f diff --git a/openenv.yaml b/openenv.yaml new file mode 100644 index 0000000..70a72d5 --- /dev/null +++ b/openenv.yaml @@ -0,0 +1,73 @@ +spec_version: 1 +name: frontier-swe-openenv +type: package +runtime: fastapi +version: "0.1.0" + +description: > + Frontier SWE — OpenEnv. A family of long-horizon software-engineering + environments deployed as Hugging Face Spaces, sharing one OpenEnv-shaped + FastAPI server (Gym-style /reset, /step, /state plus FastMCP tools) and a + multi-layer rubric (gate checks, hidden-test verifier, LLM diff review, + LLM plan review) that produces a normalised [0, 1] reward. Each task + ships as its own GHCR image and is mirrored to a dedicated HF Space. + +repo: https://github.com/3xcaffeine/frontier-swe-openenv + +# Shared package primitives. Each task Space inherits these and overrides +# the per-task subset in its own openenv.yaml. +package: + app: frontier_swe_env.server.app:app + port: 8000 + client: frontier_swe_env.client:FrontierSweEnv + base_classes: + server: openenv.core.env_server.create_app + client: openenv.core.EnvClient + api: + gym_style: + - POST /reset + - POST /step + - GET /state + - GET /health + mcp: + - POST /mcp # OpenEnv POST-only JSON-RPC + - ANY /tools/mcp # FastMCP Streamable HTTP (POST + GET/SSE) + tools: + - submit_plan + - submit_subtask + - get_status + - advance + reserved_tool_names_avoided: + - reset + - step + - state + - close + +# Tasks bundled in this repo. Each entry points at its per-Space manifest +# and the live HF Space URL; judges should pull the per-task openenv.yaml +# from the Space they are evaluating. 
+tasks: + - name: notebook-compression + manifest: spaces/notebook/openenv.yaml + hf_space: https://huggingface.co/spaces/rycerzes/frontier-swe-notebook + image: ghcr.io/3xcaffeine/frontier-swe-openenv/frontier-swe-notebook:latest + domain: systems / compression + - name: postgres-sqlite-wire-adapter + manifest: spaces/postgres/openenv.yaml + hf_space: https://huggingface.co/spaces/rycerzes/frontier-swe-postgres + image: ghcr.io/3xcaffeine/frontier-swe-openenv/frontier-swe-postgres:latest + domain: systems / databases / Zig + - name: dependent-type-checker + manifest: spaces/type-checker/openenv.yaml + hf_space: https://huggingface.co/spaces/rycerzes/frontier-swe-type-checker + image: ghcr.io/3xcaffeine/frontier-swe-openenv/frontier-swe-dependent-type-checker:latest + domain: programming languages / type theory + - name: libexpat-to-x86asm + manifest: spaces/libexpat-to-x86asm/openenv.yaml + hf_space: https://huggingface.co/spaces/rycerzes/frontier-swe-libexpat-to-x86asm + image: ghcr.io/3xcaffeine/frontier-swe-openenv/frontier-swe-libexpat-to-x86asm:latest + domain: systems programming / x86-64 assembly / XML + +# Default landing task for `openenv pull frontier-swe-openenv` style tooling +# until per-task selection is wired through. +default_task: dependent-type-checker diff --git a/scripts/prepare_hf_space.py b/scripts/prepare_hf_space.py index 7ffdda4..ab6241b 100644 --- a/scripts/prepare_hf_space.py +++ b/scripts/prepare_hf_space.py @@ -1,8 +1,10 @@ """Assemble the push payload for an HF Space. Given a task name, produce a directory that can be force-pushed to the Space: -- Dockerfile and README.md are lifted from ``spaces/<task>/`` to the payload root - (HF requires both at the root for Docker Spaces). +- Dockerfile, README.md, and openenv.yaml are lifted from ``spaces/<task>/`` + to the payload root (HF requires Dockerfile + README at the root for Docker + Spaces; openenv.yaml goes there so judges pulling the Space see a valid + manifest at the URL root). 
- The sibling ``spaces/<task>/`` subtree is dropped to reduce Space size. - ``.gitattributes`` is preserved so HF correctly interprets the LFS-tracked bundle. @@ -34,12 +36,19 @@ def prepare(task: str, out: Path, repo_root: Path) -> None: if not space_src.is_dir(): raise SystemExit(f"expected {space_src} to exist") + # Required: HF Docker Spaces need Dockerfile + README.md at the root. for name in ("Dockerfile", "README.md"): src = space_src / name if not src.is_file(): raise SystemExit(f"missing {src}") shutil.copy2(src, out / name) + # Optional: lift openenv.yaml to the root so judges pulling the Space URL + # see a valid OpenEnv manifest at the top level. Missing is non-fatal. + manifest = space_src / "openenv.yaml" + if manifest.is_file(): + shutil.copy2(manifest, out / "openenv.yaml") + shutil.rmtree(out / "spaces") print(out) diff --git a/spaces/libexpat-to-x86asm/openenv.yaml b/spaces/libexpat-to-x86asm/openenv.yaml new file mode 100644 index 0000000..011be90 --- /dev/null +++ b/spaces/libexpat-to-x86asm/openenv.yaml @@ -0,0 +1,107 @@ +spec_version: 1 +name: frontier-swe-libexpat-to-x86asm +type: space +runtime: fastapi +app: frontier_swe_env.server.app:app +port: 8000 +version: "0.1.0" + +description: > + Frontier SWE — libexpat to x86-64 Assembly. An OpenEnv-shaped FastAPI + service hosting a long-horizon task: reimplement libexpat 2.6.4 in x86-64 + assembly and produce /app/asm-port/libexpat.so exporting the expat C ABI. + The verifier builds reference C libexpat, runs the upstream test suite and + benchmarks, and writes /logs/verifier/reward.json (0.5 correctness + 0.5 + performance, hard-fail to 0.0 on anti-cheat or missing .so). Scored with + gate checks, structured L1, LLM code/plan review, and weighted episode reward. 
+ +repo: + source: https://github.com/3xcaffeine/frontier-swe-openenv + task_directory: tasks/libexpat-to-x86asm + +environment: + task_name: libexpat-to-x86asm + workspace_dir: /app/asm-port + build_command: "true" + episode_timeout_s: 3600 + max_attempts_per_subtask: 3 + l1_score_mode: reward_json_score + l1_timeout_s: 1500 + reward_json_path: /logs/verifier/reward.json + reward_json_score_field: score + reward_json_score_anchors: [0.0, 1.0] + reward_json_score_higher_is_better: true + task_domain: systems programming / x86-64 assembly / XML parsing + cpus: 4 + memory_mb: 8192 + +rubric: + type: composite + layers: + - name: gate_checks + kind: shell + script: /app/gate_checks.sh + output: GATE_SCORE=N/M + gates: + - /app/asm-port writable, expat.h present + - nasm, as, and ld on PATH + - encrypted gcc + libexpat bundles staged + - name: l1_tests + kind: structured_reward + score_mode: reward_json_score + reward_json_path: /logs/verifier/reward.json + score_field: score + anchors: [0.0, 1.0] + higher_is_better: true + hard_fail_signals: + - score == 0.0 (no .so, anti-cheat, verifier infra failure, zero correctness) + - name: l2_code_review + kind: llm_judge + model_env: FSWE_GRADER_MODEL + api_url_env: FSWE_GRADER_API_URL + api_key_env: FSWE_GRADER_API_KEY + dimensions: + [completeness, correctness, robustness, forward_compatibility] + - name: l3_plan_review + kind: llm_judge + model_env: FSWE_GRADER_MODEL + api_url_env: FSWE_GRADER_API_URL + api_key_env: FSWE_GRADER_API_KEY + - name: episode_aggregator + kind: weighted_blend + output_field: observation.episode_reward + +tools: + - name: submit_plan + description: Propose a subtask plan for the episode (PLANNING -> EXECUTING). + parameters: + - name: subtasks + type: list[dict] + required: true + - name: submit_subtask + description: Submit the current subtask for L1 + L2 scoring. 
+ parameters: + - name: subtask_id + type: str + required: true + - name: get_status + description: Return the current episode status snapshot (phase, scores, time remaining). + - name: advance + description: Freeze the current subtask score and advance to the next subtask. + +metrics: + observation: + - observation.phase + - observation.current_subtask + - observation.frozen_scores + - observation.time_remaining_s + - observation.plan_score + - observation.subtask_feedback + - observation.episode_reward + reward: + - reward.gate_score + - reward.l1_test_score + - reward.l1_blended + - reward.l2_code_review + - reward.l3_plan_review + - reward.episode_reward diff --git a/spaces/notebook/openenv.yaml b/spaces/notebook/openenv.yaml new file mode 100644 index 0000000..ea257ac --- /dev/null +++ b/spaces/notebook/openenv.yaml @@ -0,0 +1,99 @@ +spec_version: 1 +name: frontier-swe-notebook +type: space +runtime: fastapi +app: frontier_swe_env.server.app:app +port: 8000 +version: "0.1.0" + +description: > + Frontier SWE — Notebook Compression. An OpenEnv-shaped FastAPI service that + hosts a long-horizon software engineering task: build a fit/compress/decompress + pipeline for Jupyter notebooks. Agents plan subtasks, edit code in a Linux + workspace, then submit for multi-layer rubric scoring (gate checks, hidden + test verifier, LLM diff review, LLM plan review). The verifier writes a + structured reward.json the rubric layer normalises into a [0, 1] reward. + +repo: + source: https://github.com/3xcaffeine/frontier-swe-openenv + task_directory: tasks/notebook-compression + +# Task-level defaults (overridable via FSWE_TASK_MODE=demo|training and env vars). +environment: + task_name: notebook-compression + workspace_dir: /app + build_command: "" + episode_timeout_s: 3600 + max_attempts_per_subtask: 2 + l1_score_mode: reward_json + reward_json_path: /logs/verifier/reward.json + task_domain: systems / compression + cpus: 8 + memory_mb: 32768 + +# Composite multi-layer rubric. 
Final episode reward is a weighted blend of +# all layers, normalised to [0, 1]. +rubric: + type: composite + layers: + - name: gate_checks + kind: shell + script: /app/gate_checks.sh + output: GATE_SCORE=N/M (parsed by frontier_swe_env.rubrics.gate_checks) + - name: l1_tests + kind: structured_reward + score_mode: reward_json + reward_json_path: /logs/verifier/reward.json + - name: l2_code_review + kind: llm_judge + model_env: FSWE_GRADER_MODEL + api_url_env: FSWE_GRADER_API_URL + api_key_env: FSWE_GRADER_API_KEY + dimensions: + [completeness, correctness, robustness, forward_compatibility] + - name: l3_plan_review + kind: llm_judge + model_env: FSWE_GRADER_MODEL + api_url_env: FSWE_GRADER_API_URL + api_key_env: FSWE_GRADER_API_KEY + - name: episode_aggregator + kind: weighted_blend + output_field: observation.episode_reward + +# MCP tools exposed by this environment. None of these collide with OpenEnv's +# reserved tool names (reset, step, state, close). +tools: + - name: submit_plan + description: Propose a subtask plan for the episode (PLANNING -> EXECUTING). + parameters: + - name: subtasks + type: list[dict] + required: true + - name: submit_subtask + description: Submit the current subtask for L1 + L2 scoring. + parameters: + - name: subtask_id + type: str + required: true + - name: get_status + description: Return the current episode status snapshot (phase, scores, time remaining). + - name: advance + description: Freeze the current subtask score and advance to the next subtask. + +# Observation fields surfaced to the agent on every /step response. 
+metrics: + observation: + - observation.phase + - observation.current_subtask + - observation.frozen_scores + - observation.time_remaining_s + - observation.plan_score + - observation.subtask_feedback + - observation.episode_reward + reward: + - reward.gate_score + - reward.l1_test_score + - reward.l1_blended + - reward.l2_code_review + - reward.l3_plan_review + - reward.episode_reward diff --git a/spaces/postgres/openenv.yaml b/spaces/postgres/openenv.yaml new file mode 100644 index 0000000..4e37dfe --- /dev/null +++ b/spaces/postgres/openenv.yaml @@ -0,0 +1,89 @@ +spec_version: 1 +name: frontier-swe-postgres +type: space +runtime: fastapi +app: frontier_swe_env.server.app:app +port: 8000 +version: "0.1.0" + +description: > + Frontier SWE — Postgres / SQLite Wire Adapter. An OpenEnv-shaped FastAPI + service hosting a multi-stage systems-programming task: build a PostgreSQL + wire-protocol-compatible server in Zig that uses SQLite as its storage + backend. Agents plan subtasks, edit Zig source in a Linux workspace, run + the gate + test suite, then submit for multi-layer rubric scoring. 
+ +repo: + source: https://github.com/3xcaffeine/frontier-swe-openenv + task_directory: tasks/postgres-sqlite-wire-adapter + +environment: + task_name: postgres-sqlite-wire-adapter + workspace_dir: /app/postgres-sqlite + episode_timeout_s: 2700 + max_attempts_per_subtask: 2 + l1_score_mode: ratio + l1_output_pattern: 'Total:\s*(\d+)/(\d+)\s*passed' + task_domain: systems / databases / Zig + cpus: 8 + memory_mb: 32768 + +rubric: + type: composite + layers: + - name: gate_checks + kind: shell + script: /app/gate_checks.sh + output: GATE_SCORE=N/M (parsed by frontier_swe_env.rubrics.gate_checks) + - name: l1_tests + kind: regex_ratio + command: /app/test_runner.sh + pattern: 'Total:\s*(\d+)/(\d+)\s*passed' + - name: l2_code_review + kind: llm_judge + model_env: FSWE_GRADER_MODEL + api_url_env: FSWE_GRADER_API_URL + api_key_env: FSWE_GRADER_API_KEY + dimensions: + [completeness, correctness, robustness, forward_compatibility] + - name: l3_plan_review + kind: llm_judge + model_env: FSWE_GRADER_MODEL + - name: episode_aggregator + kind: weighted_blend + output_field: observation.episode_reward + +tools: + - name: submit_plan + description: Propose a subtask plan for the episode (PLANNING -> EXECUTING). + parameters: + - name: subtasks + type: list[dict] + required: true + - name: submit_subtask + description: Submit the current subtask for L1 + L2 scoring. + parameters: + - name: subtask_id + type: str + required: true + - name: get_status + description: Return the current episode status snapshot (phase, scores, time remaining). + - name: advance + description: Freeze the current subtask score and advance to the next subtask. 
+ +metrics: + observation: + - observation.phase + - observation.current_subtask + - observation.frozen_scores + - observation.time_remaining_s + - observation.plan_score + - observation.subtask_feedback + - observation.episode_reward + reward: + - reward.gate_score + - reward.l1_test_score + - reward.l1_blended + - reward.l2_code_review + - reward.l3_plan_review + - reward.episode_reward diff --git a/spaces/type-checker/openenv.yaml b/spaces/type-checker/openenv.yaml new file mode 100644 index 0000000..fb22ad0 --- /dev/null +++ b/spaces/type-checker/openenv.yaml @@ -0,0 +1,106 @@ +spec_version: 1 +name: frontier-swe-type-checker +type: space +runtime: fastapi +app: frontier_swe_env.server.app:app +port: 8000 +version: "0.1.0" + +description: > + Frontier SWE — Dependent Type Checker. An OpenEnv-shaped FastAPI service + hosting a Martin-Löf-style dependently-typed language type checker + implementation task. Agents implement a Rust binary at + /app/type-checker/target/release/type-checker that exits 0 iff every + top-level command in supplied .sexp files type-checks. Scored on + correctness gates (accept-rate >= 0.99, reject-rate >= 0.95) followed by + geometric-mean speedup vs the reference implementation. 
+ +repo: + source: https://github.com/3xcaffeine/frontier-swe-openenv + task_directory: tasks/dependent-type-checker + +environment: + task_name: dependent-type-checker + workspace_dir: /app/type-checker + build_command: cd /app/type-checker && cargo build --release + episode_timeout_s: 3600 + max_attempts_per_subtask: 3 + l1_score_mode: reward_json_score + l1_timeout_s: 600 + reward_json_path: /logs/verifier/reward.json + reward_json_score_field: score + reward_json_score_anchors: [0.0, 2.0] + reward_json_score_higher_is_better: true + task_domain: programming languages / type theory + cpus: 8 + memory_mb: 32768 + +rubric: + type: composite + layers: + - name: gate_checks + kind: shell + script: /app/gate_checks.sh + output: GATE_SCORE=N/M + gates: + - workspace + Cargo.toml present + - rustc + cargo on PATH + - cargo build --release succeeds + - name: l1_tests + kind: structured_reward + score_mode: reward_json_score + reward_json_path: /logs/verifier/reward.json + score_field: score + anchors: [0.0, 2.0] + higher_is_better: true + hard_fail_signals: + - additional_data.reason set (correctness gate fail / source scan match) + - score == 0.0 + - name: l2_code_review + kind: llm_judge + model_env: FSWE_GRADER_MODEL + api_url_env: FSWE_GRADER_API_URL + api_key_env: FSWE_GRADER_API_KEY + dimensions: + [completeness, correctness, robustness, forward_compatibility] + - name: l3_plan_review + kind: llm_judge + model_env: FSWE_GRADER_MODEL + - name: episode_aggregator + kind: weighted_blend + output_field: observation.episode_reward + +tools: + - name: submit_plan + description: Propose a subtask plan for the episode (PLANNING -> EXECUTING). + parameters: + - name: subtasks + type: list[dict] + required: true + - name: submit_subtask + description: Submit the current subtask for L1 + L2 scoring. + parameters: + - name: subtask_id + type: str + required: true + - name: get_status + description: Return the current episode status snapshot (phase, scores, time remaining). 
+ - name: advance + description: Freeze the current subtask score and advance to the next subtask. + +metrics: + observation: + - observation.phase + - observation.current_subtask + - observation.frozen_scores + - observation.time_remaining_s + - observation.plan_score + - observation.subtask_feedback + - observation.episode_reward + reward: + - reward.gate_score + - reward.l1_test_score + - reward.l1_blended + - reward.l2_code_review + - reward.l3_plan_review + - reward.episode_reward