3xcaffeine · rycerzes · Apr 26, 2026 · Apr 25, 2026 · Apr 26, 2026 · Apr 26, 2026
diff --git a/SkyRL b/SkyRL
diff --git a/openenv.yaml b/openenv.yaml
@@ -0,0 +1,73 @@
+spec_version: 1
+name: frontier-swe-openenv
+type: package  # repo-root manifest; the per-task manifests under spaces/ use `type: space`
+runtime: fastapi
+version: "0.1.0"
+
+description: >
+  Frontier SWE — OpenEnv. A family of long-horizon software-engineering
+  environments deployed as Hugging Face Spaces, sharing one OpenEnv-shaped
+  FastAPI server (Gym-style /reset, /step, /state plus FastMCP tools) and a
+  multi-layer rubric (gate checks, hidden-test verifier, LLM diff review,
+  LLM plan review) that produces a normalised [0, 1] reward. Each task
+  ships as its own GHCR image and is mirrored to a dedicated HF Space.
+
+repo: https://github.com/3xcaffeine/frontier-swe-openenv  # plain URL here; the per-Space manifests use a mapping (source + task_directory)
+
+# Shared package primitives. Each task Space inherits these and overrides
+# the per-task subset in its own openenv.yaml.
+package:
+  app: frontier_swe_env.server.app:app  # same value each per-Space manifest declares as top-level `app`
+  port: 8000  # same value each per-Space manifest declares as top-level `port`
+  client: frontier_swe_env.client:FrontierSweEnv
+  base_classes:
+    server: openenv.core.env_server.create_app
+    client: openenv.core.EnvClient
+  api:
+    gym_style:  # NOTE(review): `GET  /state` and `GET  /health` keep both spaces in the parsed string — confirm consumers split on whitespace
+      - POST /reset
+      - POST /step
+      - GET  /state
+      - GET  /health
+    mcp:  # NOTE(review): `ANY  /tools/mcp` also carries two spaces inside the scalar
+      - POST /mcp        # OpenEnv POST-only JSON-RPC
+      - ANY  /tools/mcp  # FastMCP Streamable HTTP (POST + GET/SSE)
+  tools:  # names only; descriptions and parameters live in each per-Space manifest's `tools` section
+    - submit_plan
+    - submit_subtask
+    - get_status
+    - advance
+  reserved_tool_names_avoided:  # none of these appear in the `tools` list above
+    - reset
+    - step
+    - state
+    - close
+
+# Tasks bundled in this repo. Each entry points at its per-Space manifest
+# and the live HF Space URL; judges should pull the per-task openenv.yaml
+# from the Space they are evaluating.
+tasks:
+  - name: notebook-compression  # same string as environment.task_name in spaces/notebook/openenv.yaml
+    manifest: spaces/notebook/openenv.yaml
+    hf_space: https://huggingface.co/spaces/rycerzes/frontier-swe-notebook
+    image: ghcr.io/3xcaffeine/frontier-swe-openenv/frontier-swe-notebook:latest
+    domain: systems / compression  # same string as that manifest's environment.task_domain
+  - name: postgres-sqlite-wire-adapter
+    manifest: spaces/postgres/openenv.yaml
+    hf_space: https://huggingface.co/spaces/rycerzes/frontier-swe-postgres
+    image: ghcr.io/3xcaffeine/frontier-swe-openenv/frontier-swe-postgres:latest
+    domain: systems / databases / Zig
+  - name: dependent-type-checker
+    manifest: spaces/type-checker/openenv.yaml
+    hf_space: https://huggingface.co/spaces/rycerzes/frontier-swe-type-checker
+    image: ghcr.io/3xcaffeine/frontier-swe-openenv/frontier-swe-dependent-type-checker:latest  # NOTE(review): image says `dependent-type-checker`, Space and manifest dir say `type-checker` — confirm the GHCR name
+    domain: programming languages / type theory
+  - name: libexpat-to-x86asm  # same string as environment.task_name in spaces/libexpat-to-x86asm/openenv.yaml
+    manifest: spaces/libexpat-to-x86asm/openenv.yaml
+    hf_space: https://huggingface.co/spaces/rycerzes/frontier-swe-libexpat-to-x86asm
+    image: ghcr.io/3xcaffeine/frontier-swe-openenv/frontier-swe-libexpat-to-x86asm:latest
+    domain: systems programming / x86-64 assembly / XML parsing  # kept identical to that manifest's environment.task_domain
+
+# Default landing task for `openenv pull frontier-swe-openenv` style tooling
+# until per-task selection is wired through.
+default_task: dependent-type-checker  # same value as the `name` of the third entry under `tasks`
diff --git a/scripts/prepare_hf_space.py b/scripts/prepare_hf_space.py
@@ -1,8 +1,10 @@
 """Assemble the push payload for an HF Space.
 
 Given a task name, produce a directory that can be force-pushed to the Space:
-- Dockerfile and README.md are lifted from ``spaces/<task>/`` to the payload root
-  (HF requires both at the root for Docker Spaces).
+- Dockerfile and README.md are lifted from ``spaces/<task>/`` to the payload
+  root (HF requires both at the root for Docker Spaces). openenv.yaml is
+  lifted too when the task ships one, so judges pulling the Space see a valid
+  manifest at the URL root; a missing manifest is not an error.
 - The sibling ``spaces/<other-task>/`` subtree is dropped to reduce Space size.
 - ``.gitattributes`` is preserved so HF correctly interprets the LFS-tracked
   bundle.
@@ -34,12 +36,19 @@ def prepare(task: str, out: Path, repo_root: Path) -> None:
     if not space_src.is_dir():
         raise SystemExit(f"expected {space_src} to exist")
 
+    # Required: HF Docker Spaces need Dockerfile + README.md at the root.
     for name in ("Dockerfile", "README.md"):
         src = space_src / name
         if not src.is_file():
             raise SystemExit(f"missing {src}")
         shutil.copy2(src, out / name)
 
+    # Optional: lift openenv.yaml to the root so judges pulling the Space URL
+    # see a valid OpenEnv manifest at the top level. Missing is non-fatal.
+    manifest = space_src / "openenv.yaml"
+    if manifest.is_file():
+        shutil.copy2(manifest, out / "openenv.yaml")
+
     shutil.rmtree(out / "spaces")
     print(out)
 

diff --git a/spaces/libexpat-to-x86asm/openenv.yaml b/spaces/libexpat-to-x86asm/openenv.yaml
@@ -0,0 +1,107 @@
+spec_version: 1
+name: frontier-swe-libexpat-to-x86asm  # same slug as this task's hf_space URL and image name in the repo-root manifest
+type: space  # per-task Space manifest; the repo-root openenv.yaml is `type: package`
+runtime: fastapi
+app: frontier_swe_env.server.app:app  # same value as package.app in the repo-root manifest
+port: 8000  # same value as package.port in the repo-root manifest
+version: "0.1.0"
+
+description: >
+  Frontier SWE — libexpat to x86-64 Assembly. An OpenEnv-shaped FastAPI
+  service hosting a long-horizon task: reimplement libexpat 2.6.4 in x86-64
+  assembly and produce /app/asm-port/libexpat.so exporting the expat C ABI.
+  The verifier builds reference C libexpat, runs the upstream test suite and
+  benchmarks, and writes /logs/verifier/reward.json (0.5 correctness + 0.5
+  performance, hard-fail to 0.0 on anti-cheat or missing .so). Scored with
+  gate checks, structured L1, LLM code/plan review, and weighted episode reward.
+
+repo:
+  source: https://github.com/3xcaffeine/frontier-swe-openenv
+  task_directory: tasks/libexpat-to-x86asm  # NOTE(review): presumably relative to the `source` repo root — confirm
+
+environment:
+  task_name: libexpat-to-x86asm  # same string as this task's `name` in the repo-root manifest
+  workspace_dir: /app/asm-port  # directory the description says must contain libexpat.so
+  build_command: "true"  # quoted, so a string rather than a YAML boolean; presumably the shell no-op — confirm
+  episode_timeout_s: 3600  # presumably seconds (per the `_s` suffix) — confirm
+  max_attempts_per_subtask: 3
+  l1_score_mode: reward_json_score  # same mode as the l1_tests rubric layer's score_mode
+  l1_timeout_s: 1500  # presumably seconds (per the `_s` suffix) — confirm
+  reward_json_path: /logs/verifier/reward.json  # the file the description says the verifier writes
+  reward_json_score_field: score  # same value as l1_tests.score_field
+  reward_json_score_anchors: [0.0, 1.0]  # same values as l1_tests.anchors
+  reward_json_score_higher_is_better: true  # same value as l1_tests.higher_is_better
+  task_domain: systems programming / x86-64 assembly / XML parsing
+  cpus: 4
+  memory_mb: 8192
+
+rubric:
+  type: composite
+  layers:  # five layers: gate checks, L1 structured reward, two LLM judges, final aggregator
+    - name: gate_checks
+      kind: shell
+      script: /app/gate_checks.sh
+      output: GATE_SCORE=N/M  # the notebook manifest notes this is parsed by frontier_swe_env.rubrics.gate_checks
+      gates:  # free-text descriptions of the checks
+        - /app/asm-port writable, expat.h present
+        - nasm, as, and ld on PATH
+        - encrypted gcc + libexpat bundles staged
+    - name: l1_tests
+      kind: structured_reward
+      score_mode: reward_json_score  # same mode as environment.l1_score_mode
+      reward_json_path: /logs/verifier/reward.json  # same path as environment.reward_json_path
+      score_field: score
+      anchors: [0.0, 1.0]
+      higher_is_better: true
+      hard_fail_signals:  # free-text; matches the description's "hard-fail to 0.0" cases
+        - score == 0.0 (no .so, anti-cheat, verifier infra failure, zero correctness)
+    - name: l2_code_review
+      kind: llm_judge
+      model_env: FSWE_GRADER_MODEL  # presumably the name of an environment variable, not a model id — confirm
+      api_url_env: FSWE_GRADER_API_URL
+      api_key_env: FSWE_GRADER_API_KEY  # a variable name only; no key material is stored in this manifest
+      dimensions:
+        [completeness, correctness, robustness, forward_compatibility]
+    - name: l3_plan_review  # uses the same three grader settings as l2_code_review; declares no dimensions
+      kind: llm_judge
+      model_env: FSWE_GRADER_MODEL
+      api_url_env: FSWE_GRADER_API_URL
+      api_key_env: FSWE_GRADER_API_KEY
+    - name: episode_aggregator
+      kind: weighted_blend  # NOTE(review): no layer weights are declared in this manifest — presumably set in code; confirm
+      output_field: observation.episode_reward  # also listed under metrics.observation
+
+tools:  # the same four names as package.tools in the repo-root manifest
+  - name: submit_plan
+    description: Propose a subtask plan for the episode (PLANNING -> EXECUTING).
+    parameters:
+      - name: subtasks
+        type: list[dict]  # NOTE(review): the dict keys are not specified in this manifest
+        required: true
+  - name: submit_subtask
+    description: Submit the current subtask for L1 + L2 scoring.
+    parameters:
+      - name: subtask_id
+        type: str
+        required: true
+  - name: get_status  # declares no parameters
+    description: Return the current episode status snapshot (phase, scores, time remaining).
+  - name: advance  # declares no parameters
+    description: Freeze the current subtask score and advance to the next subtask.
+
+metrics:
+  observation:  # the notebook manifest describes these as surfaced to the agent on every /step response
+    - observation.phase
+    - observation.current_subtask
+    - observation.frozen_scores
+    - observation.time_remaining_s
+    - observation.plan_score
+    - observation.subtask_feedback
+    - observation.episode_reward  # the episode_aggregator layer's output_field
+  reward:  # names line up with the rubric layers above (gate, L1, L2, L3, episode)
+    - reward.gate_score
+    - reward.l1_test_score
+    - reward.l1_blended  # NOTE(review): not defined in this manifest — confirm what is blended
+    - reward.l2_code_review
+    - reward.l3_plan_review
+    - reward.episode_reward
diff --git a/spaces/notebook/openenv.yaml b/spaces/notebook/openenv.yaml
@@ -0,0 +1,99 @@
+spec_version: 1
+name: frontier-swe-notebook  # same slug as this task's hf_space URL and image name in the repo-root manifest
+type: space  # per-task Space manifest; the repo-root openenv.yaml is `type: package`
+runtime: fastapi
+app: frontier_swe_env.server.app:app  # same value as package.app in the repo-root manifest
+port: 8000  # same value as package.port in the repo-root manifest
+version: "0.1.0"
+
+description: >
+  Frontier SWE — Notebook Compression. An OpenEnv-shaped FastAPI service that
+  hosts a long-horizon software engineering task: build a fit/compress/decompress
+  pipeline for Jupyter notebooks. Agents plan subtasks, edit code in a Linux
+  workspace, then submit for multi-layer rubric scoring (gate checks, hidden
+  test verifier, LLM diff review, LLM plan review). The verifier writes a
+  structured reward.json the rubric layer normalises into a [0, 1] reward.
+
+repo:
+  source: https://github.com/3xcaffeine/frontier-swe-openenv
+  task_directory: tasks/notebook-compression  # NOTE(review): presumably relative to the `source` repo root — confirm
+
+# Task-level defaults (overridable via FSWE_TASK_MODE=demo|training and env vars).
+environment:
+  task_name: notebook-compression  # same string as this task's `name` in the repo-root manifest
+  workspace_dir: /app
+  build_command: ""  # explicit empty string; presumably means no build step — confirm
+  episode_timeout_s: 3600  # presumably seconds (per the `_s` suffix) — confirm
+  max_attempts_per_subtask: 2
+  l1_score_mode: reward_json  # same mode as the l1_tests rubric layer's score_mode
+  reward_json_path: /logs/verifier/reward.json  # the structured reward.json the description says the verifier writes
+  task_domain: systems / compression  # same string as this task's `domain` in the repo-root manifest
+  cpus: 8
+  memory_mb: 32768
+
+# Composite multi-layer rubric. Final episode reward is a weighted blend of
+# all layers, normalised to [0, 1].
+rubric:
+  type: composite
+  layers:  # five layers: gate checks, L1 structured reward, two LLM judges, final aggregator
+    - name: gate_checks
+      kind: shell
+      script: /app/gate_checks.sh
+      output: GATE_SCORE=N/M  # parsed by frontier_swe_env.rubrics.gate_checks
+    - name: l1_tests
+      kind: structured_reward
+      score_mode: reward_json  # same mode as environment.l1_score_mode
+      reward_json_path: /logs/verifier/reward.json  # same path as environment.reward_json_path
+    - name: l2_code_review
+      kind: llm_judge
+      model_env: FSWE_GRADER_MODEL  # presumably the name of an environment variable, not a model id — confirm
+      api_url_env: FSWE_GRADER_API_URL
+      api_key_env: FSWE_GRADER_API_KEY  # a variable name only; no key material is stored in this manifest
+      dimensions:
+        [completeness, correctness, robustness, forward_compatibility]
+    - name: l3_plan_review  # uses the same three grader settings as l2_code_review; declares no dimensions
+      kind: llm_judge
+      model_env: FSWE_GRADER_MODEL
+      api_url_env: FSWE_GRADER_API_URL
+      api_key_env: FSWE_GRADER_API_KEY
+    - name: episode_aggregator
+      kind: weighted_blend  # NOTE(review): no layer weights are declared in this manifest — presumably set in code; confirm
+      output_field: observation.episode_reward  # also listed under metrics.observation
+
+# MCP tools exposed by this environment. None of these collide with OpenEnv's
+# reserved tool names (reset, step, state, close).
+tools:  # the same four names as package.tools in the repo-root manifest
+  - name: submit_plan
+    description: Propose a subtask plan for the episode (PLANNING -> EXECUTING).
+    parameters:
+      - name: subtasks
+        type: list[dict]  # NOTE(review): the dict keys are not specified in this manifest
+        required: true
+  - name: submit_subtask
+    description: Submit the current subtask for L1 + L2 scoring.
+    parameters:
+      - name: subtask_id
+        type: str
+        required: true
+  - name: get_status  # declares no parameters
+    description: Return the current episode status snapshot (phase, scores, time remaining).
+  - name: advance  # declares no parameters
+    description: Freeze the current subtask score and advance to the next subtask.
+
+# Observation fields surfaced to the agent on every /step response.
+metrics:
+  observation:
+    - observation.phase
+    - observation.current_subtask
+    - observation.frozen_scores
+    - observation.time_remaining_s
+    - observation.plan_score
+    - observation.subtask_feedback
+    - observation.episode_reward  # the episode_aggregator layer's output_field
+  reward:  # names line up with the rubric layers above (gate, L1, L2, L3, episode)
+    - reward.gate_score
+    - reward.l1_test_score
+    - reward.l1_blended  # NOTE(review): not defined in this manifest — confirm what is blended
+    - reward.l2_code_review
+    - reward.l3_plan_review
+    - reward.episode_reward