diff --git a/pyproject.toml b/pyproject.toml index 0249be67..c7c5c55f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,9 @@ ttt = [ "fastapi>=0.115.0", "uvicorn>=0.30.0", ] +frontier-cs = [ + "frontier-cs @ git+https://github.com/FrontierCS/Frontier-CS.git", +] dev = [ "pytest>=8.0.0", "pytest-asyncio>=0.24.0", diff --git a/ttt/README.md b/ttt/README.md index 9f5fcd15..10109d9d 100644 --- a/ttt/README.md +++ b/ttt/README.md @@ -91,7 +91,9 @@ This incentivizes changes that improve eval scores relative to the previous comm ttt/ coral_api_server.py FastAPI proxy: SGLang forwarding, logprob extraction, sample creation coral_rollout.py SLIME rollout function: agent lifecycle, eval monitoring, reward assignment - run_coral_rl.sh Training launcher: Ray, SLIME train_async.py, model/GPU config + coral_distill.py Self-distillation rollout: SFT on improving trajectories + run_coral_rl.sh RL training launcher: Ray, SLIME train_async.py, model/GPU config + run_coral_distill.sh Distillation training launcher: SFT variant of RL training run_coral_rl_docker.sh Docker wrapper for run_coral_rl.sh docker/ Dockerfile Builds on SLIME base image, adds CORAL + opencode @@ -99,6 +101,8 @@ ttt/ slime/ SLIME framework (vendored, see acknowledgments) examples/ circle_packing/ Example task: pack 26 circles into a unit square + eplb/ Expert Parallelism Load Balancer (MoE optimization) + frontier_cs/ Frontier-CS Research: cant_be_late scheduling (requires frontier-cs package) README.md This file ``` diff --git a/ttt/examples/eplb/eval/evaluator.py b/ttt/examples/eplb/eval/evaluator.py new file mode 100644 index 00000000..4abbc371 --- /dev/null +++ b/ttt/examples/eplb/eval/evaluator.py @@ -0,0 +1,197 @@ +import functools +import importlib.util +import json +import time +import traceback +from typing import TypedDict + +import torch +import os + +# Get the directory of this file and construct workload path +_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) +WORKLOAD_PATH = 
os.path.join(_CURRENT_DIR, "expert-load.json") +REBALANCE_INTERVAL = 100 + +NUM_REPLICAS = 288 +NUM_GROUPS = 8 +NUM_GPUS = 32 +NUM_NODES = 4 + +@functools.cache +def load_workloads(path: str) -> list[torch.Tensor]: + with open(path, "r") as f: + data = json.load(f) + + total_len = len(data['load_history']) + workloads = [] + for i in range(0, total_len, REBALANCE_INTERVAL): + start = i + end = min(start + REBALANCE_INTERVAL, total_len) + + load = torch.tensor([x['logical_expert_load'] for x in data['load_history'][start:end]]).sum(dim=0) + workloads.append(load) + + return workloads + +class EvaluationResult(TypedDict, total=False): + balancedness_score_gpu: float + balancedness_score_expert: float + times_algorithm: float + times_inference: float + speed_score: float + combined_score: float + error: str + +def simulate_inference( + log2phy: torch.Tensor, + logcnt: torch.Tensor, + workload: torch.Tensor, + ) -> tuple[float, float]: + ''' + Simulate a MoE inference with the given expert mapping, and return the balancedness factor. + ''' + # workload 形状: (num_layers, num_logical_experts) - 每层每个逻辑专家的负载 + num_layers, num_logical_experts = workload.shape + + # 初始化物理专家负载累积器 + num_physical_experts = NUM_REPLICAS + total_physical_load = torch.zeros(num_layers, num_physical_experts, dtype=torch.float, device=workload.device) + + # 对每个逻辑专家,分配负载到其物理副本 + for layer_id in range(num_layers): + for logical_id in range(num_logical_experts): + # 获取该逻辑专家的负载 + logical_load = workload[layer_id][logical_id].item() + + # 跳过零负载 + if logical_load <= 0: + continue + + num_replicas = int(logcnt[layer_id][logical_id].item()) + + if num_replicas <= 0: + # Expert has load but no replicas — penalize by concentrating + # all its load on physical slot 0 (worst-case imbalance). 
+ total_physical_load[layer_id, 0] += logical_load + continue + + # 获取物理专家映射 + physical_ids = log2phy[layer_id][logical_id][:num_replicas] + + # 计算每个副本的负载(基于有效副本数量) + replica_load = logical_load / num_replicas + + # 分配负载到有效的物理专家 + total_physical_load[layer_id, physical_ids] += replica_load + + # 计算 balancedness + total_load = total_physical_load.sum() + if total_load == 0: + return 0.0, 0.0 + + # Compute expert load + expert_layer_avg = total_physical_load.mean(dim=1).sum().item() + expert_layer_max = total_physical_load.max(dim=1).values.sum().item() + balancedness_expert = expert_layer_avg / expert_layer_max + + # 计算 GPU 负载 + gpu_load = total_physical_load.view(num_layers, NUM_GPUS, -1).sum(dim=2) + + # 计算每层的平均负载和最大负载,然后求和 + layer_avg = gpu_load.mean(dim=1) # (num_layers,) + layer_max = gpu_load.max(dim=1).values # (num_layers,) + + avg_load = layer_avg.sum().item() + max_load = layer_max.sum().item() + + # 计算 balancedness: avg_load / max_load + balancedness_gpu = avg_load / max_load if max_load > 0 else 0.0 + + # print(f'balancedness per GPU: {balancedness}, balancedness per expert: {balancedness_expert}') + + return balancedness_gpu, balancedness_expert + +def evaluate(program_path: str) -> EvaluationResult: + workloads = load_workloads(WORKLOAD_PATH) + + try: + spec = importlib.util.spec_from_file_location("program", program_path) + assert spec is not None + program = importlib.util.module_from_spec(spec) + assert spec.loader is not None + spec.loader.exec_module(program) + + if not hasattr(program, "rebalance_experts"): + print('Error: program does not have `rebalance_experts` function') + return { + "balancedness_score_gpu": 0.0, + "balancedness_score_expert": 0.0, + "times_algorithm": 0.0, + "times_inference": 0.0, + "speed_score": 0.0, + "combined_score": 0.0, + "error": "Missing `rebalance_experts` function", + } + + if not hasattr(program, "rebalance_experts"): + raise ValueError("Program does not have rebalance_experts function") + + 
balancedness_scores_gpu = [] + balancedness_scores_expert = [] + times_algorithm = [] + times_inference = [] + for i in range(len(workloads) - 1): + start_time = time.perf_counter() + _, log2phy, logcnt = program.rebalance_experts( + workloads[i], + NUM_REPLICAS, + NUM_GROUPS, + NUM_NODES, + NUM_GPUS, + ) + end_time_algorithm = time.perf_counter() + balancedness_score_gpu, balancedness_score_expert = simulate_inference(log2phy, logcnt, workloads[i + 1]) + end_time = time.perf_counter() + balancedness_scores_gpu.append(balancedness_score_gpu) + balancedness_scores_expert.append(balancedness_score_expert) + print(f'time_algorithm: {end_time_algorithm - start_time}, time_inference: {end_time - start_time}') + times_algorithm.append(end_time_algorithm - start_time) + times_inference.append(end_time - start_time) + + avg_balancedness_score_gpu = sum(balancedness_scores_gpu) / len(balancedness_scores_gpu) + avg_balancedness_score_expert = sum(balancedness_scores_expert) / len(balancedness_scores_expert) + avg_time_algorithm = sum(times_algorithm) / len(times_algorithm) + avg_time_inference = sum(times_inference) / len(times_inference) + speed_score = 0.002 / avg_time_inference + print(f'avg_time_algorithm: {avg_time_algorithm}, avg_time_inference: {avg_time_inference}, speed_score: {speed_score}') + combined_score = (avg_balancedness_score_expert + speed_score) / 2 + return { + "balancedness_score_gpu": float(avg_balancedness_score_gpu), + "balancedness_score_expert": float(avg_balancedness_score_expert), + "times_algorithm": float(avg_time_algorithm), + "times_inference": float(avg_time_inference), + "speed_score": float(speed_score), + "combined_score": float(combined_score), + } + except Exception as e: + traceback.print_exc() + print(f'Error during evaluation: {str(e)}') + return { + "balancedness_score_gpu": 0.0, + "balancedness_score_expert": 0.0, + "times_algorithm": 0.0, + "times_inference": 0.0, + "speed_score": 0.0, + "combined_score": 0.0, + "error": str(e), + 
} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is auto-injected at build time from + # skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) \ No newline at end of file diff --git a/ttt/examples/eplb/eval/grader.py b/ttt/examples/eplb/eval/grader.py new file mode 100644 index 00000000..54b3f893 --- /dev/null +++ b/ttt/examples/eplb/eval/grader.py @@ -0,0 +1,71 @@ +"""CORAL grader for the EPLB (Expert Parallelism Load Balancer) task. + +Wraps the skydiscover evaluator. Expects expert-load.json to be present +alongside this file in the eval/ directory. + +Setup: + wget https://huggingface.co/datasets/abmfy/eplb-openevolve/resolve/main/expert-load.json + cp expert-load.json examples/ADRS/eplb/eval/expert-load.json +""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + def evaluate(self) -> ScoreBundle: + program_file = self.args.get("program_file", "initial_program.py") + program_path = os.path.join(self.codebase_path, program_file) + + if not os.path.exists(program_path): + return self.fail(f"Program file not found: {program_file}") + + # The evaluator uses __file__ to locate expert-load.json, so it must + # be imported from its actual location in .coral/private/eval/. + eval_dir = str(Path(self.private_dir) / "eval") + if eval_dir not in sys.path: + sys.path.insert(0, eval_dir) + + data_file = Path(eval_dir) / "expert-load.json" + if not data_file.exists(): + return self.fail( + "expert-load.json not found. 
Download it and place it in " + "examples/ADRS/eplb/eval/expert-load.json:\n" + " wget https://huggingface.co/datasets/abmfy/eplb-openevolve" + "/resolve/main/expert-load.json" + ) + + try: + import importlib.util + spec = importlib.util.spec_from_file_location( + "eplb_evaluator", str(Path(eval_dir) / "evaluator.py") + ) + evaluator_mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(evaluator_mod) + + result = evaluator_mod.evaluate(program_path) + if "error" in result: + return self.fail(result["error"]) + + combined_score = result.get("combined_score", 0.0) + bal_gpu = result.get("balancedness_score_gpu", 0.0) + bal_expert = result.get("balancedness_score_expert", 0.0) + speed = result.get("speed_score", 0.0) + t_algo = result.get("times_algorithm", 0.0) + t_infer = result.get("times_inference", 0.0) + + explanation = ( + f"combined={combined_score:.4f} | " + f"bal_expert={bal_expert:.4f} | bal_gpu={bal_gpu:.4f} | " + f"speed={speed:.4f} | t_algo={t_algo:.4f}s | t_infer={t_infer:.4f}s" + ) + return self.score(combined_score, explanation) + + except Exception as e: + return self.fail(f"Evaluation error: {e}") diff --git a/ttt/examples/eplb/seed/initial_program.py b/ttt/examples/eplb/seed/initial_program.py new file mode 100644 index 00000000..04cb8c59 --- /dev/null +++ b/ttt/examples/eplb/seed/initial_program.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Expert parallelism load balancer (EPLB) for vLLM. + +This module implements the core rearrangement algorithm. + +The rearrangement algorithm is adapted from +[DeepSeek EPLB](https://github.com/deepseek-ai/eplb). + +Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example +on how the EPLB algorithm works. 
+""" + +# EVOLVE-BLOCK-START + +import torch + + +def balanced_packing(weight: torch.Tensor, + num_packs: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Pack n weighted objects to m packs, such that each bin contains exactly + n/m objects and the weights of all packs are as balanced as possible. + + Parameters: + weight: [X, n], the weight of each item + num_packs: number of packs + + Returns: + pack_index: [X, n], the pack index of each item + rank_in_pack: [X, n], the rank of the item in the pack + """ + num_layers, num_groups = weight.shape + assert num_groups % num_packs == 0 + groups_per_pack = num_groups // num_packs + + if groups_per_pack == 1: + pack_index = torch.arange(weight.size(-1), + dtype=torch.int64, + device=weight.device).expand(weight.shape) + rank_in_pack = torch.zeros_like(weight, dtype=torch.int64) + return pack_index, rank_in_pack + + indices = weight.float().sort(-1, descending=True).indices.cpu() + pack_index = torch.full_like(weight, + fill_value=-1, + dtype=torch.int64, + device="cpu") + rank_in_pack = torch.full_like(pack_index, fill_value=-1) + for i in range(num_layers): + pack_weights = [0] * num_packs + pack_items = [0] * num_packs + for group in indices[i]: + pack = min( + (i + for i in range(num_packs) if pack_items[i] < groups_per_pack), + key=pack_weights.__getitem__, + ) + assert pack_items[pack] < groups_per_pack + pack_index[i, group] = pack + rank_in_pack[i, group] = pack_items[pack] + pack_weights[pack] += weight[i, group] + pack_items[pack] += 1 + return pack_index, rank_in_pack + + +def replicate_experts( + weight: torch.Tensor, + num_phy: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Replicate `num_log` experts to `num_phy` replicas, such that the maximum + load of all replicas is minimized. 
+ + Parameters: + weight: [X, num_log] + num_phy: total number of experts after replication + + Returns: + phy2log: [X, num_phy], logical expert id of each physical expert + rank: [X, num_phy], the replica rank + logcnt: [X, num_log], number of replicas for each logical expert + """ + n, num_log = weight.shape + num_redundant = num_phy - num_log + assert num_redundant >= 0 + device = weight.device + phy2log = torch.arange(num_phy, dtype=torch.int64, + device=device).repeat(n, 1) + rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device) + logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device) + arangen = torch.arange(n, dtype=torch.int64, device=device) + for i in range(num_log, num_phy): + redundant_indices = (weight / logcnt).max(dim=-1).indices + phy2log[:, i] = redundant_indices + rank[:, i] = logcnt[arangen, redundant_indices] + logcnt[arangen, redundant_indices] += 1 + return phy2log, rank, logcnt + + +def rebalance_experts_hierarchical( + weight: torch.Tensor, + num_physical_experts: int, + num_groups: int, + num_nodes: int, + num_gpus: int, +): + """ + Parameters: + weight: [num_moe_layers, num_logical_experts] + num_physical_experts: number of physical experts after replication + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster + num_gpus: number of GPUs, must be a multiple of `num_nodes` + + Returns: + physical_to_logical_map: [num_moe_layers, num_physical_experts] + logical_to_physical_map: [num_moe_layers, num_logical_experts, X] + logical_count: [num_moe_layers, num_logical_experts] + """ + num_layers, num_logical_experts = weight.shape + assert num_logical_experts % num_groups == 0 + group_size = num_logical_experts // num_groups + assert num_groups % num_nodes == 0 + groups_per_node = num_groups // num_nodes + assert num_gpus % num_nodes == 0 + assert num_physical_experts % num_gpus == 0 + phy_experts_per_gpu = num_physical_experts // num_gpus + + def 
inverse(perm: torch.Tensor) -> torch.Tensor: + inv = torch.empty_like(perm) + inv.scatter_( + 1, + perm, + torch.arange(perm.size(1), dtype=torch.int64, + device=perm.device).expand(perm.shape), + ) + return inv + + # Step 1: pack groups to nodes + tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1) + group_pack_index, group_rank_in_pack = balanced_packing( + tokens_per_group, num_nodes) + log2mlog = (((group_pack_index * groups_per_node + group_rank_in_pack) * + group_size).unsqueeze(-1) + + torch.arange(group_size, + dtype=torch.int64, + device=group_pack_index.device)).flatten(-2) + mlog2log = inverse(log2mlog) + + # Step 2: construct redundant experts within nodes + # [num_layers * num_nodes, num_logical_experts // num_nodes] + tokens_per_mlog = weight.gather(-1, mlog2log).view( + -1, num_logical_experts // num_nodes) + phy2mlog, phyrank, mlogcnt = replicate_experts( + tokens_per_mlog, num_physical_experts // num_nodes) + + # Step 3: pack physical_experts to GPUs + # [num_layers * num_nodes, num_physical_experts // num_nodes] + tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog) + pack_index, rank_in_pack = balanced_packing(tokens_per_phy, + num_gpus // num_nodes) + phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack + pphy2phy = inverse(phy2pphy) + + pphy2mlog = phy2mlog.gather( + -1, pphy2phy) # [num_layers * num_nodes, num_log_per_nodes] + pphy2mlog = (pphy2mlog.view(num_layers, num_nodes, -1) + torch.arange( + 0, + num_logical_experts, + num_logical_experts // num_nodes, + device=group_pack_index.device, + ).view(1, -1, 1)).flatten(-2) + pphy2log = mlog2log.gather(-1, pphy2mlog) + pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1) + logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog) + return pphy2log, pphyrank, logcnt + + +def rebalance_experts( + weight: torch.Tensor, + num_replicas: int, + num_groups: int, + num_nodes: int, + num_gpus: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 
+ """ + Entry point for expert-parallelism load balancer. + + Parameters: + weight: [layers, num_logical_experts], the load statistics for all + logical experts + num_replicas: number of physical experts, must be a multiple of + `num_gpus` + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster + num_gpus: number of GPUs, must be a multiple of `num_nodes` + + Returns: + physical_to_logical_map: [layers, num_replicas], the expert index of + each replica + logical_to_physical_map: [layers, num_logical_experts, X], the replica + indices for each expert + expert_count: [layers, num_logical_experts], number of physical + replicas for each logical expert + """ + num_layers, num_logical_experts = weight.shape + weight = weight.float().cpu() + if num_groups % num_nodes == 0: + # use hierarchical load-balance policy + phy2log, phyrank, logcnt = rebalance_experts_hierarchical( + weight, num_replicas, num_groups, num_nodes, num_gpus) + else: + # use global load-balance policy + phy2log, phyrank, logcnt = rebalance_experts_hierarchical( + weight, num_replicas, 1, 1, num_gpus) + num_redundant_experts = num_replicas - num_logical_experts + maxlogcnt = num_redundant_experts + 1 + log2phy: torch.Tensor = torch.full( + (num_layers, num_logical_experts, maxlogcnt), + -1, + dtype=torch.int64, + device=logcnt.device, + ) + log2phy.view(num_layers, -1).scatter_( + -1, + phy2log * maxlogcnt + phyrank, + torch.arange(num_replicas, dtype=torch.int64, + device=log2phy.device).expand(num_layers, -1), + ) + return phy2log, log2phy, logcnt + + +# EVOLVE-BLOCK-END + +__all__ = ["rebalance_experts"] + diff --git a/ttt/examples/eplb/seed/litellm_config.yaml b/ttt/examples/eplb/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/eplb/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + 
api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/eplb/seed/opencode.json b/ttt/examples/eplb/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/eplb/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/eplb/task.yaml b/ttt/examples/eplb/task.yaml new file mode 100644 index 00000000..500bacfe --- /dev/null +++ b/ttt/examples/eplb/task.yaml @@ -0,0 +1,66 @@ +task: + name: "Expert Parallelism Load Balancer (EPLB) TTT" + description: | + You are an expert programmer specializing in optimization algorithms. Your task is + to improve the Mixture-of-Expert models Expert Parallelism Load Balancer (MoE EPLB) + expert rearrangement algorithm. + + This algorithm will take the load metrics recorded by the vLLM server, and rearrange + the experts to balance the load. 
It can make replicas of some experts to achieve + better load balancing. + + Your goal will be two-fold: + 1. Improve the algorithm to achieve better load balancing; while + 2. Improve the algorithm to be more efficient, i.e. reduce the execution time + of the algorithm itself, since perfect load balancing is NP-hard. + + The current algorithm is implemented in the `rebalance_experts` function: + + def rebalance_experts( + weight: torch.Tensor, # [num_moe_layers, num_logical_experts] load stats + num_replicas: int, # total physical experts (must be multiple of num_gpus) + num_groups: int, # number of expert groups + num_nodes: int, # number of server nodes + num_gpus: int, # number of GPUs (multiple of num_nodes) + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + # Returns: (physical_to_logical_map, logical_to_physical_map, expert_count) + + The score is: 0.5 * balancedness_score_expert + 0.5 * speed_score + where speed_score = 0.002 / avg_inference_time (higher is better). + tips: | + - Eval timeout is 360s. If your solution takes longer, it scores as a timeout. + - The evaluator checks: load balancedness across experts and GPUs, plus algorithm speed. + - torch is always available. + - Focus on both algorithmic quality AND implementation efficiency. + - The current algorithm uses hierarchical packing. Consider: ILP relaxations, + greedy heuristics, GPU-accelerated sorting, vectorized operations. 
+ +grader: + timeout: 360 + direction: maximize + args: + program_file: "initial_program.py" + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/eplb/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs/eval/grader.py b/ttt/examples/frontier_cs/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir 
points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs/generate_tasks.py b/ttt/examples/frontier_cs/generate_tasks.py new file mode 100644 index 00000000..936bf2d3 --- /dev/null +++ b/ttt/examples/frontier_cs/generate_tasks.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +"""Generate ttt task configs for all CPU-only Frontier-CS research problems. + +Reads from examples/frontier_cs_research/*/task.yaml, filters to needs_gpu=false, +and generates ttt/examples/frontier_cs_tasks//task.yaml with shared eval +and per-problem seed directories. + +Usage: + python ttt/examples/frontier_cs/generate_tasks.py + +All generated tasks share the same grader (eval/grader.py), litellm_config.yaml, +and opencode.json. Each problem gets its own seed/ with solution.py and statement.md. 
+""" + +import shutil +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent +UPSTREAM_DIR = REPO_ROOT / "examples" / "frontier_cs_research" +OUTPUT_DIR = REPO_ROOT / "ttt" / "examples" / "frontier_cs_tasks" +TEMPLATE_DIR = REPO_ROOT / "ttt" / "examples" / "frontier_cs" + +# Shared files (same for all problems) +SHARED_EVAL = TEMPLATE_DIR / "eval" / "grader.py" +SHARED_LITELLM = TEMPLATE_DIR / "seed" / "litellm_config.yaml" +SHARED_OPENCODE = TEMPLATE_DIR / "seed" / "opencode.json" + +TASK_YAML_TEMPLATE = """\ +task: + name: "Frontier-CS: {display_name} (TTT)" + description: | + Solve the '{display_name}' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: {timeout}s. + - Language: {language}. 
+ +grader: + timeout: {timeout} + direction: maximize + args: + problem_name: {problem_name} + variant_name: "{variant_name}" + language: {language} + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/{dir_name}/seed" + +run: + verbose: false + ui: false + session: local +""" + + +def main(): + cpu_problems = [] + + for task_dir in sorted(UPSTREAM_DIR.iterdir()): + task_yaml = task_dir / "task.yaml" + if not task_yaml.exists(): + continue + + with open(task_yaml) as f: + config = yaml.safe_load(f) + + grader_args = config.get("grader", {}).get("args", {}) + if grader_args.get("needs_gpu", True): + continue + + cpu_problems.append((task_dir, config, grader_args)) + + print(f"Found {len(cpu_problems)} CPU-only problems") + + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + + for task_dir, config, grader_args in cpu_problems: + dir_name = task_dir.name + out_dir = OUTPUT_DIR / dir_name + + # Create directories + (out_dir / "eval").mkdir(parents=True, exist_ok=True) + (out_dir / "seed").mkdir(parents=True, exist_ok=True) + + # Copy shared eval + shutil.copy2(SHARED_EVAL, out_dir / "eval" / "grader.py") + + # Copy shared config files + shutil.copy2(SHARED_LITELLM, out_dir / "seed" / "litellm_config.yaml") + shutil.copy2(SHARED_OPENCODE, out_dir / "seed" / "opencode.json") + + # Copy problem-specific seed files + upstream_seed = task_dir / "seed" + if upstream_seed.exists(): + for f in upstream_seed.iterdir(): + if f.is_file(): + shutil.copy2(f, out_dir / "seed" / f.name) + + # Generate task.yaml + problem_name = grader_args.get("problem_name", "") + variant_name = grader_args.get("variant_name", "") + language = grader_args.get("language", "python") + timeout = config.get("grader", 
{}).get("timeout", 1800) + + if variant_name: + display_name = f"{problem_name} ({variant_name})" + else: + display_name = problem_name + + task_content = TASK_YAML_TEMPLATE.format( + display_name=display_name, + problem_name=problem_name, + variant_name=variant_name, + language=language, + timeout=timeout, + dir_name=dir_name, + ) + + (out_dir / "task.yaml").write_text(task_content) + print(f" ✓ {dir_name}") + + print(f"\nGenerated {len(cpu_problems)} task configs in {OUTPUT_DIR}") + print(f"\nRun any problem with:") + print(f" CORAL_TASK_YAML=ttt/examples/frontier_cs_tasks//task.yaml \\") + print(f" ./ttt/run_coral_distill.sh") + + +if __name__ == "__main__": + main() diff --git a/ttt/examples/frontier_cs/seed/litellm_config.yaml b/ttt/examples/frontier_cs/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs/seed/opencode.json b/ttt/examples/frontier_cs/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + 
"provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs/seed/solution.py b/ttt/examples/frontier_cs/seed/solution.py new file mode 100644 index 00000000..b7759637 --- /dev/null +++ b/ttt/examples/frontier_cs/seed/solution.py @@ -0,0 +1,66 @@ +""" +Example solution for cant-be-late problem. + +Solution interface: + class Solution(Strategy): + def solve(self, spec_path: str) -> "Solution": + # Read config from spec_path and initialize + return self + + def _step(self, last_cluster_type, has_spot) -> ClusterType: + # Decision logic at each simulation step + ... +""" +import json +import math +from argparse import Namespace + +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + + +class Solution(Strategy): + """Greedy strategy: stay on spot until deadline pressure dictates on-demand.""" + + NAME = "greedy_safety" + + def solve(self, spec_path: str) -> "Solution": + """Initialize the solution from spec_path config.""" + with open(spec_path) as f: + config = json.load(f) + + # Create args object for Strategy base class + args = Namespace( + deadline_hours=float(config["deadline"]), + task_duration_hours=[float(config["duration"])], + restart_overhead_hours=[float(config["overhead"])], + inter_task_overhead=[0.0], + ) + super().__init__(args) + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """Make decision at each simulation step.""" + env = self.env + gap = env.gap_seconds + + work_left = self.task_duration - sum(self.task_done_time) + if work_left <= 1e-9: + return 
ClusterType.NONE + + left_ticks = max(0, math.floor((self.deadline - env.elapsed_seconds) / gap)) + need1d = math.ceil((work_left + self.restart_overhead) / gap) + need2d = math.ceil((work_left + 2 * self.restart_overhead) / gap) + + # Must switch to on-demand if we can't afford any more preemptions + if need1d >= left_ticks: + return ClusterType.ON_DEMAND + + # Should be cautious if we can only afford one more preemption + if need2d >= left_ticks: + if env.cluster_type == ClusterType.SPOT and has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + # Otherwise, prefer spot if available + return ClusterType.SPOT if has_spot else ClusterType.NONE diff --git a/ttt/examples/frontier_cs/seed/statement.md b/ttt/examples/frontier_cs/seed/statement.md new file mode 100644 index 00000000..7e644282 --- /dev/null +++ b/ttt/examples/frontier_cs/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. +At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. 
+ +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. + + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - 
SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 70 hours (22-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: High availability (43-78%) + +Your program has a total time limit of 300 seconds. + +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs/task.yaml b/ttt/examples/frontier_cs/task.yaml new file mode 100644 index 00000000..cbf5676a --- /dev/null +++ b/ttt/examples/frontier_cs/task.yaml @@ -0,0 +1,64 @@ +task: + name: "Frontier-CS: Cant-Be-Late Scheduling (TTT)" + description: | + You are an expert programmer specializing in optimization and scheduling algorithms. + Your task is to implement a cloud compute scheduling strategy that minimizes cost + while meeting a hard deadline. + + At each time step, you decide whether to use Spot instances (cheap but unreliable), + On-Demand instances (expensive but guaranteed), or pause (NONE). + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your Solution class must inherit from Strategy and implement the _step() method: + + class Solution(Strategy): + NAME = "my_solution" + + def _step(self, last_cluster_type, has_spot) -> ClusterType: + # Return ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + ... + + Score: 0-100 based on cost minimization (lower cost → higher score). + Failing to meet the deadline gives -100000 penalty. 
+ tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Eval timeout is 1800s. If your solution takes longer, it scores 0. + - Focus on balancing cost vs deadline risk using spot availability patterns. + - The evaluator tests against real Spot instance traces. + - Key attributes in _step: self.env.elapsed_seconds, self.task_duration, + self.deadline, self.restart_overhead. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: high_availability_loose_deadline_large_overhead + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { 
+ "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..7e644282 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 70 hours (22-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: High availability (43-78%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..151082a7 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (high_availability_loose_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late (high_availability_loose_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "high_availability_loose_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { 
+ "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..22bc41d2 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 70 hours (22-hour slack) +- Restart overhead: 0.05 hours (3 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: High availability (43-78%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..d3cd8b45 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (high_availability_loose_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late (high_availability_loose_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "high_availability_loose_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_loose_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { 
+ "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..f45a403b --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 52 hours (4-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: High availability (43-78%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..150bc410 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (high_availability_tight_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late (high_availability_tight_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "high_availability_tight_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { 
+ "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..00fbfb2e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 52 hours (4-hour slack) +- Restart overhead: 0.05 hours (3 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: High availability (43-78%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..aac89a75 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (high_availability_tight_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late (high_availability_tight_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "high_availability_tight_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__high_availability_tight_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + 
"context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..1dd5866f --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 70 hours (22-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: Low availability (4-40%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..9fd14408 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (low_availability_loose_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late (low_availability_loose_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "low_availability_loose_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + 
"context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..67721ec1 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 70 hours (22-hour slack) +- Restart overhead: 0.05 hours (3 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: Low availability (4-40%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..2156d0fd --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (low_availability_loose_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late (low_availability_loose_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "low_availability_loose_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_loose_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + 
"context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..04ba5015 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 52 hours (4-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: Low availability (4-40%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..4700c86a --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (low_availability_tight_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late (low_availability_tight_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "low_availability_tight_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + 
"context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..e8ab37e4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 52 hours (4-hour slack) +- Restart overhead: 0.05 hours (3 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: Low availability (4-40%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..20e662b3 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (low_availability_tight_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late (low_availability_tight_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "low_availability_tight_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__low_availability_tight_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + 
"limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..ec3f910f --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 70 hours (22-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: Mixed availability (both high 43-78% and low 4-40%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..02c2771b --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (mixed_availability_loose_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late (mixed_availability_loose_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "mixed_availability_loose_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + 
"limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..398c9088 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 70 hours (22-hour slack) +- Restart overhead: 0.05 hours (3 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: Mixed availability (both high 43-78% and low 4-40%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..cd8e2994 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (mixed_availability_loose_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late (mixed_availability_loose_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "mixed_availability_loose_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_loose_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + 
"limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..8e352d0b --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 52 hours (4-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: Mixed availability (both high 43-78% and low 4-40%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..763b7234 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (mixed_availability_tight_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late (mixed_availability_tight_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "mixed_availability_tight_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + 
"limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..4e176d27 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed/statement.md @@ -0,0 +1,118 @@ +Cant-Be-Late Scheduling Problem +================================ + +Problem Setting +-------- + +You are given a long-running compute job that must complete before a fixed **hard deadline**. 
+At each time step, you must choose which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + +The evaluation uses many real spot-availability traces. + +--- + +API Specification +----------------- + +Implement a `Solution` class that inherits from `Strategy`: + +```python +from sky_spot.strategies.strategy import Strategy +from sky_spot.utils import ClusterType + +class Solution(Strategy): + NAME = "my_solution" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Optional initialization. Called once before evaluation. + Read spec_path for configuration if needed. + Must return self. + """ + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Called at each time step. Return which cluster type to use next. 
+ + Args: + last_cluster_type: The cluster type used in the previous step + has_spot: Whether spot instances are available this step + + Returns: + ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND + + @classmethod + def _from_args(cls, parser): # REQUIRED: For evaluator instantiation + args, _ = parser.parse_known_args() + return cls(args) +``` + +Available Attributes in `_step`: +- `self.env.elapsed_seconds`: Current time elapsed (seconds) +- `self.env.gap_seconds`: Time step size (seconds) +- `self.env.cluster_type`: Current cluster type +- `self.task_duration`: Total task duration needed (seconds) +- `self.task_done_time`: List of completed work segments +- `self.deadline`: Deadline time (seconds) +- `self.restart_overhead`: Time overhead when restarting (seconds) + +ClusterType Values: +- `ClusterType.SPOT`: Use spot instance +- `ClusterType.ON_DEMAND`: Use on-demand instance +- `ClusterType.NONE`: Do nothing this step (no cost) + +Scoring (0-100) +--------------- +``` +OD_anchor = Cost of running fully on-demand (baseline upper bound) +SPOT_anchor = Cost of running fully on spot (baseline lower bound) +AvgCost = Your strategy's average cost + +normalized_score = (OD_anchor - AvgCost) / (OD_anchor - SPOT_anchor) +score = clip(normalized_score, 0, 1) × 100 +``` + +If you fail to finish before the deadline, you receive a penalty score of -100000. + +Evaluation Details +------------------ +- Tested on real Spot instance traces +- Task duration: 48 hours +- Deadline: 52 hours (4-hour slack) +- Restart overhead: 0.05 hours (3 minutes) +- Price of on-demand: ~3.06$/hr +- Price of Spot: ~0.97$/hr +- Regions: Mixed availability (both high 43-78% and low 4-40%) + +Your program has a total time limit of 300 seconds. 
+ +Implementation Notes +--------------------- +**Required Elements:** +- `NAME` attribute must be defined on your Solution class +- `_from_args` classmethod must be implemented +- `solve()` must return `self` +- `_step()` must not return `ClusterType.SPOT` when `has_spot=False` + diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/task.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..8c0516d7 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late (mixed_availability_tight_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late (mixed_availability_tight_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late + variant_name: "mixed_availability_tight_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late__mixed_availability_tight_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + 
"name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..a057f75f --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed/statement.md @@ -0,0 +1,235 @@ +Cant-Be-Late Multi-Region Scheduling Problem +================================ + +Problem Setting +--------------- + +You are given a long-running compute job that must complete before a fixed hard deadline. 
+At each time step, you must choose which AWS region to run in and which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +- **Multi-region execution** + - You may switch to another AWS region at any timestep + - Switching regions forces a restart overhead (same as losing the work of the current timestep) + - Spot availability differs per region based on real traces + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Your strategy can also switch to a different region at each step. + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + + +The evaluation uses many real spot-availability traces. +--- + +API Specification +----------------- + +Implement a `Solution` class that extends `MultiRegionStrategy`: + +```python +import json +from argparse import Namespace + +from sky_spot.strategies.multi_strategy import MultiRegionStrategy +from sky_spot.utils import ClusterType + + +class Solution(MultiRegionStrategy): + """Your multi-region scheduling strategy.""" + + NAME = "my_strategy" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Initialize the solution from spec_path config. 
+
+        The spec file contains:
+        - deadline: deadline in hours
+        - duration: task duration in hours
+        - overhead: restart overhead in hours
+        - trace_files: list of trace file paths (one per region)
+        """
+        with open(spec_path) as f:
+            config = json.load(f)
+
+        args = Namespace(
+            deadline_hours=float(config["deadline"]),
+            task_duration_hours=[float(config["duration"])],
+            restart_overhead_hours=[float(config["overhead"])],
+            inter_task_overhead=[0.0],
+        )
+        super().__init__(args)
+        return self
+
+    def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType:
+        """
+        Decide next action based on current state.
+
+        Available attributes:
+        - self.env.get_current_region(): Get current region index
+        - self.env.get_num_regions(): Get total number of regions
+        - self.env.switch_region(idx): Switch to region by index
+        - self.env.elapsed_seconds: Current time elapsed
+        - self.task_duration: Total task duration needed (seconds)
+        - self.deadline: Deadline time (seconds)
+        - self.restart_overhead: Restart overhead (seconds)
+        - self.task_done_time: List of completed work segments
+        - self.remaining_restart_overhead: Current pending overhead
+
+        Returns: ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE
+        """
+        # Your decision logic here
+        if has_spot:
+            return ClusterType.SPOT
+        return ClusterType.ON_DEMAND
+```
+
+Parameters:
+---------------
+### ClusterType:
+ClusterType has 3 members:
+
+ClusterType.SPOT: Spot type cluster.
+
+ClusterType.ON_DEMAND: On Demand type cluster.
+
+ClusterType.NONE: None, no cluster.
+
+#### You are given some fixed parameters:
+
+env.gap_seconds: The size of each time step, in seconds.
+
+task_duration: The total amount of work time required to finish the task (in seconds).
+
+deadline: The task’s deadline (in seconds).
+
+restart_overhead: The time overhead incurred when a job restarts.
+
+You should implement the function to return the next cluster type to use as described above.
+
+#### At each time step, you are given:
+
+env.elapsed_seconds: Current time elapsed (in seconds).
+
+env.cluster_type: The current cluster type running your task.
+
+task_done_time: A list of completed work segments, where sum(self.task_done_time) = the amount of successful work time accumulated so far.
+
+has_spot: A boolean indicating whether the Spot cluster is available in the current time step. If False, the strategy must not return ClusterType.SPOT (doing so will raise an error).
+
+### You can use:
+
+env.get_current_region(): Get your current region index (0-8).
+
+env.switch_region(idx): Switch to region by index. Note: switching regions forces a restart overhead (see Problem Setting).
+
+#### You should return:
+
+ClusterType.SPOT: if you want to run the next time step on the Spot cluster.
+
+ClusterType.ON_DEMAND: if you want to run the next time step on the On-Demand cluster.
+
+ClusterType.NONE: if you choose not to run on any cluster during the next time step; this incurs no cost.
+
+Scoring
+-------
+```
+combined_score = -average_cost_across_all_scenarios
+```
+
+Negative cost: Lower cost = higher (less negative) score.
+
+Notice that if you fail to finish the task before the deadline, you will receive a penalty score of -100000.
+
+Evaluation Details
+------------------
+**Stage 1**: Quick check on 2-region scenario (must pass to proceed)
+**Stage 2**: Full evaluation on 6 scenarios:
+- 2 zones same region (8 traces)
+- 2 regions east-west (8 traces)
+- 3 regions diverse (6 traces)
+- 3 zones same region (6 traces)
+- 5 regions high diversity (4 traces)
+- All 9 regions (2 traces)
+
+
+- Task duration: 24 hours
+- Deadline: 48 hours (24-hour slack)
+- Restart overhead: 0.20 hours (12 minutes)
+- Price of on-demand is 3.06$/hr
+- Price of Spot is 0.9701$/hr
+
+- Notice your solution will be tested on real traces with high Spot availability.
+
+Your program has a total time limit of 300 seconds. You may be evaluated for up to 36 × 60 × 60 = 129600 time steps.
Please ensure that your code is efficient under python. + +Implementation Notes +--------------------- +**Required Elements (Missing these will cause evaluation failures):** +- `NAME` attribute must be defined on your Solution class +- `solve(self, spec_path)` method must initialize the strategy and return `self` +- `_step(self, last_cluster_type, has_spot)` method must return a ClusterType +- Ensure proper handling of ClusterType.NONE return values + + + +Concrete Step Example: +---------------------- +Here is a concrete example demonstrating our environment. +Assume we are: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 0 +``` +If we use env.switch_region(1), we will have: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.ON_DEMAND, there will be a restart overhead: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 23400 +task_done_time | [3600, 3600, 2880, 3600, 3600, 2880] +has_spot | True +env.cluster_type | ClusterType.ON_DEMAND +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.SPOT: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 21600 +task_done_time | [3600, 3600, 2880, 3600, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` \ No newline at end of file diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/task.yaml 
b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..2082c860 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late_multi (high_availability_loose_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late_multi (high_availability_loose_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late_multi + variant_name: "high_availability_loose_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. 
+ +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val 
is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": 
"http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..67686572 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed/statement.md @@ -0,0 +1,235 @@ +Cant-Be-Late Multi-Region Scheduling Problem +================================ + +Problem Setting +--------------- + +You are given a long-running compute job that must complete before a fixed hard deadline. 
+At each time step, you must choose which AWS region to run in and which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +- **Multi-region execution** + - You may switch to another AWS region at any timestep + - Switching regions forces a restart overhead (same as losing the work of the current timestep) + - Spot availability differs per region based on real traces + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Your strategy can also switch to a different region at each step. + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + + +The evaluation uses many real spot-availability traces. +--- + +API Specification +----------------- + +Implement a `Solution` class that extends `MultiRegionStrategy`: + +```python +import json +from argparse import Namespace + +from sky_spot.strategies.multi_strategy import MultiRegionStrategy +from sky_spot.utils import ClusterType + + +class Solution(MultiRegionStrategy): + """Your multi-region scheduling strategy.""" + + NAME = "my_strategy" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Initialize the solution from spec_path config. 
+
+        The spec file contains:
+        - deadline: deadline in hours
+        - duration: task duration in hours
+        - overhead: restart overhead in hours
+        - trace_files: list of trace file paths (one per region)
+        """
+        with open(spec_path) as f:
+            config = json.load(f)
+
+        args = Namespace(
+            deadline_hours=float(config["deadline"]),
+            task_duration_hours=[float(config["duration"])],
+            restart_overhead_hours=[float(config["overhead"])],
+            inter_task_overhead=[0.0],
+        )
+        super().__init__(args)
+        return self
+
+    def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType:
+        """
+        Decide next action based on current state.
+
+        Available attributes:
+        - self.env.get_current_region(): Get current region index
+        - self.env.get_num_regions(): Get total number of regions
+        - self.env.switch_region(idx): Switch to region by index
+        - self.env.elapsed_seconds: Current time elapsed
+        - self.task_duration: Total task duration needed (seconds)
+        - self.deadline: Deadline time (seconds)
+        - self.restart_overhead: Restart overhead (seconds)
+        - self.task_done_time: List of completed work segments
+        - self.remaining_restart_overhead: Current pending overhead
+
+        Returns: ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE
+        """
+        # Your decision logic here
+        if has_spot:
+            return ClusterType.SPOT
+        return ClusterType.ON_DEMAND
+```
+
+Parameters:
+---------------
+### ClusterType:
+ClusterType has 3 members:
+
+ClusterType.SPOT: Spot type cluster.
+
+ClusterType.ON_DEMAND: On Demand type cluster.
+
+ClusterType.NONE: None, no cluster.
+
+#### You are given some fixed parameters:
+
+env.gap_seconds: The size of each time step, in seconds.
+
+task_duration: The total amount of work time required to finish the task (in seconds).
+
+deadline: The task’s deadline (in seconds).
+
+restart_overhead: The time overhead incurred when a job restarts.
+
+You should implement the function to return the next cluster type to use as described above.
+
+#### At each time step, you are given:
+
+env.elapsed_seconds: Current time elapsed (in seconds).
+
+env.cluster_type: The current cluster type running your task.
+
+task_done_time: A list of completed work segments, where sum(self.task_done_time) = the amount of successful work time accumulated so far.
+
+has_spot: A boolean indicating whether the Spot cluster is available in the current time step. If False, the strategy must not return ClusterType.SPOT (doing so will raise an error).
+
+### You can use:
+
+env.get_current_region(): Get your current region index (0-8).
+
+env.switch_region(idx): Switch to region by index. Note: switching regions forces a restart overhead (see Problem Setting).
+
+#### You should return:
+
+ClusterType.SPOT: if you want to run the next time step on the Spot cluster.
+
+ClusterType.ON_DEMAND: if you want to run the next time step on the On-Demand cluster.
+
+ClusterType.NONE: if you choose not to run on any cluster during the next time step; this incurs no cost.
+
+Scoring
+-------
+```
+combined_score = -average_cost_across_all_scenarios
+```
+
+Negative cost: Lower cost = higher (less negative) score.
+
+Notice that if you fail to finish the task before the deadline, you will receive a penalty score of -100000.
+
+Evaluation Details
+------------------
+**Stage 1**: Quick check on 2-region scenario (must pass to proceed)
+**Stage 2**: Full evaluation on 6 scenarios:
+- 2 zones same region (8 traces)
+- 2 regions east-west (8 traces)
+- 3 regions diverse (6 traces)
+- 3 zones same region (6 traces)
+- 5 regions high diversity (4 traces)
+- All 9 regions (2 traces)
+
+
+- Task duration: 24 hours
+- Deadline: 48 hours (24-hour slack)
+- Restart overhead: 0.05 hours (3 minutes)
+- Price of on-demand is 3.06$/hr
+- Price of Spot is 0.9701$/hr
+
+- Notice your solution will be tested on real traces with high Spot availability.
+
+Your program has a total time limit of 300 seconds. You may be evaluated for up to 36 × 60 × 60 = 129600 time steps.
Please ensure that your code is efficient under python. + +Implementation Notes +--------------------- +**Required Elements (Missing these will cause evaluation failures):** +- `NAME` attribute must be defined on your Solution class +- `solve(self, spec_path)` method must initialize the strategy and return `self` +- `_step(self, last_cluster_type, has_spot)` method must return a ClusterType +- Ensure proper handling of ClusterType.NONE return values + + + +Concrete Step Example: +---------------------- +Here is a concrete example demonstrating our environment. +Assume we are: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 0 +``` +If we use env.switch_region(1), we will have: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.ON_DEMAND, there will be a restart overhead: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 23400 +task_done_time | [3600, 3600, 2880, 3600, 3600, 2880] +has_spot | True +env.cluster_type | ClusterType.ON_DEMAND +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.SPOT: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 21600 +task_done_time | [3600, 3600, 2880, 3600, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` \ No newline at end of file diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/task.yaml 
b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..9481993a --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late_multi (high_availability_loose_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late_multi (high_availability_loose_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late_multi + variant_name: "high_availability_loose_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_loose_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. 
+ +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val 
is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": 
"http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..4c2274ce --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed/statement.md @@ -0,0 +1,235 @@ +Cant-Be-Late Multi-Region Scheduling Problem +================================ + +Problem Setting +--------------- + +You are given a long-running compute job that must complete before a fixed hard deadline. 
+At each time step, you must choose which AWS region to run in and which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +- **Multi-region execution** + - You may switch to another AWS region at any timestep + - Switching regions forces a restart overhead (same as losing the work of the current timestep) + - Spot availability differs per region based on real traces + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Your strategy can also switch to a different region at each step. + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + + +The evaluation uses many real spot-availability traces. +--- + +API Specification +----------------- + +Implement a `Solution` class that extends `MultiRegionStrategy`: + +```python +import json +from argparse import Namespace + +from sky_spot.strategies.multi_strategy import MultiRegionStrategy +from sky_spot.utils import ClusterType + + +class Solution(MultiRegionStrategy): + """Your multi-region scheduling strategy.""" + + NAME = "my_strategy" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Initialize the solution from spec_path config. 
+
+        The spec file contains:
+        - deadline: deadline in hours
+        - duration: task duration in hours
+        - overhead: restart overhead in hours
+        - trace_files: list of trace file paths (one per region)
+        """
+        with open(spec_path) as f:
+            config = json.load(f)
+
+        args = Namespace(
+            deadline_hours=float(config["deadline"]),
+            task_duration_hours=[float(config["duration"])],
+            restart_overhead_hours=[float(config["overhead"])],
+            inter_task_overhead=[0.0],
+        )
+        super().__init__(args)
+        return self
+
+    def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType:
+        """
+        Decide next action based on current state.
+
+        Available attributes:
+        - self.env.get_current_region(): Get current region index
+        - self.env.get_num_regions(): Get total number of regions
+        - self.env.switch_region(idx): Switch to region by index
+        - self.env.elapsed_seconds: Current time elapsed
+        - self.task_duration: Total task duration needed (seconds)
+        - self.deadline: Deadline time (seconds)
+        - self.restart_overhead: Restart overhead (seconds)
+        - self.task_done_time: List of completed work segments
+        - self.remaining_restart_overhead: Current pending overhead
+
+        Returns: ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE
+        """
+        # Your decision logic here
+        if has_spot:
+            return ClusterType.SPOT
+        return ClusterType.ON_DEMAND
+```
+
+Parameters:
+---------------
+### ClusterType:
+ClusterType has 3 members:
+
+ClusterType.SPOT: Spot type cluster.
+
+ClusterType.ON_DEMAND: On Demand type cluster.
+
+ClusterType.NONE: None, no cluster.
+
+#### You are given some fixed parameters:
+
+env.gap_seconds: The size of each time step, in seconds.
+
+task_duration: The total amount of work time required to finish the task (in seconds).
+
+deadline: The task’s deadline (in seconds).
+
+restart_overhead: The time overhead incurred when a job restarts.
+
+You should implement the function to return the next cluster type to use as described above.
+ +#### At each time step, you are given: + +env.elapsed_seconds: Current time elapsed (in second). + +env.cluster_type: The current cluster type running your task. + +task_done_time: A list of completed work segments, where sum(self.task_done_time) = the amount of successful work time accumulated so far. + +has_spot: A boolean indicating whether the Spot cluster is available in the current time step. If False, the strategy must not return ClusterType.SPOT (doing so will raise an error). + +### You can use: + +env.get_current_region(): Get your current region index (0-8). + +env.switch_region(idx): Switch to region by index (no cost). + +#### You should return: + +ClusterType.SPOT: if you want to run the next time step on the Spot cluster. + +ClusterType.ON_DEMAND: if you want to run the next time step on the On-Demand cluster. + +ClusterType.NONE: if you choose not to run on any cluster during the next time step; this incurs no cost. + +Scoring +------- +``` +combined_score = -average_cost_across_all_scenarios +``` + +Negative cost: Lower cost = higher (less negative) score. + +Notice that if you fail to finish the task before the deadline, you will receive a penalty score of -100000. + +Evaluation Details +------------------ +**Stage 1**: Quick check on 2-region scenario (must pass to proceed) +**Stage 2**: Full evaluation on 6 scenarios: +- 2 zones same region (8 traces) +- 2 regions east-west (8 traces) +- 3 regions diverse (6 traces) +- 3 zones same region (6 traces) +- 5 regions high diversity (4 traces) +- All 9 regions (2 traces) + + +- Task duration: 24 hours +- Deadline: 36 hours (12-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand is 3.06$/hr +- Price of Spot is 0.9701$/hr + +- Notice your solution will be tested on real traces with high Spot availability. + +Your program has a total time limit of 300 seconds. You may be evaluated for up to 36 × 60 × 60 = 129600 time steps. 
Please ensure that your code is efficient under python. + +Implementation Notes +--------------------- +**Required Elements (Missing these will cause evaluation failures):** +- `NAME` attribute must be defined on your Solution class +- `solve(self, spec_path)` method must initialize the strategy and return `self` +- `_step(self, last_cluster_type, has_spot)` method must return a ClusterType +- Ensure proper handling of ClusterType.NONE return values + + + +Concrete Step Example: +---------------------- +Here is a concrete example demonstrating our environment. +Assume we are: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 0 +``` +If we use env.switch_region(1), we will have: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.ON_DEMAND, there will be a restart overhead: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 23400 +task_done_time | [3600, 3600, 2880, 3600, 3600, 2880] +has_spot | True +env.cluster_type | ClusterType.ON_DEMAND +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.SPOT: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 21600 +task_done_time | [3600, 3600, 2880, 3600, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` \ No newline at end of file diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/task.yaml 
b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..25d02f90 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late_multi (high_availability_tight_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late_multi (high_availability_tight_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late_multi + variant_name: "high_availability_tight_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. 
+ +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val 
is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": 
"http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..890f5f09 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed/statement.md @@ -0,0 +1,235 @@ +Cant-Be-Late Multi-Region Scheduling Problem +================================ + +Problem Setting +--------------- + +You are given a long-running compute job that must complete before a fixed hard deadline. 
+At each time step, you must choose which AWS region to run in and which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +- **Multi-region execution** + - You may switch to another AWS region at any timestep + - Switching regions forces a restart overhead (same as losing the work of the current timestep) + - Spot availability differs per region based on real traces + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Your strategy can also switch to a different region at each step. + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + + +The evaluation uses many real spot-availability traces. +--- + +API Specification +----------------- + +Implement a `Solution` class that extends `MultiRegionStrategy`: + +```python +import json +from argparse import Namespace + +from sky_spot.strategies.multi_strategy import MultiRegionStrategy +from sky_spot.utils import ClusterType + + +class Solution(MultiRegionStrategy): + """Your multi-region scheduling strategy.""" + + NAME = "my_strategy" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Initialize the solution from spec_path config. 
+
+        The spec file contains:
+        - deadline: deadline in hours
+        - duration: task duration in hours
+        - overhead: restart overhead in hours
+        - trace_files: list of trace file paths (one per region)
+        """
+        with open(spec_path) as f:
+            config = json.load(f)
+
+        args = Namespace(
+            deadline_hours=float(config["deadline"]),
+            task_duration_hours=[float(config["duration"])],
+            restart_overhead_hours=[float(config["overhead"])],
+            inter_task_overhead=[0.0],
+        )
+        super().__init__(args)
+        return self
+
+    def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType:
+        """
+        Decide next action based on current state.
+
+        Available attributes:
+        - self.env.get_current_region(): Get current region index
+        - self.env.get_num_regions(): Get total number of regions
+        - self.env.switch_region(idx): Switch to region by index
+        - self.env.elapsed_seconds: Current time elapsed
+        - self.task_duration: Total task duration needed (seconds)
+        - self.deadline: Deadline time (seconds)
+        - self.restart_overhead: Restart overhead (seconds)
+        - self.task_done_time: List of completed work segments
+        - self.remaining_restart_overhead: Current pending overhead
+
+        Returns: ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE
+        """
+        # Your decision logic here
+        if has_spot:
+            return ClusterType.SPOT
+        return ClusterType.ON_DEMAND
+```
+
+Parameters:
+---------------
+### ClusterType:
+ClusterType has 3 members:
+
+ClusterType.SPOT: Spot type cluster.
+
+ClusterType.ON_DEMAND: On Demand type cluster.
+
+ClusterType.NONE: None, no cluster.
+
+#### You are given some fixed parameters:
+
+env.gap_seconds: The size of each time step, in seconds.
+
+task_duration: The total amount of work time required to finish the task (in seconds).
+
+deadline: The task’s deadline (in seconds).
+
+restart_overhead: The time overhead incurred when a job restarts.
+
+You should implement the function to return the next cluster type to use as described above.
+ +#### At each time step, you are given: + +env.elapsed_seconds: Current time elapsed (in second). + +env.cluster_type: The current cluster type running your task. + +task_done_time: A list of completed work segments, where sum(self.task_done_time) = the amount of successful work time accumulated so far. + +has_spot: A boolean indicating whether the Spot cluster is available in the current time step. If False, the strategy must not return ClusterType.SPOT (doing so will raise an error). + +### You can use: + +env.get_current_region(): Get your current region index (0-8). + +env.switch_region(idx): Switch to region by index (no cost). + +#### You should return: + +ClusterType.SPOT: if you want to run the next time step on the Spot cluster. + +ClusterType.ON_DEMAND: if you want to run the next time step on the On-Demand cluster. + +ClusterType.NONE: if you choose not to run on any cluster during the next time step; this incurs no cost. + +Scoring +------- +``` +combined_score = -average_cost_across_all_scenarios +``` + +Negative cost: Lower cost = higher (less negative) score. + +Notice that if you fail to finish the task before the deadline, you will receive a penalty score of -100000. + +Evaluation Details +------------------ +**Stage 1**: Quick check on 2-region scenario (must pass to proceed) +**Stage 2**: Full evaluation on 6 scenarios: +- 2 zones same region (8 traces) +- 2 regions east-west (8 traces) +- 3 regions diverse (6 traces) +- 3 zones same region (6 traces) +- 5 regions high diversity (4 traces) +- All 9 regions (2 traces) + + +- Task duration: 24 hours +- Deadline: 36 hours (12-hour slack) +- Restart overhead: 0.05 hours (3 minutes) +- Price of on-demand is 3.06$/hr +- Price of Spot is 0.9701$/hr + +- Notice your solution will be tested on real traces with high Spot availability. + +Your program has a total time limit of 300 seconds. You may be evaluated for up to 36 × 60 × 60 = 129600 time steps. 
Please ensure that your code is efficient under python. + +Implementation Notes +--------------------- +**Required Elements (Missing these will cause evaluation failures):** +- `NAME` attribute must be defined on your Solution class +- `solve(self, spec_path)` method must initialize the strategy and return `self` +- `_step(self, last_cluster_type, has_spot)` method must return a ClusterType +- Ensure proper handling of ClusterType.NONE return values + + + +Concrete Step Example: +---------------------- +Here is a concrete example demonstrating our environment. +Assume we are: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 0 +``` +If we use env.switch_region(1), we will have: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.ON_DEMAND, there will be a restart overhead: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 23400 +task_done_time | [3600, 3600, 2880, 3600, 3600, 2880] +has_spot | True +env.cluster_type | ClusterType.ON_DEMAND +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.SPOT: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 21600 +task_done_time | [3600, 3600, 2880, 3600, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` \ No newline at end of file diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/task.yaml 
b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..0e96094a --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late_multi (high_availability_tight_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late_multi (high_availability_tight_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late_multi + variant_name: "high_availability_tight_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late_multi__high_availability_tight_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. 
+ +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val 
is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": 
"http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..0e56c1e4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed/statement.md @@ -0,0 +1,233 @@ +Cant-Be-Late Multi-Region Scheduling Problem +================================ + +Problem Setting +--------------- + +You are given a long-running compute job that must complete before a fixed hard deadline. 
+At each time step, you must choose which AWS region to run in and which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +- **Multi-region execution** + - You may switch to another AWS region at any timestep + - Switching regions forces a restart overhead (same as losing the work of the current timestep) + - Spot availability differs per region based on real traces + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Your strategy can also switch to a different region at each step. + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + + +The evaluation uses many real spot-availability traces. +--- + +API Specification +----------------- + +Implement a `Solution` class that extends `MultiRegionStrategy`: + +```python +import json +from argparse import Namespace + +from sky_spot.strategies.multi_strategy import MultiRegionStrategy +from sky_spot.utils import ClusterType + + +class Solution(MultiRegionStrategy): + """Your multi-region scheduling strategy.""" + + NAME = "my_strategy" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Initialize the solution from spec_path config. 
+ + The spec file contains: + - deadline: deadline in hours + - duration: task duration in hours + - overhead: restart overhead in hours + - trace_files: list of trace file paths (one per region) + """ + with open(spec_path) as f: + config = json.load(f) + + args = Namespace( + deadline_hours=float(config["deadline"]), + task_duration_hours=[float(config["duration"])], + restart_overhead_hours=[float(config["overhead"])], + inter_task_overhead=[0.0], + ) + super().__init__(args) + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Decide next action based on current state. + + Available attributes: + - self.env.get_current_region(): Get current region index + - self.env.get_num_regions(): Get total number of regions + - self.env.switch_region(idx): Switch to region by index + - self.env.elapsed_seconds: Current time elapsed + - self.task_duration: Total task duration needed (seconds) + - self.deadline: Deadline time (seconds) + - self.restart_overhead: Restart overhead (seconds) + - self.task_done_time: List of completed work segments + - self.remaining_restart_overhead: Current pending overhead + + Returns: ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND +``` + +Parameters: +--------------- +### ClusterType: +ClusterType has 3 members: + +ClusterType.SPOT: Spot type cluster. + +ClusterType.ON_DEMAND: On Demand type cluster. + +ClusterType.NONE: None, no cluster. + +#### You are given some fixed parameters: + +env.gap_seconds: The size of each time step, in seconds. + +task_duration: The total amount of work time required to finish the task (in seconds). + +deadline: The task’s deadline (in seconds). + +restart_overhead: The time overhead incurred when a job restarts. + +You should implement the function to return the next cluster type to use as described above. 
+ +#### At each time step, you are given: + +env.elapsed_seconds: Current time elapsed (in seconds). + +env.cluster_type: The current cluster type running your task. + +task_done_time: A list of completed work segments, where sum(self.task_done_time) = the amount of successful work time accumulated so far. + +has_spot: A boolean indicating whether the Spot cluster is available in the current time step. If False, the strategy must not return ClusterType.SPOT (doing so will raise an error). + +### You can use: + +env.get_current_region(): Get your current region index (0-8). + +env.switch_region(idx): Switch to region by index (no cost). + +#### You should return: + +ClusterType.SPOT: if you want to run the next time step on the Spot cluster. + +ClusterType.ON_DEMAND: if you want to run the next time step on the On-Demand cluster. + +ClusterType.NONE: if you choose not to run on any cluster during the next time step; this incurs no cost. + +Scoring +------- +``` +combined_score = -average_cost_across_all_scenarios +``` + +Negative cost: Lower cost = higher (less negative) score. + +Notice that if you fail to finish the task before the deadline, you will receive a penalty score of -100000. + +Evaluation Details +------------------ +**Stage 1**: Quick check on 2-region scenario (must pass to proceed) +**Stage 2**: Full evaluation on 4 scenarios: +- 2 zones west (8 traces) +- 3 zones west (6 traces) +- 2 regions west-east2 (8 traces) +- 5 regions mixed (4 traces) + + +- Task duration: 24 hours +- Deadline: 48 hours (24-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand is 3.06$/hr +- Price of Spot is 0.9701$/hr + +- Notice your solution will be tested on real traces with low Spot availability. + +Your program has a total time limit of 300 seconds. You may be evaluated for up to 48 × 60 × 60 = 172800 time steps. Please ensure that your code is efficient under python. 
+ +Implementation Notes +--------------------- +**Required Elements (Missing these will cause evaluation failures):** +- `NAME` attribute must be defined on your Solution class +- `solve(self, spec_path)` method must initialize the strategy and return `self` +- `_step(self, last_cluster_type, has_spot)` method must return a ClusterType +- Ensure proper handling of ClusterType.NONE return values + + + +Concrete Step Example: +---------------------- +Here is a concrete example demonstrating our environment. +Assume we are: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 0 +``` +If we use env.switch_region(1), we will have: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.ON_DEMAND, there will be a restart overhead: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 23400 +task_done_time | [3600, 3600, 2880, 3600, 3600, 2880] +has_spot | True +env.cluster_type | ClusterType.ON_DEMAND +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.SPOT: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 21600 +task_done_time | [3600, 3600, 2880, 3600, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` \ No newline at end of file diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/task.yaml 
b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..5df832c5 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late_multi (low_availability_loose_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late_multi (low_availability_loose_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late_multi + variant_name: "low_availability_loose_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. 
+ +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val 
is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": 
"http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..3cdf06f6 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed/statement.md @@ -0,0 +1,233 @@ +Cant-Be-Late Multi-Region Scheduling Problem +================================ + +Problem Setting +--------------- + +You are given a long-running compute job that must complete before a fixed hard deadline. 
+At each time step, you must choose which AWS region to run in and which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +- **Multi-region execution** + - You may switch to another AWS region at any timestep + - Switching regions forces a restart overhead (same as losing the work of the current timestep) + - Spot availability differs per region based on real traces + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Your strategy can also switch to a different region at each step. + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + + +The evaluation uses many real spot-availability traces. +--- + +API Specification +----------------- + +Implement a `Solution` class that extends `MultiRegionStrategy`: + +```python +import json +from argparse import Namespace + +from sky_spot.strategies.multi_strategy import MultiRegionStrategy +from sky_spot.utils import ClusterType + + +class Solution(MultiRegionStrategy): + """Your multi-region scheduling strategy.""" + + NAME = "my_strategy" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Initialize the solution from spec_path config. 
+ + The spec file contains: + - deadline: deadline in hours + - duration: task duration in hours + - overhead: restart overhead in hours + - trace_files: list of trace file paths (one per region) + """ + with open(spec_path) as f: + config = json.load(f) + + args = Namespace( + deadline_hours=float(config["deadline"]), + task_duration_hours=[float(config["duration"])], + restart_overhead_hours=[float(config["overhead"])], + inter_task_overhead=[0.0], + ) + super().__init__(args) + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Decide next action based on current state. + + Available attributes: + - self.env.get_current_region(): Get current region index + - self.env.get_num_regions(): Get total number of regions + - self.env.switch_region(idx): Switch to region by index + - self.env.elapsed_seconds: Current time elapsed + - self.task_duration: Total task duration needed (seconds) + - self.deadline: Deadline time (seconds) + - self.restart_overhead: Restart overhead (seconds) + - self.task_done_time: List of completed work segments + - self.remaining_restart_overhead: Current pending overhead + + Returns: ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND +``` + +Parameters: +--------------- +### ClusterType: +ClusterType has 3 members: + +ClusterType.SPOT: Spot type cluster. + +ClusterType.ON_DEMAND: On Demand type cluster. + +ClusterType.NONE: None, no cluster. + +#### You are given some fixed parameters: + +env.gap_seconds: The size of each time step, in seconds. + +task_duration: The total amount of work time required to finish the task (in seconds). + +deadline: The task’s deadline (in seconds). + +restart_overhead: The time overhead incurred when a job restarts. + +You should implement the function to return the next cluster type to use as described above. 
+ +#### At each time step, you are given: + +env.elapsed_seconds: Current time elapsed (in seconds). + +env.cluster_type: The current cluster type running your task. + +task_done_time: A list of completed work segments, where sum(self.task_done_time) = the amount of successful work time accumulated so far. + +has_spot: A boolean indicating whether the Spot cluster is available in the current time step. If False, the strategy must not return ClusterType.SPOT (doing so will raise an error). + +### You can use: + +env.get_current_region(): Get your current region index (0-8). + +env.switch_region(idx): Switch to region by index (no cost). + +#### You should return: + +ClusterType.SPOT: if you want to run the next time step on the Spot cluster. + +ClusterType.ON_DEMAND: if you want to run the next time step on the On-Demand cluster. + +ClusterType.NONE: if you choose not to run on any cluster during the next time step; this incurs no cost. + +Scoring +------- +``` +combined_score = -average_cost_across_all_scenarios +``` + +Negative cost: Lower cost = higher (less negative) score. + +Notice that if you fail to finish the task before the deadline, you will receive a penalty score of -100000. + +Evaluation Details +------------------ +**Stage 1**: Quick check on 2-region scenario (must pass to proceed) +**Stage 2**: Full evaluation on 4 scenarios: +- 2 zones west (8 traces) +- 3 zones west (6 traces) +- 2 regions west-east2 (8 traces) +- 5 regions mixed (4 traces) + + +- Task duration: 24 hours +- Deadline: 48 hours (24-hour slack) +- Restart overhead: 0.05 hours (3 minutes) +- Price of on-demand is 3.06$/hr +- Price of Spot is 0.9701$/hr + +- Notice your solution will be tested on real traces with low Spot availability. + +Your program has a total time limit of 300 seconds. You may be evaluated for up to 48 × 60 × 60 = 172800 time steps. Please ensure that your code is efficient under python. 
+ +Implementation Notes +--------------------- +**Required Elements (Missing these will cause evaluation failures):** +- `NAME` attribute must be defined on your Solution class +- `solve(self, spec_path)` method must initialize the strategy and return `self` +- `_step(self, last_cluster_type, has_spot)` method must return a ClusterType +- Ensure proper handling of ClusterType.NONE return values + + + +Concrete Step Example: +---------------------- +Here is a concrete example demonstrating our environment. +Assume we are: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 0 +``` +If we use env.switch_region(1), we will have: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.ON_DEMAND, there will be a restart overhead: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 23400 +task_done_time | [3600, 3600, 2880, 3600, 3600, 2880] +has_spot | True +env.cluster_type | ClusterType.ON_DEMAND +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.SPOT: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 21600 +task_done_time | [3600, 3600, 2880, 3600, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` \ No newline at end of file diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/task.yaml 
b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..6938b778 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late_multi (low_availability_loose_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late_multi (low_availability_loose_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late_multi + variant_name: "low_availability_loose_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_loose_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. 
+ +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val 
is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": 
"http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/statement.md new file mode 100644 index 00000000..56331dbe --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed/statement.md @@ -0,0 +1,233 @@ +Cant-Be-Late Multi-Region Scheduling Problem +================================ + +Problem Setting +--------------- + +You are given a long-running compute job that must complete before a fixed hard deadline. 
+At each time step, you must choose which AWS region to run in and which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +- **Multi-region execution** + - You may switch to another AWS region at any timestep + - Switching regions forces a restart overhead (same as losing the work of the current timestep) + - Spot availability differs per region based on real traces + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Your strategy can also switch to a different region at each step. + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + + +The evaluation uses many real spot-availability traces. +--- + +API Specification +----------------- + +Implement a `Solution` class that extends `MultiRegionStrategy`: + +```python +import json +from argparse import Namespace + +from sky_spot.strategies.multi_strategy import MultiRegionStrategy +from sky_spot.utils import ClusterType + + +class Solution(MultiRegionStrategy): + """Your multi-region scheduling strategy.""" + + NAME = "my_strategy" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Initialize the solution from spec_path config. 
+ + The spec file contains: + - deadline: deadline in hours + - duration: task duration in hours + - overhead: restart overhead in hours + - trace_files: list of trace file paths (one per region) + """ + with open(spec_path) as f: + config = json.load(f) + + args = Namespace( + deadline_hours=float(config["deadline"]), + task_duration_hours=[float(config["duration"])], + restart_overhead_hours=[float(config["overhead"])], + inter_task_overhead=[0.0], + ) + super().__init__(args) + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Decide next action based on current state. + + Available attributes: + - self.env.get_current_region(): Get current region index + - self.env.get_num_regions(): Get total number of regions + - self.env.switch_region(idx): Switch to region by index + - self.env.elapsed_seconds: Current time elapsed + - self.task_duration: Total task duration needed (seconds) + - self.deadline: Deadline time (seconds) + - self.restart_overhead: Restart overhead (seconds) + - self.task_done_time: List of completed work segments + - self.remaining_restart_overhead: Current pending overhead + + Returns: ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND +``` + +Parameters: +--------------- +### ClusterType: +ClusterType has 3 members: + +ClusterType.SPOT: Spot type cluster. + +ClusterType.ON_DEMAND: On Demand type cluster. + +ClusterType.None: None, no cluster. + +#### You are given some fixed parameters: + +env.gap_seconds: The size of each time step, in seconds. + +task_duration: The total amount of work time required to finish the task (in seconds). + +deadline: The task’s deadline (in seconds). + +restart_overhead: The time overhead incurred when a job restarts. + +You should implement the function to return the next cluster type to use as described above. 
+ +#### At each time step, you are given: + +env.elapsed_seconds: Current time elapsed (in second). + +env.cluster_type: The current cluster type running your task. + +task_done_time: A list of completed work segments, where sum(self.task_done_time) = the amount of successful work time accumulated so far. + +has_spot: A boolean indicating whether the Spot cluster is available in the current time step. If False, the strategy must not return ClusterType.SPOT (doing so will raise an error). + +### You can use: + +env.get_current_region(): Get your current region index (0-8). + +env.switch_region(idx): Switch to region by index (no cost). + +#### You should return: + +ClusterType.SPOT: if you want to run the next time step on the Spot cluster. + +ClusterType.ON_DEMAND: if you want to run the next time step on the On-Demand cluster. + +ClusterType.NONE: if you choose not to run on any cluster during the next time step; this incurs no cost. + +Scoring +------- +``` +combined_score = -average_cost_across_all_scenarios +``` + +Negative cost: Lower cost = higher (less negative) score. + +Notice that if you fail to finish the task before the deadline, you will receive a penalty score of -100000. + +Evaluation Details +------------------ +**Stage 1**: Quick check on 2-region scenario (must pass to proceed) +**Stage 2**: Full evaluation on 4 scenarios: +- 2 zones west (8 traces) +- 3 zones west (6 traces) +- 2 regions west-east2 (8 traces) +- 5 regions mixed (4 traces) + + +- Task duration: 24 hours +- Deadline: 36 hours (12-hour slack) +- Restart overhead: 0.20 hours (12 minutes) +- Price of on-demand is 3.06$/hr +- Price of Spot is 0.9701$/hr + +- Notice your solution will be tested on real traces with low Spot availability. + +Your program has a total time limit of 300 seconds. You may be evaluated for up to 36 × 60 × 60 = 129600 time steps. Please ensure that your code is efficient under python. 
+ +Implementation Notes +--------------------- +**Required Elements (Missing these will cause evaluation failures):** +- `NAME` attribute must be defined on your Solution class +- `solve(self, spec_path)` method must initialize the strategy and return `self` +- `_step(self, last_cluster_type, has_spot)` method must return a ClusterType +- Ensure proper handling of ClusterType.NONE return values + + + +Concrete Step Example: +---------------------- +Here is a concrete example demonstrating our environment. +Assume we are: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 0 +``` +If we use env.switch_region(1), we will have: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.ON_DEMAND, there will be a restart overhead: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 23400 +task_done_time | [3600, 3600, 2880, 3600, 3600, 2880] +has_spot | True +env.cluster_type | ClusterType.ON_DEMAND +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.SPOT: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 21600 +task_done_time | [3600, 3600, 2880, 3600, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` \ No newline at end of file diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/task.yaml 
b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/task.yaml new file mode 100644 index 00000000..0c4ea797 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late_multi (low_availability_tight_deadline_large_overhead) (TTT)" + description: | + Solve the 'cant_be_late_multi (low_availability_tight_deadline_large_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late_multi + variant_name: "low_availability_tight_deadline_large_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_large_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/eval/grader.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. 
+ +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val 
is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": 
"http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/solution.py b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/statement.md b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/statement.md new file mode 100644 index 00000000..2b4ae077 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed/statement.md @@ -0,0 +1,233 @@ +Cant-Be-Late Multi-Region Scheduling Problem +================================ + +Problem Setting +--------------- + +You are given a long-running compute job that must complete before a fixed hard deadline. 
+At each time step, you must choose which AWS region to run in and which type of cloud compute resource to use: + +- **Spot instances** + - Very cheap + - May become unavailable at certain timesteps + - Can be preempted at any time, the job will incur a **restart overhead** + +- **On-demand instances** + - Guaranteed available + - Expensive + - Never interrupted + +- **Multi-region execution** + - You may switch to another AWS region at any timestep + - Switching regions forces a restart overhead (same as losing the work of the current timestep) + - Spot availability differs per region based on real traces + +Your strategy must decide at every timestep whether to use Spot, use On-Demand, or pause (NONE). + +Your strategy can also switch to a different region at each step. + +Restart overheads do not stack: launching a new instance while an old overhead is still pending will replace the previous remaining restart overhead with the new one. + +Your goal is to **finish before the deadline** while **minimizing cost**. + + +The evaluation uses many real spot-availability traces. +--- + +API Specification +----------------- + +Implement a `Solution` class that extends `MultiRegionStrategy`: + +```python +import json +from argparse import Namespace + +from sky_spot.strategies.multi_strategy import MultiRegionStrategy +from sky_spot.utils import ClusterType + + +class Solution(MultiRegionStrategy): + """Your multi-region scheduling strategy.""" + + NAME = "my_strategy" # REQUIRED: unique identifier + + def solve(self, spec_path: str) -> "Solution": + """ + Initialize the solution from spec_path config. 
+ + The spec file contains: + - deadline: deadline in hours + - duration: task duration in hours + - overhead: restart overhead in hours + - trace_files: list of trace file paths (one per region) + """ + with open(spec_path) as f: + config = json.load(f) + + args = Namespace( + deadline_hours=float(config["deadline"]), + task_duration_hours=[float(config["duration"])], + restart_overhead_hours=[float(config["overhead"])], + inter_task_overhead=[0.0], + ) + super().__init__(args) + return self + + def _step(self, last_cluster_type: ClusterType, has_spot: bool) -> ClusterType: + """ + Decide next action based on current state. + + Available attributes: + - self.env.get_current_region(): Get current region index + - self.env.get_num_regions(): Get total number of regions + - self.env.switch_region(idx): Switch to region by index + - self.env.elapsed_seconds: Current time elapsed + - self.task_duration: Total task duration needed (seconds) + - self.deadline: Deadline time (seconds) + - self.restart_overhead: Restart overhead (seconds) + - self.task_done_time: List of completed work segments + - self.remaining_restart_overhead: Current pending overhead + + Returns: ClusterType.SPOT, ClusterType.ON_DEMAND, or ClusterType.NONE + """ + # Your decision logic here + if has_spot: + return ClusterType.SPOT + return ClusterType.ON_DEMAND +``` + +Parameters: +--------------- +### ClusterType: +ClusterType has 3 members: + +ClusterType.SPOT: Spot type cluster. + +ClusterType.ON_DEMAND: On Demand type cluster. + +ClusterType.None: None, no cluster. + +#### You are given some fixed parameters: + +env.gap_seconds: The size of each time step, in seconds. + +task_duration: The total amount of work time required to finish the task (in seconds). + +deadline: The task’s deadline (in seconds). + +restart_overhead: The time overhead incurred when a job restarts. + +You should implement the function to return the next cluster type to use as described above. 
+ +#### At each time step, you are given: + +env.elapsed_seconds: Current time elapsed (in second). + +env.cluster_type: The current cluster type running your task. + +task_done_time: A list of completed work segments, where sum(self.task_done_time) = the amount of successful work time accumulated so far. + +has_spot: A boolean indicating whether the Spot cluster is available in the current time step. If False, the strategy must not return ClusterType.SPOT (doing so will raise an error). + +### You can use: + +env.get_current_region(): Get your current region index (0-8). + +env.switch_region(idx): Switch to region by index (no cost). + +#### You should return: + +ClusterType.SPOT: if you want to run the next time step on the Spot cluster. + +ClusterType.ON_DEMAND: if you want to run the next time step on the On-Demand cluster. + +ClusterType.NONE: if you choose not to run on any cluster during the next time step; this incurs no cost. + +Scoring +------- +``` +combined_score = -average_cost_across_all_scenarios +``` + +Negative cost: Lower cost = higher (less negative) score. + +Notice that if you fail to finish the task before the deadline, you will receive a penalty score of -100000. + +Evaluation Details +------------------ +**Stage 1**: Quick check on 2-region scenario (must pass to proceed) +**Stage 2**: Full evaluation on 4 scenarios: +- 2 zones west (8 traces) +- 3 zones west (6 traces) +- 2 regions west-east2 (8 traces) +- 5 regions mixed (4 traces) + + +- Task duration: 24 hours +- Deadline: 36 hours (12-hour slack) +- Restart overhead: 0.05 hours (3 minutes) +- Price of on-demand is 3.06$/hr +- Price of Spot is 0.9701$/hr + +- Notice your solution will be tested on real traces with low Spot availability. + +Your program has a total time limit of 300 seconds. You may be evaluated for up to 36 × 60 × 60 = 129600 time steps. Please ensure that your code is efficient under python. 
+ +Implementation Notes +--------------------- +**Required Elements (Missing these will cause evaluation failures):** +- `NAME` attribute must be defined on your Solution class +- `solve(self, spec_path)` method must initialize the strategy and return `self` +- `_step(self, last_cluster_type, has_spot)` method must return a ClusterType +- Ensure proper handling of ClusterType.NONE return values + + + +Concrete Step Example: +---------------------- +Here is a concrete example demonstrating our environment. +Assume we are: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 0 +``` +If we use env.switch_region(1), we will have: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 18000 +task_done_time | [3600, 3600, 2880, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.ON_DEMAND, there will be a restart overhead: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 23400 +task_done_time | [3600, 3600, 2880, 3600, 3600, 2880] +has_spot | True +env.cluster_type | ClusterType.ON_DEMAND +env.get_current_region() | 1 +``` +If our strategy returns ClusterType.SPOT: +``` +Parameter | Value +-------------------------|------------------------ +env.gap_seconds | 3600.0 +env.elapsed_seconds | 21600 +task_done_time | [3600, 3600, 2880, 3600, 3600, 3600] +has_spot | True +env.cluster_type | ClusterType.SPOT +env.get_current_region() | 1 +``` \ No newline at end of file diff --git a/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/task.yaml 
b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/task.yaml new file mode 100644 index 00000000..4d74e29e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cant_be_late_multi (low_availability_tight_deadline_small_overhead) (TTT)" + description: | + Solve the 'cant_be_late_multi (low_availability_tight_deadline_small_overhead)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cant_be_late_multi + variant_name: "low_availability_tight_deadline_small_overhead" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cant_be_late_multi__low_availability_tight_deadline_small_overhead/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/cloudcast/eval/grader.py b/ttt/examples/frontier_cs_tasks/cloudcast/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cloudcast/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/cloudcast/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/cloudcast/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cloudcast/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/cloudcast/seed/opencode.json b/ttt/examples/frontier_cs_tasks/cloudcast/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cloudcast/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/cloudcast/seed/solution.py 
b/ttt/examples/frontier_cs_tasks/cloudcast/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cloudcast/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/cloudcast/seed/statement.md b/ttt/examples/frontier_cs_tasks/cloudcast/seed/statement.md new file mode 100644 index 00000000..d8086233 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cloudcast/seed/statement.md @@ -0,0 +1,204 @@ +Cloudcast Broadcast Optimization Problem +======================================== + +Problem Setting +--------------- +Design broadcast topology optimization for multi-cloud data distribution. Given a source node and multiple destination nodes across AWS, Azure, and GCP, find the optimal broadcast paths that minimize transfer cost while respecting bandwidth constraints. + +The data is split into multiple partitions that can be transmitted independently. Different partitions can take different routes to the same destination, allowing for parallel transmission and load balancing across the network. 
+ +**Optimization Goal**: Minimize total cost ($) + +$$ +\text{Total Cost} = C_{\text{egress}} + C_{\text{instance}} +$$ + +Where: + +$$ +C_{\text{egress}} = \sum_{e \in E} \left( |P_e| \times s_{\text{partition}} \times c_e \right) +$$ + +$$ +C_{\text{instance}} = |V| \times n_{\text{vm}} \times \frac{r_{\text{instance}}}{3600} \times t_{\text{transfer}} +$$ + +**Notation**: +- $E$: Set of all edges used in the broadcast topology (union of all partition paths) +- $P_e$: Set of partitions using edge $e$ (automatically computed by evaluator) +- $s_{\text{partition}}$: Size of each partition (GB) = $\frac{\text{data\_vol}}{\text{num\_partitions}}$ +- $c_e$: Cost per GB for edge $e$ (\$/GB) +- $V$: Set of all nodes appearing in any partition path (automatically computed by evaluator) +- $n_{\text{vm}}$: Number of VMs per region (default: 2) +- $r_{\text{instance}}$: Instance hourly rate (\$/hour) = \$0.54 +- $t_{\text{transfer}}$: Total transfer time (seconds) = $\max_{d \in D} \max_{p \in [0, n_p)} \max_{e \in \text{path}(d,p)} \frac{|P_e| \times s_{\text{partition}} \times 8}{f_e}$ + - $D$: Set of destination nodes + - $n_p$: Number of partitions + - $f_e$: Actual throughput (flow) on edge $e$ after bandwidth constraint enforcement (Gbps) + +API Specification +----------------- +Implement a `Solution` class that returns a search algorithm: + +```python +class Solution: + def solve(self, spec_path: str = None) -> dict: + """ + Returns a dict with either: + - {"code": "python_code_string"} + - {"program_path": "path/to/algorithm.py"} + """ + # Your implementation + pass +``` + +Your algorithm code must implement: + +```python +import networkx as nx + +def search_algorithm(src: str, dsts: list[str], G: nx.DiGraph, num_partitions: int) -> BroadCastTopology: + """ + Design routing paths for broadcasting data partitions to multiple destinations. 
+ + Args: + src: Source node (e.g., "aws:ap-northeast-1") + dsts: List of destination nodes (e.g., ["aws:us-east-1", "gcp:us-central1"]) + G: NetworkX DiGraph with edge attributes: + - "cost": float ($/GB) - egress cost for transferring data + - "throughput": float (Gbps) - maximum bandwidth capacity + num_partitions: Number of data partitions to broadcast + + Returns: + BroadCastTopology object with routing paths for all (destination, partition) pairs + """ + pass + + +class BroadCastTopology: + def __init__(self, src: str, dsts: list[str], num_partitions: int): + self.src = src + self.dsts = dsts + self.num_partitions = int(num_partitions) + # Structure: {dst: {partition_id: [edges]}} + # Each edge is [src_node, dst_node, edge_data_dict] + self.paths = {dst: {str(i): None for i in range(self.num_partitions)} for dst in dsts} + + def append_dst_partition_path(self, dst: str, partition: int, path: list): + """ + Append an edge to the path for a specific destination-partition pair. + + Args: + dst: Destination node + partition: Partition ID (0 to num_partitions-1) + path: Edge represented as [src_node, dst_node, edge_data_dict] + where edge_data_dict = G[src_node][dst_node] + """ + partition = str(partition) + if self.paths[dst][partition] is None: + self.paths[dst][partition] = [] + self.paths[dst][partition].append(path) + + def set_dst_partition_paths(self, dst: str, partition: int, paths: list[list]): + """ + Set the complete path (list of edges) for a destination-partition pair. 
+ + Args: + dst: Destination node + partition: Partition ID + paths: List of edges, each edge is [src_node, dst_node, edge_data_dict] + """ + partition = str(partition) + self.paths[dst][partition] = paths + + def set_num_partitions(self, num_partitions: int): + """Update number of partitions""" + self.num_partitions = num_partitions +``` + +Bandwidth Constraints +--------------------- +Each cloud provider has ingress/egress limits (Gbps) per region: +- AWS: 10 Gbps ingress, 5 Gbps egress +- GCP: 16 Gbps ingress, 7 Gbps egress +- Azure: 16 Gbps ingress, 16 Gbps egress + +These limits are multiplied by the number of VMs per region. + +When multiple edges share a node and exceed its limits: +- Flow is **equally distributed** among incoming/outgoing edges (each edge gets $\frac{\text{limit}}{n_{\text{edges}}}$) +- Transfer time increases as actual throughput decreases +- Example: If a node has 3 outgoing edges and 5 Gbps egress limit, each edge gets min(original_flow, 5/3 Gbps) + +**Strategy tip**: Different partitions can use different paths to the same destination, potentially avoiding bottlenecks by distributing load across the network. 
+ +Scoring (0-100) +--------------- +```python +score = 1.0 / (1.0 + total_cost) * 100 +``` + +Lower total cost → higher score + +Example: Basic Implementation +------------------------------ +```python +def search_algorithm(src, dsts, G, num_partitions): + bc_topology = BroadCastTopology(src, dsts, num_partitions) + + for dst in dsts: + path = nx.dijkstra_path(G, src, dst, weight="cost") + for i in range(len(path) - 1): + for partition_id in range(num_partitions): + bc_topology.append_dst_partition_path(dst, partition_id, + [path[i], path[i + 1], G[path[i]][path[i + 1]]]) + + return bc_topology +``` + +Evaluation Details +------------------ +- **Test configurations**: 5 network scenarios + - intra-AWS: Broadcasting within AWS regions + - intra-Azure: Broadcasting within Azure regions + - intra-GCP: Broadcasting within GCP regions + - inter-AGZ: Broadcasting across AWS, GCP, Azure + - inter-GAZ2: Another multi-cloud scenario +- **Network scale**: ~20-50 regions per provider +- **Default setup**: 2 VMs per region +- **Data volume**: Varies by configuration (e.g., 300 GB) +- **Partitions**: Varies by configuration (e.g., 10 partitions) +- **Instance cost**: $0.54/hour (based on m5.8xlarge spot instances) + +Input Format +------------ +The `spec_path` parameter is a string containing the file path to the specification JSON file. 
+ +**spec_path file format:** +```json +{ + "config_files": ["examples/config/intra_aws.json", ...], + "num_vms": 2 +} +``` + +Each config file contains: +```json +{ + "source_node": "aws:ap-northeast-1", + "dest_nodes": ["aws:us-east-1", "aws:eu-west-1", ...], + "data_vol": 300, + "num_partitions": 10, + "ingress_limit": {"aws": 10, "gcp": 16, "azure": 16}, + "egress_limit": {"aws": 5, "gcp": 7, "azure": 16} +} +``` + +Requirements and Constraints +----------------------------- +- All partitions (0 to num_partitions-1) must have valid paths to each destination +- Paths must start from the source node and end at the specified destination +- Self-loops are not allowed +- Different partitions can use different routes to the same destination +- Multiple destinations can share intermediate nodes (tree topology) +- The BroadCastTopology class is provided in the evaluation environment diff --git a/ttt/examples/frontier_cs_tasks/cloudcast/task.yaml b/ttt/examples/frontier_cs_tasks/cloudcast/task.yaml new file mode 100644 index 00000000..304177de --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/cloudcast/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: cloudcast (TTT)" + description: | + Solve the 'cloudcast' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: cloudcast + variant_name: "" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/cloudcast/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/eval/grader.py b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/opencode.json b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/solution.py b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/statement.md b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/statement.md new file mode 100644 index 00000000..3888ca90 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed/statement.md @@ -0,0 +1,146 @@ +SQL Parser Fuzzer Optimization Problem +====================================== + +Problem Setting +--------------- +Design and implement a high-performance fuzzer that maximizes code coverage of a SQL parser within a fixed time budget. This problem focuses on implementing efficient test generation strategies that systematically explore parser behavior. 
+ +The challenge involves optimizing: +- **Input generation**: Efficiently generating or mutating SQL statements to trigger diverse parser paths +- **Coverage exploration**: Discovering inputs that exercise different parser branches and edge cases +- **Time efficiency**: Maximizing coverage achieved within the 60-second time budget +- **Feedback utilization**: Optionally using coverage feedback to guide generation strategies + +Target +------ +- **Primary**: Maximize line coverage percentage of the SQL engine (higher is better) +- **Secondary**: Maximize branch coverage percentage (used as tiebreaker) +- **Tertiary**: Achieve high coverage with fewer parser invocations (efficiency bonus) + +API Specification +----------------- +Implement a `Solution` class that returns fuzzer code: + +```python +class Solution: + def solve(self, resources_path: str) -> dict: + """ + Returns a dict with either: + - {"code": "python_code_string"} + - {"program_path": "path/to/fuzzer.py"} + """ + # Your implementation + pass +``` + +Your fuzzer implementation must provide: + +```python +def fuzz(parse_sql): + """ + Generate SQL statements and execute them through the parser. + + This function will be called repeatedly by the evaluator until the time + budget (60 seconds) is exhausted. Each call should generate a batch of + SQL statements and pass them to parse_sql for execution. + + Args: + parse_sql: A function that accepts a list[str] of SQL statements. + Call parse_sql(["SELECT * FROM t", "INSERT INTO t VALUES (1)"]) + to execute statements through the parser. The parser will + attempt to parse each statement, contributing to coverage. + + Returns: + bool: Return True to continue fuzzing, False to stop early. 
+ The evaluator will keep calling fuzz() until either: + - The time budget (60 seconds) is exhausted + - fuzz() returns False + + Example: + def fuzz(parse_sql): + # Generate some SQL statements + statements = [ + "SELECT * FROM users", + "INSERT INTO orders (id, name) VALUES (1, 'test')", + "UPDATE products SET price = 100 WHERE id = 1", + ] + # Execute through parser (this contributes to coverage) + parse_sql(statements) + return True # Continue fuzzing + """ + pass +``` + +Resources +--------- +The `resources_path` directory contains: +``` +resources/ +├── sql_grammar.txt # BNF-style grammar describing valid SQL syntax +└── sql_engine/ # Target SQL parser package + ├── __init__.py + ├── parser.py # Recursive descent parser + ├── tokenizer.py # SQL tokenizer + └── ast_nodes.py # AST node definitions +``` + +You may explore these resources to understand the parser's structure and develop your fuzzing strategy. Various approaches can be effective: +- Grammar-based generation +- Coverage-guided mutation +- Random testing with heuristics +- Hybrid approaches + +Fuzzer Interface Details +------------------------ +- **parse_sql function**: Accepts `list[str]` of SQL statements + - Each statement is parsed independently + - Exceptions during parsing are caught and do not halt fuzzing + - All executed statements contribute to the cumulative coverage measurement +- **fuzz() calls**: The evaluator calls `fuzz(parse_sql)` repeatedly in a loop + - Multiple calls allow for incremental fuzzing strategies + - Coverage accumulates across all calls + - Return `True` to continue, `False` to stop early +- **Stateful fuzzing**: Your fuzzer can maintain state between calls (e.g., corpus, coverage map) + +Scoring (0-100) +--------------- +Performance is measured based on code coverage achieved: + +``` +# Coverage metrics (0-100 each) +line_coverage = lines_covered / total_lines * 100 +branch_coverage = branches_covered / total_branches * 100 + +# Weighted coverage (0-100) 
+weighted_cov = 0.6 * line_coverage + 0.4 * branch_coverage + +# Non-linear coverage score (0-70 points) +adjusted_cov = (weighted_cov / 100)^3 * 100 +coverage_score = 0.7 * adjusted_cov + +# Efficiency bonus (0-30 points): fewer parser calls = higher bonus +# N = number of parse_sql calls, N_ref = 500 (reference count) +efficiency_bonus = 30 * 2^(-N / N_ref) + +score = coverage_score + efficiency_bonus +``` + +- Coverage determines 70% of the score (non-linear: high coverage is rewarded more) +- Efficiency bonus (30%) rewards achieving coverage with fewer parser invocations +- Achieving high coverage efficiently yields higher scores + +Evaluation Details +------------------ +- **Time Budget**: 60 seconds total for fuzzing execution +- **Coverage Tool**: Python `coverage` module with branch coverage enabled +- **Target Files**: parser.py, tokenizer.py, ast_nodes.py from sql_engine +- **Timing**: Starts when the first `fuzz()` call begins + +Additional Notes +---------------- +- The evaluator handles all coverage measurement; your fuzzer only needs to generate inputs +- Parse errors during fuzzing are expected and do not penalize the score +- The `parse_sql` function catches exceptions internally; your fuzzer won't crash from bad SQL +- Consider generating both valid and edge-case SQL to maximize coverage +- State can be maintained across `fuzz()` calls for incremental exploration + diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/task.yaml b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/task.yaml new file mode 100644 index 00000000..1c771d64 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: grammar_fuzzing (fuzzer_sql) (TTT)" + description: | + Solve the 'grammar_fuzzing (fuzzer_sql)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. 
+ + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 300s. + - Language: python. + +grader: + timeout: 300 + direction: maximize + args: + problem_name: grammar_fuzzing + variant_name: "fuzzer_sql" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/grammar_fuzzing__fuzzer_sql/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/eval/grader.py b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/opencode.json b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/solution.py b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/statement.md b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/statement.md new file mode 100644 index 00000000..5338a041 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed/statement.md @@ -0,0 +1,107 @@ +SQL Parser Test Case Generation +================================ + +Problem Setting +--------------- +Generate SQL test cases that maximize code coverage of a SQL parser. + +You are provided with: +1. **SQL Grammar** (`resources/sql_grammar.txt`): BNF-style grammar describing valid SQL syntax +2. **SQL Engine** (`resources/sql_engine/`): Target SQL parser package containing `parser.py`, `tokenizer.py`, and `ast_nodes.py` + +Your task is to generate SQL statements that achieve maximum code coverage when parsed by the SQL engine. + +Target +------ +- **Primary**: Maximize line coverage percentage of the SQL engine (higher is better) +- **Secondary**: Maximize branch coverage percentage (tiebreaker) + +API Specification +----------------- +Implement a `Solution` class: + +```python +class Solution: + def solve(self, resources_path: str) -> list[str]: + """ + Return SQL test cases designed to maximize parser coverage. 
+ + Args: + resources_path: Path to the resources directory containing: + - sql_grammar.txt: BNF-style grammar file + - sql_engine/: Target SQL parser package (parser.py, tokenizer.py, ast_nodes.py) + + Returns: + list[str]: List of SQL statement strings + """ + pass +``` + +Resources Directory Structure +----------------------------- +The `resources_path` directory contains: +``` +resources/ +├── sql_grammar.txt # BNF-style grammar describing valid SQL syntax +└── sql_engine/ # Target SQL parser package + ├── __init__.py + ├── parser.py # Recursive descent parser + ├── tokenizer.py # SQL tokenizer + └── ast_nodes.py # AST node definitions +``` + +You should explore these files to understand: +- The grammar rules for generating valid SQL +- The parser implementation to understand coverage targets + + +Output Specifications +--------------------- +- Return a list of SQL statement strings +- Each statement is parsed via `parse_sql(statement)` to measure coverage +- Invalid statements (that cause parser exceptions) don't contribute to coverage + +Correctness Requirements +------------------------ +- Statements should be syntactically valid according to the grammar +- The parser is called via `parse_sql(statement)` from the sql_engine package +- Parser exceptions are caught but those statements don't improve coverage + +Scoring (0-100) +--------------- + +``` +# Coverage metrics (0-100 each) +line_coverage = lines_covered / total_lines * 100 +branch_coverage = branches_covered / total_branches * 100 + +# Weighted coverage (0-100) +weighted_cov = 0.6 * line_coverage + 0.4 * branch_coverage + +# Non-linear coverage score (0-70 points) +adjusted_cov = (weighted_cov / 100)^3 * 100 +coverage_score = 0.7 * adjusted_cov + +# Efficiency bonus (0-30 points): fewer test cases = higher bonus +# N = number of test cases, N_ref = 50 (reference count) +efficiency_bonus = 30 * 2^(-N / N_ref) + +score = coverage_score + efficiency_bonus +``` + +- Coverage determines 70% of the score 
(non-linear: high coverage is rewarded more) +- Efficiency bonus (30%) rewards achieving coverage with fewer test cases +- Achieving high coverage with fewer test cases yields higher scores + +Evaluation Details +------------------ +- **Coverage Tool**: Python `coverage` module with branch coverage enabled +- **Target Files**: `parser.py`, `tokenizer.py`, and `ast_nodes.py` in the sql_engine package +- **Measurement**: Each generated statement is parsed and coverage is accumulated + +Additional Notes +---------------- +- You may read and analyze the grammar file and parser source code to understand the coverage targets +- The SQL engine supports various statement types, clauses, expressions, joins, functions, and subqueries +- Focus on generating diverse statements that exercise different code paths in the parser + diff --git a/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/task.yaml b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/task.yaml new file mode 100644 index 00000000..e31cfd76 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: grammar_fuzzing (seed_sql) (TTT)" + description: | + Solve the 'grammar_fuzzing (seed_sql)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 300s. + - Language: python. 
+ +grader: + timeout: 300 + direction: maximize + args: + problem_name: grammar_fuzzing + variant_name: "seed_sql" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/grammar_fuzzing__seed_sql/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/eval/grader.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/opencode.json b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/solution.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/statement.md b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/statement.md new file mode 100644 index 00000000..245dd126 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed/statement.md @@ -0,0 +1,226 @@ +ImageNet Pareto Optimization - 1M Parameter Variant +=================================================== + +Problem Setting +--------------- +Train a neural network on a synthetic ImageNet-like dataset to maximize accuracy while staying within a parameter budget of 1,000,000 parameters. + +Objective: Achieve the highest possible accuracy without exceeding the parameter constraint. + +Target +------ +**Primary**: Maximize test accuracy +**Secondary**: Maintain model efficiency (stay under parameter budget) + +API Specification +---------------- +Implement a `Solution` class: + +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None) -> torch.nn.Module: + """ + Train a model and return it. 
+ + Args: + train_loader: PyTorch DataLoader with training data + val_loader: PyTorch DataLoader with validation data + metadata: Dict with keys: + - num_classes: int (128) + - input_dim: int (384) + - param_limit: int (1,000,000) + - baseline_accuracy: float (0.8) + - train_samples: int + - val_samples: int + - test_samples: int + - device: str ("cpu") + + Returns: + Trained torch.nn.Module ready for evaluation + """ + # Your implementation + pass +``` + +**Implementation Requirements**: +- Use `metadata["input_dim"]` and `metadata["num_classes"]` for model architecture +- Keep model parameters <= 1,000,000 (hard constraint - models exceeding this receive 0 score) +- Return a trained model ready for evaluation +- Ensure model works with the provided device + +Parameter Constraint +-------------------- +**HARD LIMIT: 1,000,000 trainable parameters** + +- This is an absolute constraint enforced during evaluation +- Models exceeding 1,000,000 parameters will receive a score of 0.0 +- The constraint cannot be waived under any circumstances +- You must design your architecture carefully to stay under this limit + +Example: A model with 1,000,001 parameters → Score 0.0 (constraint violated) +Example: A model with 1,000,000 parameters → Score based on accuracy + +Baseline Accuracy +----------------- +**Baseline Accuracy for this variant: 80%** + +- This is the expected performance level for a simple model at this parameter budget +- Solutions must achieve accuracy **above** this baseline to receive a positive score +- Accuracy **below** baseline results in 0 points +- Accuracy improvements are scored linearly + +Scoring Formula +--------------- + +The scoring is based purely on **linear accuracy scaling** from baseline to 100%: + +``` +If model exceeds parameter limit (1,000,000): + Score = 0.0 (constraint violation) + +Else: + Score = (accuracy - 0.8) / (1.0 - 0.8) × 100.0 + + Where: + - accuracy = achieved test accuracy (0.0 to 1.0) + - 0.8 = baseline accuracy for this 
variant + - 1.0 = target (100% accuracy = 100 points) + + Score is clamped to [0, 100] range +``` + +**Linearly Scaled Scoring for 1M variant:** + +| Accuracy | Score | Notes | +|----------|-------|-------| +| 80.0% | 0 | At baseline (0 points) | +| 85.0% | ~25 | 5% above baseline | +| 90.0% | ~50 | 10% above baseline | +| 95.0% | ~75 | 15% above baseline | +| 100% | 100 | Perfect accuracy (max score) | + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. Build Synthetic Dataset +```python +# Generate synthetic ImageNet-like data +train_loader, val_loader, test_loader = make_dataloaders() +# Each sample: (384,) feature vector, label in [0, 127] +``` + +### 2. Call Solution +```python +from solution import Solution +solution = Solution() +model = solution.solve(train_loader, val_loader, metadata) +# metadata contains: num_classes, input_dim, param_limit, baseline_accuracy, device +``` + +### 3. Validate Model +```python +param_count = sum(p.numel() for p in model.parameters() if p.requires_grad) +if param_count > 1000000: + score = 0.0 # Constraint violation +``` + +### 4. Evaluate Accuracy +```python +model.eval() +correct = 0 +total = 0 +for inputs, targets in test_loader: + outputs = model(inputs) + preds = outputs.argmax(dim=1) + correct += (preds == targets).sum().item() + total += targets.numel() +accuracy = correct / total +``` + +### 5. 
Calculate Score +```python +score = (accuracy - 0.8) / (1.0 - 0.8) * 100.0 +score = max(0.0, min(100.0, score)) +``` + +Evaluation Details +------------------ +- 128 classes, 384-dimensional feature vectors +- Training: 2,048 samples (16 per class) +- Validation: 512 samples (4 per class) +- Test: 1,024 samples (8 per class) +- Data generated synthetically with controlled noise + +Environment Details +------------------- +- **Device**: CPU only (`device="cpu"`) +- **Python Environment**: + - Python 3 + - PyTorch 2.2-2.4 + - NumPy ≥1.24 + - tqdm ≥4.64 +- **Timeout**: 1 hour (3600 seconds) for entire evaluation + +Key Points +---------- +1. **Parameter Constraint is Hard**: Models exceeding 1,000,000 parameters always score 0 +2. **Baseline is Lower Bound**: Must achieve 80%+ accuracy to score points +3. **Linear Scoring**: Every accuracy improvement scales linearly to the score +4. **100% is Target**: Achieving 100% accuracy gives full 100 points +5. **Accuracy is Primary**: Focus on accuracy within the parameter budget + +Example: Simple Baseline +------------------------- +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None): + # Simple 2-layer MLP + input_dim = metadata["input_dim"] # 384 + num_classes = metadata["num_classes"] # 128 + hidden_dim = 512 + + model = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, num_classes) + ) + + # Parameter count: 384*512 + 512 + 512*128 + 128 = ~262,784 + + # Simple training loop + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + for epoch in range(50): + model.train() + for inputs, targets in train_loader: + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + return model +``` + +**Note**: This baseline achieves ~80% accuracy with ~263K parameters. 
To reach higher accuracy within the 1M budget, consider deeper networks, residual connections, or better optimization. + +Implementation Tips +------------------- +- Monitor parameter count: `sum(p.numel() for p in model.parameters() if p.requires_grad)` +- Gradually improve architecture while staying under budget +- Use techniques like batch normalization, dropout, or residual connections +- Higher capacity (more parameters) generally improves accuracy up to the limit + +Baseline Performance +-------------------- +- **Baseline Accuracy**: 80% +- **Baseline Parameters**: Approximately 1,000,000 +- This represents a simple model at this parameter budget diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/task.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/task.yaml new file mode 100644 index 00000000..1b57f1d3 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: imagenet_pareto (1m) (TTT)" + description: | + Solve the 'imagenet_pareto (1m)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: imagenet_pareto + variant_name: "1m" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/imagenet_pareto__1m/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/eval/grader.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/opencode.json b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/solution.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/statement.md b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/statement.md new file mode 100644 index 00000000..9c105ef0 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed/statement.md @@ -0,0 +1,226 @@ +ImageNet Pareto Optimization - 200K Parameter Variant +===================================================== + +Problem Setting +--------------- +Train a neural network on a synthetic ImageNet-like dataset to maximize accuracy while staying within a parameter budget of 200,000 parameters. + +Objective: Achieve the highest possible accuracy without exceeding the parameter constraint. + +Target +------ +**Primary**: Maximize test accuracy +**Secondary**: Maintain model efficiency (stay under parameter budget) + +API Specification +---------------- +Implement a `Solution` class: + +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None) -> torch.nn.Module: + """ + Train a model and return it. 
+ + Args: + train_loader: PyTorch DataLoader with training data + val_loader: PyTorch DataLoader with validation data + metadata: Dict with keys: + - num_classes: int (128) + - input_dim: int (384) + - param_limit: int (200,000) + - baseline_accuracy: float (0.65) + - train_samples: int + - val_samples: int + - test_samples: int + - device: str ("cpu") + + Returns: + Trained torch.nn.Module ready for evaluation + """ + # Your implementation + pass +``` + +**Implementation Requirements**: +- Use `metadata["input_dim"]` and `metadata["num_classes"]` for model architecture +- Keep model parameters <= 200,000 (hard constraint - models exceeding this receive 0 score) +- Return a trained model ready for evaluation +- Ensure model works with the provided device + +Parameter Constraint +-------------------- +**HARD LIMIT: 200,000 trainable parameters** + +- This is an absolute constraint enforced during evaluation +- Models exceeding 200,000 parameters will receive a score of 0.0 +- The constraint cannot be waived under any circumstances +- You must design your architecture carefully to stay under this limit + +Example: A model with 200,001 parameters → Score 0.0 (constraint violated) +Example: A model with 200,000 parameters → Score based on accuracy + +Baseline Accuracy +----------------- +**Baseline Accuracy for this variant: 65%** + +- This is the expected performance level for a simple model at this parameter budget +- Solutions must achieve accuracy **above** this baseline to receive a positive score +- Accuracy **below** baseline results in 0 points +- Accuracy improvements are scored linearly + +Scoring Formula +--------------- + +The scoring is based purely on **linear accuracy scaling** from baseline to 100%: + +``` +If model exceeds parameter limit (200,000): + Score = 0.0 (constraint violation) + +Else: + Score = (accuracy - 0.65) / (1.0 - 0.65) × 100.0 + + Where: + - accuracy = achieved test accuracy (0.0 to 1.0) + - 0.65 = baseline accuracy for this variant + 
- 1.0 = target (100% accuracy = 100 points) + + Score is clamped to [0, 100] range +``` + +**Linearly Scaled Scoring for 200K variant:** + +| Accuracy | Score | Notes | +|----------|-------|-------| +| 65.0% | 0 | At baseline (0 points) | +| 70.0% | ~14 | 5% above baseline | +| 75.0% | ~28 | 10% above baseline | +| 80.0% | ~42 | 15% above baseline | +| 100% | 100 | Perfect accuracy (max score) | + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. Build Synthetic Dataset +```python +# Generate synthetic ImageNet-like data +train_loader, val_loader, test_loader = make_dataloaders() +# Each sample: (384,) feature vector, label in [0, 127] +``` + +### 2. Call Solution +```python +from solution import Solution +solution = Solution() +model = solution.solve(train_loader, val_loader, metadata) +# metadata contains: num_classes, input_dim, param_limit, baseline_accuracy, device +``` + +### 3. Validate Model +```python +param_count = sum(p.numel() for p in model.parameters() if p.requires_grad) +if param_count > 200000: + score = 0.0 # Constraint violation +``` + +### 4. Evaluate Accuracy +```python +model.eval() +correct = 0 +total = 0 +for inputs, targets in test_loader: + outputs = model(inputs) + preds = outputs.argmax(dim=1) + correct += (preds == targets).sum().item() + total += targets.numel() +accuracy = correct / total +``` + +### 5. 
Calculate Score +```python +score = (accuracy - 0.65) / (1.0 - 0.65) * 100.0 +score = max(0.0, min(100.0, score)) +``` + +Evaluation Details +------------------ +- 128 classes, 384-dimensional feature vectors +- Training: 2,048 samples (16 per class) +- Validation: 512 samples (4 per class) +- Test: 1,024 samples (8 per class) +- Data generated synthetically with controlled noise + +Environment Details +------------------- +- **Device**: CPU only (`device="cpu"`) +- **Python Environment**: + - Python 3 + - PyTorch 2.2-2.4 + - NumPy ≥1.24 + - tqdm ≥4.64 +- **Timeout**: 1 hour (3600 seconds) for entire evaluation + +Key Points +---------- +1. **Parameter Constraint is Hard**: Models exceeding 200,000 parameters always score 0 +2. **Baseline is Lower Bound**: Must achieve 65%+ accuracy to score points +3. **Linear Scoring**: Every accuracy improvement scales linearly to the score +4. **100% is Target**: Achieving 100% accuracy gives full 100 points +5. **Accuracy is Primary**: Focus on accuracy within the parameter budget + +Example: Simple Baseline +------------------------- +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None): + # Simple 2-layer MLP + input_dim = metadata["input_dim"] # 384 + num_classes = metadata["num_classes"] # 128 + hidden_dim = 256 + + model = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, num_classes) + ) + + # Parameter count: 384*256 + 256 + 256*128 + 128 = ~131,456 + + # Simple training loop + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + for epoch in range(50): + model.train() + for inputs, targets in train_loader: + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + return model +``` + +**Note**: This baseline achieves ~65% accuracy with ~131K parameters. 
To reach higher accuracy within the 200K budget, consider deeper networks or better optimization. + +Implementation Tips +------------------- +- Monitor parameter count: `sum(p.numel() for p in model.parameters() if p.requires_grad)` +- Gradually improve architecture while staying under budget +- Use techniques like batch normalization, dropout, or residual connections +- Higher capacity (more parameters) generally improves accuracy up to the limit + +Baseline Performance +-------------------- +- **Baseline Accuracy**: 65% +- **Baseline Parameters**: Approximately 200,000 +- This represents a simple model at this parameter budget diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/task.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/task.yaml new file mode 100644 index 00000000..b41df9b1 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: imagenet_pareto (200k) (TTT)" + description: | + Solve the 'imagenet_pareto (200k)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: imagenet_pareto + variant_name: "200k" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/imagenet_pareto__200k/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/eval/grader.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/opencode.json b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/solution.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/statement.md b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/statement.md new file mode 100644 index 00000000..47c46cef --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed/statement.md @@ -0,0 +1,228 @@ +ImageNet Pareto Optimization - 2.5M Parameter Variant +===================================================== + +Problem Setting +--------------- +Train a neural network on a synthetic ImageNet-like dataset to maximize accuracy while staying within a parameter budget of 2,500,000 parameters. + +Objective: Achieve the highest possible accuracy without exceeding the parameter constraint. + +Target +------ +**Primary**: Maximize test accuracy +**Secondary**: Maintain model efficiency (stay under parameter budget) + +API Specification +---------------- +Implement a `Solution` class: + +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None) -> torch.nn.Module: + """ + Train a model and return it. 
+ + Args: + train_loader: PyTorch DataLoader with training data + val_loader: PyTorch DataLoader with validation data + metadata: Dict with keys: + - num_classes: int (128) + - input_dim: int (384) + - param_limit: int (2,500,000) + - baseline_accuracy: float (0.85) + - train_samples: int + - val_samples: int + - test_samples: int + - device: str ("cpu") + + Returns: + Trained torch.nn.Module ready for evaluation + """ + # Your implementation + pass +``` + +**Implementation Requirements**: +- Use `metadata["input_dim"]` and `metadata["num_classes"]` for model architecture +- Keep model parameters <= 2,500,000 (hard constraint - models exceeding this receive 0 score) +- Return a trained model ready for evaluation +- Ensure model works with the provided device + +Parameter Constraint +-------------------- +**HARD LIMIT: 2,500,000 trainable parameters** + +- This is an absolute constraint enforced during evaluation +- Models exceeding 2,500,000 parameters will receive a score of 0.0 +- The constraint cannot be waived under any circumstances +- You must design your architecture carefully to stay under this limit + +Example: A model with 2,500,001 parameters → Score 0.0 (constraint violated) +Example: A model with 2,500,000 parameters → Score based on accuracy + +Baseline Accuracy +----------------- +**Baseline Accuracy for this variant: 85%** + +- This is the expected performance level for a simple model at this parameter budget +- Solutions must achieve accuracy **above** this baseline to receive a positive score +- Accuracy **below** baseline results in 0 points +- Accuracy improvements are scored linearly + +Scoring Formula +--------------- + +The scoring is based purely on **linear accuracy scaling** from baseline to 100%: + +``` +If model exceeds parameter limit (2,500,000): + Score = 0.0 (constraint violation) + +Else: + Score = (accuracy - 0.85) / (1.0 - 0.85) × 100.0 + + Where: + - accuracy = achieved test accuracy (0.0 to 1.0) + - 0.85 = baseline accuracy for 
this variant + - 1.0 = target (100% accuracy = 100 points) + + Score is clamped to [0, 100] range +``` + +**Linearly Scaled Scoring for 2.5M variant:** + +| Accuracy | Score | Notes | +|----------|-------|-------| +| 85.0% | 0 | At baseline (0 points) | +| 90.0% | ~33 | 5% above baseline | +| 95.0% | ~66 | 10% above baseline | +| 97.5% | ~83 | 12.5% above baseline | +| 100% | 100 | Perfect accuracy (max score) | + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. Build Synthetic Dataset +```python +# Generate synthetic ImageNet-like data +train_loader, val_loader, test_loader = make_dataloaders() +# Each sample: (384,) feature vector, label in [0, 127] +``` + +### 2. Call Solution +```python +from solution import Solution +solution = Solution() +model = solution.solve(train_loader, val_loader, metadata) +# metadata contains: num_classes, input_dim, param_limit, baseline_accuracy, device +``` + +### 3. Validate Model +```python +param_count = sum(p.numel() for p in model.parameters() if p.requires_grad) +if param_count > 2500000: + score = 0.0 # Constraint violation +``` + +### 4. Evaluate Accuracy +```python +model.eval() +correct = 0 +total = 0 +for inputs, targets in test_loader: + outputs = model(inputs) + preds = outputs.argmax(dim=1) + correct += (preds == targets).sum().item() + total += targets.numel() +accuracy = correct / total +``` + +### 5.
Calculate Score +```python +score = (accuracy - 0.85) / (1.0 - 0.85) * 100.0 +score = max(0.0, min(100.0, score)) +``` + +Evaluation Details +------------------ +- 128 classes, 384-dimensional feature vectors +- Training: 2,048 samples (16 per class) +- Validation: 512 samples (4 per class) +- Test: 1,024 samples (8 per class) +- Data generated synthetically with controlled noise + +Environment Details +------------------- +- **Device**: CPU only (`device="cpu"`) +- **Python Environment**: + - Python 3 + - PyTorch 2.2-2.4 + - NumPy ≥1.24 + - tqdm ≥4.64 +- **Timeout**: 1 hour (3600 seconds) for entire evaluation + +Key Points +---------- +1. **Parameter Constraint is Hard**: Models exceeding 2,500,000 parameters always score 0 +2. **Baseline is Lower Bound**: Must achieve 85%+ accuracy to score points +3. **Linear Scoring**: Every accuracy improvement scales linearly to the score +4. **100% is Target**: Achieving 100% accuracy gives full 100 points +5. **Accuracy is Primary**: Focus on accuracy within the parameter budget + +Example: Simple Baseline +------------------------- +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None): + # Simple 3-layer MLP + input_dim = metadata["input_dim"] # 384 + num_classes = metadata["num_classes"] # 128 + hidden_dim = 1024 + + model = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, num_classes) + ) + + # Parameter count: 384*1024 + 1024 + 1024*1024 + 1024 + 1024*128 + 128 = ~1,577,728 + + # Simple training loop + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + for epoch in range(50): + model.train() + for inputs, targets in train_loader: + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + return model +``` + +**Note**: This baseline 
achieves ~85% accuracy with ~1.58M parameters. To reach higher accuracy within the 2.5M budget, consider deeper networks or better optimization. + +Implementation Tips +------------------- +- Monitor parameter count: `sum(p.numel() for p in model.parameters() if p.requires_grad)` +- Gradually improve architecture while staying under budget +- Use techniques like batch normalization, dropout, or residual connections +- Higher capacity (more parameters) generally improves accuracy up to the limit + +Baseline Performance +-------------------- +- **Baseline Accuracy**: 85% +- **Baseline Parameters**: Approximately 2,500,000 +- This represents a simple model at this parameter budget diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/task.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/task.yaml new file mode 100644 index 00000000..3c70a289 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: imagenet_pareto (2_5m) (TTT)" + description: | + Solve the 'imagenet_pareto (2_5m)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: imagenet_pareto + variant_name: "2_5m" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/imagenet_pareto__2_5m/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/eval/grader.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/opencode.json b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/solution.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/statement.md b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/statement.md new file mode 100644 index 00000000..c3d35068 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed/statement.md @@ -0,0 +1,226 @@ +ImageNet Pareto Optimization - 500K Parameter Variant +===================================================== + +Problem Setting +--------------- +Train a neural network on a synthetic ImageNet-like dataset to maximize accuracy while staying within a parameter budget of 500,000 parameters. + +Objective: Achieve the highest possible accuracy without exceeding the parameter constraint. + +Target +------ +**Primary**: Maximize test accuracy +**Secondary**: Maintain model efficiency (stay under parameter budget) + +API Specification +---------------- +Implement a `Solution` class: + +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None) -> torch.nn.Module: + """ + Train a model and return it. 
+ + Args: + train_loader: PyTorch DataLoader with training data + val_loader: PyTorch DataLoader with validation data + metadata: Dict with keys: + - num_classes: int (128) + - input_dim: int (384) + - param_limit: int (500,000) + - baseline_accuracy: float (0.72) + - train_samples: int + - val_samples: int + - test_samples: int + - device: str ("cpu") + + Returns: + Trained torch.nn.Module ready for evaluation + """ + # Your implementation + pass +``` + +**Implementation Requirements**: +- Use `metadata["input_dim"]` and `metadata["num_classes"]` for model architecture +- Keep model parameters <= 500,000 (hard constraint - models exceeding this receive 0 score) +- Return a trained model ready for evaluation +- Ensure model works with the provided device + +Parameter Constraint +-------------------- +**HARD LIMIT: 500,000 trainable parameters** + +- This is an absolute constraint enforced during evaluation +- Models exceeding 500,000 parameters will receive a score of 0.0 +- The constraint cannot be waived under any circumstances +- You must design your architecture carefully to stay under this limit + +Example: A model with 500,001 parameters → Score 0.0 (constraint violated) +Example: A model with 500,000 parameters → Score based on accuracy + +Baseline Accuracy +----------------- +**Baseline Accuracy for this variant: 72%** + +- This is the expected performance level for a simple model at this parameter budget +- Solutions must achieve accuracy **above** this baseline to receive a positive score +- Accuracy **below** baseline results in 0 points +- Accuracy improvements are scored linearly + +Scoring Formula +--------------- + +The scoring is based purely on **linear accuracy scaling** from baseline to 100%: + +``` +If model exceeds parameter limit (500,000): + Score = 0.0 (constraint violation) + +Else: + Score = (accuracy - 0.72) / (1.0 - 0.72) × 100.0 + + Where: + - accuracy = achieved test accuracy (0.0 to 1.0) + - 0.72 = baseline accuracy for this variant + 
- 1.0 = target (100% accuracy = 100 points) + + Score is clamped to [0, 100] range +``` + +**Linearly Scaled Scoring for 500K variant:** + +| Accuracy | Score | Notes | +|----------|-------|-------| +| 72.0% | 0 | At baseline (0 points) | +| 77.0% | ~17 | 5% above baseline | +| 82.0% | ~35 | 10% above baseline | +| 87.0% | ~53 | 15% above baseline | +| 100% | 100 | Perfect accuracy (max score) | + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. Build Synthetic Dataset +```python +# Generate synthetic ImageNet-like data +train_loader, val_loader, test_loader = make_dataloaders() +# Each sample: (384,) feature vector, label in [0, 127] +``` + +### 2. Call Solution +```python +from solution import Solution +solution = Solution() +model = solution.solve(train_loader, val_loader, metadata) +# metadata contains: num_classes, input_dim, param_limit, baseline_accuracy, device +``` + +### 3. Validate Model +```python +param_count = sum(p.numel() for p in model.parameters() if p.requires_grad) +if param_count > 500000: + score = 0.0 # Constraint violation +``` + +### 4. Evaluate Accuracy +```python +model.eval() +correct = 0 +total = 0 +for inputs, targets in test_loader: + outputs = model(inputs) + preds = outputs.argmax(dim=1) + correct += (preds == targets).sum().item() + total += targets.numel() +accuracy = correct / total +``` + +### 5. 
Calculate Score +```python +score = (accuracy - 0.72) / (1.0 - 0.72) * 100.0 +score = max(0.0, min(100.0, score)) +``` + +Evaluation Details +------------------ +- 128 classes, 384-dimensional feature vectors +- Training: 2,048 samples (16 per class) +- Validation: 512 samples (4 per class) +- Test: 1,024 samples (8 per class) +- Data generated synthetically with controlled noise + +Environment Details +------------------- +- **Device**: CPU only (`device="cpu"`) +- **Python Environment**: + - Python 3 + - PyTorch 2.2-2.4 + - NumPy ≥1.24 + - tqdm ≥4.64 +- **Timeout**: 1 hour (3600 seconds) for entire evaluation + +Key Points +---------- +1. **Parameter Constraint is Hard**: Models exceeding 500,000 parameters always score 0 +2. **Baseline is Lower Bound**: Must achieve 72%+ accuracy to score points +3. **Linear Scoring**: Every accuracy improvement scales linearly to the score +4. **100% is Target**: Achieving 100% accuracy gives full 100 points +5. **Accuracy is Primary**: Focus on accuracy within the parameter budget + +Example: Simple Baseline +------------------------- +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None): + # Simple 2-layer MLP + input_dim = metadata["input_dim"] # 384 + num_classes = metadata["num_classes"] # 128 + hidden_dim = 384 + + model = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, num_classes) + ) + + # Parameter count: 384*384 + 384 + 384*128 + 128 = ~196,992 + + # Simple training loop + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + for epoch in range(50): + model.train() + for inputs, targets in train_loader: + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + return model +``` + +**Note**: This baseline achieves ~72% accuracy with ~197K parameters. 
To reach higher accuracy within the 500K budget, consider deeper networks or better optimization. + +Implementation Tips +------------------- +- Monitor parameter count: `sum(p.numel() for p in model.parameters() if p.requires_grad)` +- Gradually improve architecture while staying under budget +- Use techniques like batch normalization, dropout, or residual connections +- Higher capacity (more parameters) generally improves accuracy up to the limit + +Baseline Performance +-------------------- +- **Baseline Accuracy**: 72% +- **Baseline Parameters**: Approximately 500,000 +- This represents a simple model at this parameter budget diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/task.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/task.yaml new file mode 100644 index 00000000..9c4c39d4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: imagenet_pareto (500k) (TTT)" + description: | + Solve the 'imagenet_pareto (500k)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: imagenet_pareto + variant_name: "500k" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/imagenet_pareto__500k/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/eval/grader.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/opencode.json b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/solution.py b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/statement.md b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/statement.md new file mode 100644 index 00000000..3e3ebba3 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed/statement.md @@ -0,0 +1,230 @@ +ImageNet Pareto Optimization - 5M Parameter Variant +=================================================== + +Problem Setting +--------------- +Train a neural network on a synthetic ImageNet-like dataset to maximize accuracy while staying within a parameter budget of 5,000,000 parameters. + +Objective: Achieve the highest possible accuracy without exceeding the parameter constraint. + +Target +------ +**Primary**: Maximize test accuracy +**Secondary**: Maintain model efficiency (stay under parameter budget) + +API Specification +---------------- +Implement a `Solution` class: + +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None) -> torch.nn.Module: + """ + Train a model and return it. 
+ + Args: + train_loader: PyTorch DataLoader with training data + val_loader: PyTorch DataLoader with validation data + metadata: Dict with keys: + - num_classes: int (128) + - input_dim: int (384) + - param_limit: int (5,000,000) + - baseline_accuracy: float (0.88) + - train_samples: int + - val_samples: int + - test_samples: int + - device: str ("cpu") + + Returns: + Trained torch.nn.Module ready for evaluation + """ + # Your implementation + pass +``` + +**Implementation Requirements**: +- Use `metadata["input_dim"]` and `metadata["num_classes"]` for model architecture +- Keep model parameters <= 5,000,000 (hard constraint - models exceeding this receive 0 score) +- Return a trained model ready for evaluation +- Ensure model works with the provided device + +Parameter Constraint +-------------------- +**HARD LIMIT: 5,000,000 trainable parameters** + +- This is an absolute constraint enforced during evaluation +- Models exceeding 5,000,000 parameters will receive a score of 0.0 +- The constraint cannot be waived under any circumstances +- You must design your architecture carefully to stay under this limit + +Example: A model with 5,000,001 parameters → Score 0.0 (constraint violated) +Example: A model with 5,000,000 parameters → Score based on accuracy + +Baseline Accuracy +----------------- +**Baseline Accuracy for this variant: 88%** + +- This is the expected performance level for a simple model at this parameter budget +- Solutions must achieve accuracy **above** this baseline to receive a positive score +- Accuracy **below** baseline results in 0 points +- Accuracy improvements are scored linearly + +Scoring Formula +--------------- + +The scoring is based purely on **linear accuracy scaling** from baseline to 100%: + +``` +If model exceeds parameter limit (5,000,000): + Score = 0.0 (constraint violation) + +Else: + Score = (accuracy - 0.88) / (1.0 - 0.88) × 100.0 + + Where: + - accuracy = achieved test accuracy (0.0 to 1.0) + - 0.88 = baseline accuracy for 
this variant + - 1.0 = target (100% accuracy = 100 points) + + Score is clamped to [0, 100] range +``` + +**Linearly Scaled Scoring for 5M variant:** + +| Accuracy | Score | Notes | +|----------|-------|-------| +| 88.0% | 0 | At baseline (0 points) | +| 93.0% | ~41 | 5% above baseline | +| 98.0% | ~83 | 10% above baseline | +| 99.0% | ~92 | 11% above baseline | +| 100% | 100 | Perfect accuracy (max score) | + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. Build Synthetic Dataset +```python +# Generate synthetic ImageNet-like data +train_loader, val_loader, test_loader = make_dataloaders() +# Each sample: (384,) feature vector, label in [0, 127] +``` + +### 2. Call Solution +```python +from solution import Solution +solution = Solution() +model = solution.solve(train_loader, val_loader, metadata) +# metadata contains: num_classes, input_dim, param_limit, baseline_accuracy, device +``` + +### 3. Validate Model +```python +param_count = sum(p.numel() for p in model.parameters() if p.requires_grad) +if param_count > 5000000: + score = 0.0 # Constraint violation +``` + +### 4. Evaluate Accuracy +```python +model.eval() +correct = 0 +total = 0 +for inputs, targets in test_loader: + outputs = model(inputs) + preds = outputs.argmax(dim=1) + correct += (preds == targets).sum().item() + total += targets.numel() +accuracy = correct / total +``` + +### 5.
Calculate Score +```python +score = (accuracy - 0.88) / (1.0 - 0.88) * 100.0 +score = max(0.0, min(100.0, score)) +``` + +Evaluation Details +------------------ +- 128 classes, 384-dimensional feature vectors +- Training: 2,048 samples (16 per class) +- Validation: 512 samples (4 per class) +- Test: 1,024 samples (8 per class) +- Data generated synthetically with controlled noise + +Environment Details +------------------- +- **Device**: CPU only (`device="cpu"`) +- **Python Environment**: + - Python 3 + - PyTorch 2.2-2.4 + - NumPy ≥1.24 + - tqdm ≥4.64 +- **Timeout**: 1 hour (3600 seconds) for entire evaluation + +Key Points +---------- +1. **Parameter Constraint is Hard**: Models exceeding 5,000,000 parameters always score 0 +2. **Baseline is Lower Bound**: Must achieve 88%+ accuracy to score points +3. **Linear Scoring**: Every accuracy improvement scales linearly to the score +4. **100% is Target**: Achieving 100% accuracy gives full 100 points +5. **Accuracy is Primary**: Focus on accuracy within the parameter budget + +Example: Simple Baseline +------------------------- +```python +import torch +import torch.nn as nn + +class Solution: + def solve(self, train_loader, val_loader, metadata: dict = None): + # Simple 4-layer MLP + input_dim = metadata["input_dim"] # 384 + num_classes = metadata["num_classes"] # 128 + hidden_dim = 1536 + + model = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, num_classes) + ) + + # Parameter count: ~4.9M + + # Simple training loop + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + for epoch in range(50): + model.train() + for inputs, targets in train_loader: + optimizer.zero_grad() + outputs = model(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + return model +``` + +**Note**: This baseline achieves ~88% 
accuracy with ~4.9M parameters. To reach higher accuracy within the 5M budget, consider deeper networks or better optimization. + +Implementation Tips +------------------- +- Monitor parameter count: `sum(p.numel() for p in model.parameters() if p.requires_grad)` +- Gradually improve architecture while staying under budget +- Use techniques like batch normalization, dropout, or residual connections +- Higher capacity (more parameters) generally improves accuracy up to the limit + +Baseline Performance +-------------------- +- **Baseline Accuracy**: 88% +- **Baseline Parameters**: Approximately 5,000,000 +- This represents a simple model at this parameter budget diff --git a/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/task.yaml b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/task.yaml new file mode 100644 index 00000000..54ff5e3a --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: imagenet_pareto (5m) (TTT)" + description: | + Solve the 'imagenet_pareto (5m)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: imagenet_pareto + variant_name: "5m" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/imagenet_pareto__5m/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/llm_router/eval/grader.py b/ttt/examples/frontier_cs_tasks/llm_router/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_router/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return 
self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/llm_router/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/llm_router/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_router/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/llm_router/seed/opencode.json b/ttt/examples/frontier_cs_tasks/llm_router/seed/opencode.json new file mode 100644 index 00000000..3dbee36e 
--- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_router/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/llm_router/seed/solution.py b/ttt/examples/frontier_cs_tasks/llm_router/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_router/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/llm_router/seed/statement.md b/ttt/examples/frontier_cs_tasks/llm_router/seed/statement.md new file mode 100644 index 00000000..1ee5c089 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_router/seed/statement.md @@ -0,0 +1,143 @@ +LLM Router +================================ + +Overview +-------- +This benchmark evaluates a language model's ability to implement an LLM routing policy. 
Given a user query, the router must choose one model from a small candidate set with different cost–quality tradeoffs. The goal is to maximize accuracy while minimizing inference cost. The task is fully offline: model correctness and costs are precomputed. The router must generalize from query text alone. + +Problem Setting +-------- +You operate a router that sits in front of a pool of large language models (LLMs). For each incoming query q, the router must select exactly one model from a fixed candidate set: ["cheap", "mid", "expensive"]. + +These are abstract routing tiers. Each tier corresponds to a concrete LLM with a known cost and accuracy profile, but this mapping is not visible to the router. Intuitively: +- cheap: fast and inexpensive, but less reliable +- mid: moderate cost and accuracy +- expensive: highest accuracy, highest cost +No single model is optimal for all queries. + +You have access to a reference dataset of queries, each labeled with which concrete LLMs produced correct answers and their costs. During evaluation, the router must generalize to unseen queries, selecting the best model from the candidate set based on the query text alone. + +You are allowed to develop heuristics or machine learning models to implement the routing policy. However, the solution must be stateless: each query is handled independently without memory of previous queries. + +Target +-------- +The goal is to achieve high accuracy while minimizing average inference cost. + +API Specification +-------- +Implement a `Solution` class: + +```python +class Solution: + def solve(self, query: str, eval_name: str, candidate_models: list[str] + ) -> str: + """ + Select exactly one routing option for the given query. + + Args: + query: The user query. + eval_name: The dataset or task name (e.g., "mbpp"). + candidate_models: A list of available routing options + (["cheap", "mid", "expensive"] by default). 
+ + Returns: + A single string from candidate_models indicating + the chosen model. + """ +``` + +**Constraints**: +- The return value must be an element of candidate_models. +- The method is called once per query. +- The solution must be stateless across queries. +- External API calls and internet access are not allowed. + +Returning an invalid value results in a score of 0 for that query. + +Dataset +-------- +You will be provided with a dataset of queries, each associated with multiple concrete LLMs, whether they generate correct answers, and costs. + +During evaluation, there will be a separate evaluation dataset. For each query in this dataset, the router receives only: +- query +- eval_name +- candidate_models + +One example mapping of routing tiers to concrete LLMs is: +- "cheap": "mistralai/mistral-7b-chat", +- "mid": "mistralai/mixtral-8x7b-chat", +- "expensive": "gpt-4-1106-preview". + +Scoring (0-100) +-------- +The router is evaluated on a fixed set of queries. + +For each query: +- The evaluator calls Solution.solve(...). +- The chosen model's correctness and cost are looked up. +- Accuracy and cost are accumulated. + +Let: +- accuracy = fraction of queries answered correctly +- avg_cost = average inference cost per query + +The raw score is computed as: raw_score = accuracy − λ × avg_cost, where λ = 150.0. Naively guessing "cheap"/"mid"/"expensive" all the time is expected to yield a uniformly low score. + +The final benchmark score is normalized to the range [0, 100], where the oracle router always gets 100. 
+ +Reference Dataset +-------- +The reference dataset is provided as a CSV file that your solution can read at runtime: + +```python +import pandas as pd +import os + +# Get the directory where this solution file is located +# The resources/ folder is in the problem directory +problem_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +data_path = os.path.join(problem_dir, "resources", "reference_data.csv") +# Or simply use relative path (current working directory is the problem directory): +data_path = "resources/reference_data.csv" + +df = pd.read_csv(data_path) +``` + +**Columns:** +- `sample_id`: Unique identifier (e.g., "mmlu-sociology.val.78") +- `prompt`: The query text (may contain newlines, escaped as \n) +- `eval_name`: Dataset/task name (e.g., "mbpp", "mmlu-sociology", "hellaswag") +- `{model_name}`: Correctness score (0.0 or 1.0) for each LLM +- `{model_name}|model_response`: The actual response text from each LLM +- `{model_name}|total_cost`: Inference cost for each LLM +- `oracle_model_to_route_to`: The optimal model for this query + +**Models in dataset:** +- WizardLM/WizardLM-13B-V1.2 +- claude-instant-v1, claude-v1, claude-v2 +- gpt-3.5-turbo-1106, gpt-4-1106-preview +- meta/code-llama-instruct-34b-chat, meta/llama-2-70b-chat +- mistralai/mistral-7b-chat, mistralai/mixtral-8x7b-chat +- zero-one-ai/Yi-34B-Chat + +**Example row (key columns only):** +``` +sample_id: mmlu-sociology.val.78 +prompt: "['Please answer with the letter...Which of the following best describes...?\nA) Ethnocentrism\nB) Institutionalization\nC) Stereotyping\nD) Scapegoating\n...']" +eval_name: mmlu-sociology + +# Correctness (1.0 = correct, 0.0 = wrong): +mistralai/mistral-7b-chat: 1.0 +mistralai/mixtral-8x7b-chat: 1.0 +gpt-4-1106-preview: 1.0 +WizardLM/WizardLM-13B-V1.2: 1.0 + +# Costs: +mistralai/mistral-7b-chat|total_cost: 1.74e-05 +mistralai/mixtral-8x7b-chat|total_cost: 6.75e-05 +gpt-4-1106-preview|total_cost: 0.00088 + +oracle_model_to_route_to: 
mistralai/mistral-7b-chat +``` + +In this example, all models answered correctly, but mistral-7b-chat has the lowest cost, so it's the oracle choice. diff --git a/ttt/examples/frontier_cs_tasks/llm_router/task.yaml b/ttt/examples/frontier_cs_tasks/llm_router/task.yaml new file mode 100644 index 00000000..60aee254 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_router/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: llm_router (TTT)" + description: | + Solve the 'llm_router' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: llm_router + variant_name: "" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/llm_router/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__large/eval/grader.py b/ttt/examples/frontier_cs_tasks/llm_sql__large/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__large/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/opencode.json b/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/solution.py 
b/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/statement.md b/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/statement.md new file mode 100644 index 00000000..c7ce329b --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__large/seed/statement.md @@ -0,0 +1,147 @@ +Problem Setting +--------------- + +Consider a CSV file with $N$ rows and $M$ columns, where $M \leq 10$. We feed each row to an LLM inference engine (with a prefix KV cache) by concatenating all column values in that row. For the $i$-th row with entries $A[i,1], A[i,2], \ldots, A[i,M]$, we construct the input string: + +```math +S_i = \text{Concat}(\text{string}(A[i,1]), \text{string}(A[i,2]), \ldots, \text{string}(A[i,M])) +```` + +When requesting $S_i$ for $i > 1$, the prefix KV-cache hit rate depends on the longest common prefix with any previously seen request: + +```math +\text{hit\_rate}_i = +\frac{\max_{1 \le j < i} \text{LCP}(S_i, S_j)}{|S_i|} +``` + +where $LCP(S, T)$ is the length of the longest common prefix between strings $S$ and $T$. + +You are allowed to reorder the CSV columns. Let $p$ be a permutation of $\{1, 2, ..., M\}$. 
The reordered string for row $i$ becomes: + +```math +S'_i = \text{Concat}(\text{string}(A[i,p_1]), \text{string}(A[i,p_2]), \ldots, \text{string}(A[i,p_M])) +``` + +The goal is to choose a permutation $p$ that maximizes the overall KV-cache hit rate: + +```math +\max_p\; +\frac{\sum_{i=2}^N \max_{1 \le j < i} \text{LCP}(S'_i, S'_j)} + {\sum_{i=1}^N |S'_i|} +``` + + + + +Target +--- +Maximize prefix hit rate shown above (higher is better) + +- **Hard Constraint**: Average runtime per dataset must be $\leq 10$ seconds (score = 0 if exceeded) and correctly handle column merge constraint. + +**Column Merges**: +- Column merge specs are provided per dataset +- Columns in each merge group are concatenated into a single column +- The merged column replaces the original columns +- Merge operations are applied before column reordering + +API Specification +--- +Implement a `Solution` class: + +```python +import pandas as pd + +class Solution: + def solve( + self, + df: pd.DataFrame, + early_stop: int = 100000, + row_stop: int = 4, + col_stop: int = 2, + col_merge: list = None, + one_way_dep: list = None, + distinct_value_threshold: float = 0.7, + parallel: bool = True, + ) -> pd.DataFrame: + """ + Reorder columns in the DataFrame to maximize prefix hit rate. + + Args: + df: Input DataFrame to optimize + early_stop: Early stopping parameter (default: 100000) + row_stop: Row stopping parameter (default: 4) + col_stop: Column stopping parameter (default: 2) + col_merge: List of column groups to merge (columns in each group are merged into one) + one_way_dep: List of one-way dependencies (not used in this variant) + distinct_value_threshold: Threshold for distinct values (default: 0.7) + parallel: Whether to use parallel processing (default: True) + + Returns: + DataFrame with reordered columns (same rows, different column order) + """ + # Your implementation + pass +``` + +**Evaluation Process**: +1. Column merges are applied if specified +2. 
Your `solve()` method reorders the remaining columns +3. Rows are concatenated (no spaces) and prefix hit rate is calculated + +Scoring (0-100) +--- + +baseline_hit_rate = Average prefix hit rate using original column order (0-point anchor) +avg_hit_rate = Your solution's average prefix hit rate across all datasets + +For each dataset: + dataset_score = ((hit_rate - baseline_hit_rate) / (1.0 - baseline_hit_rate)) × 100 + +final_score = Average of individual dataset scores + +Score is clamped to [0, 100] range + + +**Runtime Constraint**: +- Average runtime per dataset must be ≤ 10 seconds +- If average runtime exceeds 10 seconds, score = 0.0 + +**Scoring Examples**: +- baseline_hit_rate = 0.0 (worst), avg_hit_rate = 1.0 (perfect) → Score = 100 +- baseline_hit_rate = 0.5, avg_hit_rate = 0.5 → Score = 0 +- baseline_hit_rate = 0.5, avg_hit_rate = 0.75 → Score = 50 +- baseline_hit_rate = 0.5, avg_hit_rate = 1.0 → Score = 100 + +Implementation Notes +--- +- Row values are concatenated without spaces: `"".join(row.values)` +- Column reordering should optimize for maximum prefix overlap in the concatenated string representation +- Consider column dependencies, distinct value distributions, and merge requirements when reordering +- Large datasets with $M > 10$ columns require efficient algorithms due to larger search space +- In our larger dataset, $50k \leq N \leq 100k$ and $4 \leq M \leq 9$ + +**Example input** +please ignore the $> 10$ column number here +--- +```csv +ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month +1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1 +2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1 +3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0 
+4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0 +5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0 +... +``` + +**Example output** +--- +``` +ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month +1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1 +2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1 +3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0 +4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0 +5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0 +``` +($p$ = $1, 2, \ldots M$) diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__large/task.yaml b/ttt/examples/frontier_cs_tasks/llm_sql__large/task.yaml new file mode 100644 index 00000000..45d07104 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__large/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: llm_sql (large) (TTT)" + description: | + Solve the 'llm_sql (large)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: llm_sql + variant_name: "large" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/llm_sql__large/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__small/eval/grader.py b/ttt/examples/frontier_cs_tasks/llm_sql__small/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__small/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return 
self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/opencode.json b/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/opencode.json new file mode 100644 index 
00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/solution.py b/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/statement.md b/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/statement.md new file mode 100644 index 00000000..499d1630 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__small/seed/statement.md @@ -0,0 +1,147 @@ +Problem Setting +--------------- + +Consider a CSV file with $N$ rows and $M$ columns, where $M \leq 10$. 
We feed each row to an LLM inference engine (with a prefix KV cache) by concatenating all column values in that row. For the $i$-th row with entries $A[i,1], A[i,2], \ldots, A[i,M]$, we construct the input string: + +```math +S_i = \text{Concat}(\text{string}(A[i,1]), \text{string}(A[i,2]), \ldots, \text{string}(A[i,M])) +```` + +When requesting $S_i$ for $i > 1$, the prefix KV-cache hit rate depends on the longest common prefix with any previously seen request: + +```math +\text{hit\_rate}_i = +\frac{\max_{1 \le j < i} \text{LCP}(S_i, S_j)}{|S_i|} +``` + +where $LCP(S, T)$ is the length of the longest common prefix between strings $S$ and $T$. + +You are allowed to reorder the CSV columns. Let $p$ be a permutation of $\{1, 2, ..., M\}$. The reordered string for row $i$ becomes: + +```math +S'_i = \text{Concat}(\text{string}(A[i,p_1]), \text{string}(A[i,p_2]), \ldots, \text{string}(A[i,p_M])) +``` + +The goal is to choose a permutation $p$ that maximizes the overall KV-cache hit rate: + +```math +\max_p\; +\frac{\sum_{i=2}^N \max_{1 \le j < i} \text{LCP}(S'_i, S'_j)} + {\sum_{i=1}^N |S'_i|} +``` + + + + +Target +--- +Maximize prefix hit rate shown above (higher is better) + +- **Hard Constraint**: Average runtime per dataset must be $\leq 10$ seconds (score = 0 if exceeded) and correctly handle column merge constraint. 
+ +**Column Merges**: +- Column merge specs are provided per dataset +- Columns in each merge group are concatenated into a single column +- The merged column replaces the original columns +- Merge operations are applied before column reordering + +API Specification +--- +Implement a `Solution` class: + +```python +import pandas as pd + +class Solution: + def solve( + self, + df: pd.DataFrame, + early_stop: int = 100000, + row_stop: int = 4, + col_stop: int = 2, + col_merge: list = None, + one_way_dep: list = None, + distinct_value_threshold: float = 0.7, + parallel: bool = True, + ) -> pd.DataFrame: + """ + Reorder columns in the DataFrame to maximize prefix hit rate. + + Args: + df: Input DataFrame to optimize + early_stop: Early stopping parameter (default: 100000) + row_stop: Row stopping parameter (default: 4) + col_stop: Column stopping parameter (default: 2) + col_merge: List of column groups to merge (columns in each group are merged into one) + one_way_dep: List of one-way dependencies (not used in this variant) + distinct_value_threshold: Threshold for distinct values (default: 0.7) + parallel: Whether to use parallel processing (default: True) + + Returns: + DataFrame with reordered columns (same rows, different column order) + """ + # Your implementation + pass +``` + +**Evaluation Process**: +1. Column merges are applied if specified +2. Your `solve()` method reorders the remaining columns +3. 
Rows are concatenated (no spaces) and prefix hit rate is calculated + +Scoring (0-100) +--- + +baseline_hit_rate = Average prefix hit rate using original column order (0-point anchor) +avg_hit_rate = Your solution's average prefix hit rate across all datasets + +For each dataset: + dataset_score = ((hit_rate - baseline_hit_rate) / (1.0 - baseline_hit_rate)) × 100 + +final_score = Average of individual dataset scores + +Score is clamped to [0, 100] range + + +**Runtime Constraint**: +- Average runtime per dataset must be ≤ 10 seconds +- If average runtime exceeds 10 seconds, score = 0.0 + +**Scoring Examples**: +- baseline_hit_rate = 0.0 (worst), avg_hit_rate = 1.0 (perfect) → Score = 100 +- baseline_hit_rate = 0.5, avg_hit_rate = 0.5 → Score = 0 +- baseline_hit_rate = 0.5, avg_hit_rate = 0.75 → Score = 50 +- baseline_hit_rate = 0.5, avg_hit_rate = 1.0 → Score = 100 + +Implementation Notes +--- +- Row values are concatenated without spaces: `"".join(row.values)` +- Column reordering should optimize for maximum prefix overlap in the concatenated string representation +- Consider column dependencies, distinct value distributions, and merge requirements when reordering +- Large datasets with $M > 10$ columns require efficient algorithms due to larger search space +- In our smaller dataset, $15k \leq N \leq 28k$ and $4 \leq M \leq 9$ + +**Example input** +please ignore the $> 10$ column number here +--- +```csv +ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month +1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1 +2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1 +3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0 
+4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0 +5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0 +... +``` + +**Example output** +--- +``` +ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month +1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1 +2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1 +3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0 +4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0 +5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0 +``` +($p$ = $1, 2, \ldots M$) diff --git a/ttt/examples/frontier_cs_tasks/llm_sql__small/task.yaml b/ttt/examples/frontier_cs_tasks/llm_sql__small/task.yaml new file mode 100644 index 00000000..95aa1fd3 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/llm_sql__small/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: llm_sql (small) (TTT)" + description: | + Solve the 'llm_sql (small)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: llm_sql + variant_name: "small" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/llm_sql__small/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/eval/grader.py b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/opencode.json b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + 
} diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/solution.cpp b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/solution.cpp new file mode 100644 index 00000000..f4b3bcbf --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/solution.cpp @@ -0,0 +1,11 @@ +// Solution for Frontier-CS research problem +// Read the problem statement in statement.md for details. +// Implement your solution here. + +#include +using namespace std; + +int main() { + // TODO: Implement solution + return 0; +} diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/statement.md b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/statement.md new file mode 100644 index 00000000..73382112 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed/statement.md @@ -0,0 +1,260 @@ +N-Body Simulation Problem - 100,000 Particles +============================================= + +Problem Setting +--------------- +Design and optimize a high-performance parallel N-body simulation. In physics and astronomy, an N-body simulation models the dynamics of particles under gravitational forces. The available hardware is an AWS c7i.4xlarge. + +The challenge involves optimizing: +- **Loop parallelization**: Efficient parallel force computation across particles +- **Acceleration structures**: Use structures such as quad-tree for O(N log N) instead of O(N²), or other structures. +- **Load balancing**: Handling varying workloads per particle +- **Parallel Programming Libraries**: Proper use of libraries like OpenMP + +This variant tests performance on **100,000 particles** with 3 simulation iterations. 
+ +Target +------ +- **Primary**: Ensure numerical correctness (tolerance: 1e-2) +- **Secondary**: Maximize speedup over parallel brute-force baseline (higher is better) +- **Tertiary**: Use algorithmic improvements (quad-tree, spatial hashing) to beat O(N²) + +Solution Format +--------------- +Submit a single C++ file (`.cpp`) that implements a `Simulator` class: + +```cpp +#include "world.h" +#include + +class MySimulator : public Simulator { +private: + // Persistent state across simulation steps + int numThreads = 8; + // Could store acceleration structures, pre-allocated buffers, etc. + +public: + void init(int numParticles, StepParameters params) override { + // Called once before simulation starts + // Set thread count, pre-allocate structures, etc. + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + // Called each simulation step + // For each particle i: + // 1. Compute total force from particles within params.cullRadius + // 2. Update particle using updateParticle() + // 3. 
Store result in newParticles[i] + } +}; + +// Factory function - must be implemented +Simulator* createSimulator() { + return new MySimulator(); +} +``` + +Provided Types and Functions (in world.h) +----------------------------------------- +```cpp +struct Vec2 { + float x, y; + // Operators: +, -, *, length(), length2() +}; + +struct Particle { + int id; + float mass; + Vec2 position; + Vec2 velocity; +}; + +struct StepParameters { + float deltaTime = 0.2f; + float cullRadius = 1.0f; // Only consider particles within this distance +}; + +// Simulator base class +class Simulator { +public: + virtual ~Simulator() = default; + virtual void init(int numParticles, StepParameters params) {} // Optional + virtual void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) = 0; // Required +}; + +// Compute gravitational force between two particles +// Returns Vec2(0,0) if distance > cullRadius or distance < 1e-3 +inline Vec2 computeForce(const Particle &target, const Particle &attractor, + float cullRadius) { + auto dir = (attractor.position - target.position); + auto dist = dir.length(); + if (dist < 1e-3f) + return Vec2(0.0f, 0.0f); + dir *= (1.0f / dist); + if (dist > cullRadius) + return Vec2(0.0f, 0.0f); + if (dist < 1e-1f) + dist = 1e-1f; + const float G = 0.01f; + Vec2 force = dir * target.mass * attractor.mass * (G / (dist * dist)); + if (dist > cullRadius * 0.75f) { + float decay = 1.0f - (dist - cullRadius * 0.75f) / (cullRadius * 0.25f); + force *= decay; + } + return force; +} + +// Apply force to particle and integrate position/velocity +inline Particle updateParticle(const Particle &pi, Vec2 force, + float deltaTime) { + Particle result = pi; + result.velocity += force * (deltaTime / pi.mass); + result.position += result.velocity * deltaTime; + return result; +} +``` + +Baseline +-------- +The baseline is a simple OpenMP parallel brute-force O(N²) implementation: + +```cpp +// Baseline for N-body simulation - simple 
OpenMP parallel brute-force +// O(N²) approach with parallel outer loop +// Solutions should aim to beat this baseline + +#include "world.h" +#include + +class BaselineSimulator : public Simulator { +private: + int numThreads = 8; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + #pragma omp parallel for schedule(dynamic, 16) + for (int i = 0; i < (int)particles.size(); i++) { + auto pi = particles[i]; + Vec2 force = Vec2(0.0f, 0.0f); + + for (size_t j = 0; j < particles.size(); j++) { + if (j == (size_t)i) continue; + if ((pi.position - particles[j].position).length() < params.cullRadius) { + force += computeForce(pi, particles[j], params.cullRadius); + } + } + + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new BaselineSimulator(); +} +``` + +To beat the baseline, use algorithmic improvements like acceleration structures. + +Please generate a `.cpp` file that follows the solution's interface above, with the exact same +signatures. The `Simulator` you write will be used in the following way: + +```cpp +double runSimulation(World& world, Simulator* sim, + StepParameters params, int numIterations) { + Timer timer; + timer.reset(); + + // Initialize simulator at the start of each run (clean state) + sim->init(world.particles.size(), params); + + for (int iter = 0; iter < numIterations; iter++) { + world.newParticles.resize(world.particles.size()); + sim->simulateStep(world.particles, world.newParticles, params); + world.particles.swap(world.newParticles); + } + + return timer.elapsed(); +} +``` + +Compilation +----------- +Your code is compiled with: +```bash +g++ -O2 -fopenmp -std=c++17 -I. 
-o benchmark solution.cpp +``` + +Requirements: +- Can use OpenMP for parallelization +- Must implement a `Simulator` subclass and `createSimulator()` factory function +- May define additional helper classes/functions as needed +- Do NOT modify `computeForce` or `updateParticle` functions + +Correctness +----------- + +We will use the `BaselineSimulator` to get a reference particles positions and compare the solution you generated with the following code. We use a tolerance of `1e-2f`. If you fail the correctness check, you will get a score of zero. + +```cpp +bool checkForCorrectness(const World& refW, const World& w, float tolerance = 1e-2f) { + if (w.particles.size() != refW.particles.size()) { + std::cerr << "Mismatch: number of particles " << w.particles.size() + << " does not match reference " << refW.particles.size() << std::endl; + return false; + } + + for (size_t i = 0; i < w.particles.size(); i++) { + auto errorX = std::abs(w.particles[i].position.x - refW.particles[i].position.x); + auto errorY = std::abs(w.particles[i].position.y - refW.particles[i].position.y); + if (errorX > tolerance || errorY > tolerance) { + std::cerr << "Mismatch at index " << i + << ": result (" << w.particles[i].position.x << ", " + << w.particles[i].position.y << ")" + << " should be (" << refW.particles[i].position.x << ", " + << refW.particles[i].position.y << ")" << std::endl; + return false; + } + } + return true; +} +``` + +Scoring (0-100) +--------------- +Performance is measured by speedup over the parallel brute-force baseline: + +``` +speedup = baseline_time / solution_time +raw_score = min(speedup, 10.0) # Cap at 10x speedup +score = (raw_score - 1.0) / 9.0 * 100 # Map 1x-10x to 0-100 +``` + +- 0 points = No speedup (1x baseline performance) +- ~11 points = 2x speedup +- ~33 points = 4x speedup +- ~56 points = 6x speedup +- 100 points = 10x+ speedup + +Note: With 100k particles, algorithmic improvements can yield massive speedups. 
+The brute-force baseline is extremely slow, so good solutions should achieve high speedups. +Evaluation Details +------------------ +- Tested with 100,000 particles +- 3 simulation iterations +- Space size: 100.0, cullRadius: 25.0 +- Performance measured as median of 3 runs +- Correctness verified with tolerance: position error < 1e-2 +- Fixed random seed for reproducibility diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/task.yaml b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/task.yaml new file mode 100644 index 00000000..d1504793 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: nbody_simulation (random_100k) (TTT)" + description: | + Solve the 'nbody_simulation (random_100k)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.cpp`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 600s. + - Language: cpp. 
+ +grader: + timeout: 600 + direction: maximize + args: + problem_name: nbody_simulation + variant_name: "random_100k" + language: cpp + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/nbody_simulation__random_100k/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/eval/grader.py b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/opencode.json b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff 
--git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/solution.cpp b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/solution.cpp new file mode 100644 index 00000000..f4b3bcbf --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/solution.cpp @@ -0,0 +1,11 @@ +// Solution for Frontier-CS research problem +// Read the problem statement in statement.md for details. +// Implement your solution here. + +#include +using namespace std; + +int main() { + // TODO: Implement solution + return 0; +} diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/statement.md b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/statement.md new file mode 100644 index 00000000..61066c3b --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed/statement.md @@ -0,0 +1,257 @@ +N-Body Simulation Problem - 10,000 Particles +============================================= + +Problem Setting +--------------- +Design and optimize a high-performance parallel N-body simulation. In physics and astronomy, an N-body simulation models the dynamics of particles under gravitational forces. The available hardware is an AWS c7i.4xlarge. + +The challenge involves optimizing: +- **Loop parallelization**: Efficient parallel force computation across particles +- **Acceleration structures**: Use structures such as quad-tree for O(N log N) instead of O(N²), or other structures. +- **Load balancing**: Handling varying workloads per particle +- **Parallel Programming Libraries**: Proper use of libraries like OpenMP + +This variant tests performance on **10,000 particles** with 5 simulation iterations. 
+ +Target +------ +- **Primary**: Ensure numerical correctness (tolerance: 1e-2) +- **Secondary**: Maximize speedup over parallel brute-force baseline (higher is better) +- **Tertiary**: Use algorithmic improvements (quad-tree, spatial hashing) to beat O(N²) + +Solution Format +--------------- +Submit a single C++ file (`.cpp`) that implements a `Simulator` class: + +```cpp +#include "world.h" +#include + +class MySimulator : public Simulator { +private: + // Persistent state across simulation steps + int numThreads = 8; + // Could store acceleration structures, pre-allocated buffers, etc. + +public: + void init(int numParticles, StepParameters params) override { + // Called once before simulation starts + // Set thread count, pre-allocate structures, etc. + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + // Called each simulation step + // For each particle i: + // 1. Compute total force from particles within params.cullRadius + // 2. Update particle using updateParticle() + // 3. 
Store result in newParticles[i] + } +}; + +// Factory function - must be implemented +Simulator* createSimulator() { + return new MySimulator(); +} +``` + +Provided Types and Functions (in world.h) +----------------------------------------- +```cpp +struct Vec2 { + float x, y; + // Operators: +, -, *, length(), length2() +}; + +struct Particle { + int id; + float mass; + Vec2 position; + Vec2 velocity; +}; + +struct StepParameters { + float deltaTime = 0.2f; + float cullRadius = 1.0f; // Only consider particles within this distance +}; + +// Simulator base class +class Simulator { +public: + virtual ~Simulator() = default; + virtual void init(int numParticles, StepParameters params) {} // Optional + virtual void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) = 0; // Required +}; + +// Compute gravitational force between two particles +// Returns Vec2(0,0) if distance > cullRadius or distance < 1e-3 +inline Vec2 computeForce(const Particle &target, const Particle &attractor, + float cullRadius) { + auto dir = (attractor.position - target.position); + auto dist = dir.length(); + if (dist < 1e-3f) + return Vec2(0.0f, 0.0f); + dir *= (1.0f / dist); + if (dist > cullRadius) + return Vec2(0.0f, 0.0f); + if (dist < 1e-1f) + dist = 1e-1f; + const float G = 0.01f; + Vec2 force = dir * target.mass * attractor.mass * (G / (dist * dist)); + if (dist > cullRadius * 0.75f) { + float decay = 1.0f - (dist - cullRadius * 0.75f) / (cullRadius * 0.25f); + force *= decay; + } + return force; +} + +// Apply force to particle and integrate position/velocity +inline Particle updateParticle(const Particle &pi, Vec2 force, + float deltaTime) { + Particle result = pi; + result.velocity += force * (deltaTime / pi.mass); + result.position += result.velocity * deltaTime; + return result; +} +``` + +Baseline +-------- +The baseline is a simple OpenMP parallel brute-force O(N²) implementation: + +```cpp +// Baseline for N-body simulation - simple 
OpenMP parallel brute-force +// O(N²) approach with parallel outer loop +// Solutions should aim to beat this baseline + +#include "world.h" +#include + +class BaselineSimulator : public Simulator { +private: + int numThreads = 8; + +public: + void init(int numParticles, StepParameters params) override { + omp_set_num_threads(numThreads); + } + + void simulateStep(std::vector &particles, + std::vector &newParticles, + StepParameters params) override { + #pragma omp parallel for schedule(dynamic, 16) + for (int i = 0; i < (int)particles.size(); i++) { + auto pi = particles[i]; + Vec2 force = Vec2(0.0f, 0.0f); + + for (size_t j = 0; j < particles.size(); j++) { + if (j == (size_t)i) continue; + if ((pi.position - particles[j].position).length() < params.cullRadius) { + force += computeForce(pi, particles[j], params.cullRadius); + } + } + + newParticles[i] = updateParticle(pi, force, params.deltaTime); + } + } +}; + +Simulator* createSimulator() { + return new BaselineSimulator(); +} +``` + +To beat the baseline, use algorithmic improvements like acceleration structures. + +Please generate a `.cpp` file that follows the solution's interface above, with the exact same +signatures. The `Simulator` you write will be used in the following way: + +```cpp +double runSimulation(World& world, Simulator* sim, + StepParameters params, int numIterations) { + Timer timer; + timer.reset(); + + // Initialize simulator at the start of each run (clean state) + sim->init(world.particles.size(), params); + + for (int iter = 0; iter < numIterations; iter++) { + world.newParticles.resize(world.particles.size()); + sim->simulateStep(world.particles, world.newParticles, params); + world.particles.swap(world.newParticles); + } + + return timer.elapsed(); +} +``` + +Compilation +----------- +Your code is compiled with: +```bash +g++ -O2 -fopenmp -std=c++17 -I. 
-o benchmark solution.cpp +``` + +Requirements: +- Can use OpenMP for parallelization +- Must implement a `Simulator` subclass and `createSimulator()` factory function +- May define additional helper classes/functions as needed +- Do NOT modify `computeForce` or `updateParticle` functions + +Correctness +----------- + +We will use the `BaselineSimulator` to get a reference particles positions and compare the solution you generated with the following code. We use a tolerance of `1e-2f`. If you fail the correctness check, you will get a score of zero. + +```cpp +bool checkForCorrectness(const World& refW, const World& w, float tolerance = 1e-2f) { + if (w.particles.size() != refW.particles.size()) { + std::cerr << "Mismatch: number of particles " << w.particles.size() + << " does not match reference " << refW.particles.size() << std::endl; + return false; + } + + for (size_t i = 0; i < w.particles.size(); i++) { + auto errorX = std::abs(w.particles[i].position.x - refW.particles[i].position.x); + auto errorY = std::abs(w.particles[i].position.y - refW.particles[i].position.y); + if (errorX > tolerance || errorY > tolerance) { + std::cerr << "Mismatch at index " << i + << ": result (" << w.particles[i].position.x << ", " + << w.particles[i].position.y << ")" + << " should be (" << refW.particles[i].position.x << ", " + << refW.particles[i].position.y << ")" << std::endl; + return false; + } + } + return true; +} +``` + +Scoring (0-100) +--------------- +Performance is measured by speedup over the parallel brute-force baseline: + +``` +speedup = baseline_time / solution_time +raw_score = min(speedup, 3.0) # Cap at 3x speedup +score = (raw_score - 1.0) / 2.0 * 100 # Map 1x-3x to 0-100 +``` + +- 0 points = No speedup (1x baseline performance) +- 50 points = 2x speedup +- 100 points = 3x+ speedup + +Note: Since baseline is already parallelized, achieving speedup requires algorithmic improvements. 
+ +Evaluation Details +------------------ +- Tested with 10,000 particles +- 5 simulation iterations +- Space size: 100.0, cullRadius: 25.0 +- Performance measured as median of 3 runs +- Correctness verified with tolerance: position error < 1e-2 +- Fixed random seed for reproducibility diff --git a/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/task.yaml b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/task.yaml new file mode 100644 index 00000000..c5d91c30 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: nbody_simulation (random_10k) (TTT)" + description: | + Solve the 'nbody_simulation (random_10k)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 600s. + - Language: cpp. 
+ +grader: + timeout: 600 + direction: maximize + args: + problem_name: nbody_simulation + variant_name: "random_10k" + language: cpp + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/nbody_simulation__random_10k/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/eval/grader.py b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/opencode.json b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } 
+ } + } diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/solution.py b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/statement.md b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/statement.md new file mode 100644 index 00000000..6a6ae91f --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed/statement.md @@ -0,0 +1,201 @@ +Symbolic Regression Benchmark - McCormick Dataset +================================================= + +Problem Setting +--------------- +Learn a closed-form symbolic expression `f(x1, x2)` that predicts the target `y`. + +This dataset is derived from the McCormick function, a classic 2D optimization test function featuring a combination of trigonometric and polynomial terms. The function exhibits a smooth, wavy surface with a global minimum. 
+ +Input Format +------------ +- Your `Solution.solve` receives: + - `X`: numpy.ndarray of shape `(n, 2)` containing feature values + - `y`: numpy.ndarray of shape `(n,)` containing target values +- Dataset columns: `x1, x2, y` + +Output Specification +-------------------- +Implement a `Solution` class in `solution.py`: + +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + """ + Args: + X: Feature matrix of shape (n, 2) + y: Target values of shape (n,) + + Returns: + dict with keys: + - "expression": str, a Python-evaluable expression using x1, x2 + - "predictions": list/array of length n (optional) + - "details": dict with optional "complexity" int + """ + # Example: fit a symbolic expression to the data + expression = "x1 + x2" # placeholder + return { + "expression": expression, + "predictions": None, # will be computed from expression if omitted + "details": {} + } +``` + +Expression Requirements: +- Must be a valid Python expression string +- Use variable names: `x1`, `x2` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` +- Numeric constants are allowed + +Dependencies (pinned versions) +------------------------------ +``` +pysr==0.19.0 +numpy==1.26.4 +pandas==2.2.2 +sympy==1.13.3 +``` + +Minimal Working Examples +------------------------ + +**Example 1: Using PySR (recommended)** +```python +import numpy as np +from pysr import PySRRegressor + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + model = PySRRegressor( + niterations=40, + binary_operators=["+", "-", "*", "/"], + unary_operators=["sin", "cos", "exp", "log"], + populations=15, + population_size=33, + maxsize=25, + verbosity=0, + progress=False, + random_state=42, + ) + model.fit(X, y, variable_names=["x1", "x2"]) + + # Get best expression as sympy, convert to string + best_expr = 
model.sympy() + expression = str(best_expr) + + # Predictions + predictions = model.predict(X) + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +**Example 2: Manual expression (simple baseline)** +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + # Simple linear combination as baseline + x1, x2 = X[:, 0], X[:, 1] + + # Fit coefficients via least squares + A = np.column_stack([x1, x2, np.ones_like(x1)]) + coeffs, _, _, _ = np.linalg.lstsq(A, y, rcond=None) + a, b, c = coeffs + + expression = f"{a:.6f}*x1 + {b:.6f}*x2 + {c:.6f}" + predictions = a * x1 + b * x2 + c + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +**Example 3: Using sympy for expression manipulation** +```python +import numpy as np +import sympy as sp +from pysr import PySRRegressor + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + model = PySRRegressor( + niterations=30, + binary_operators=["+", "-", "*", "/"], + unary_operators=["sin", "cos"], + verbosity=0, + progress=False, + ) + model.fit(X, y, variable_names=["x1", "x2"]) + + # Get sympy expression and simplify + sympy_expr = model.sympy() + simplified = sp.simplify(sympy_expr) + + # Convert to evaluable string + expression = str(simplified) + + return { + "expression": expression, + "predictions": None, # evaluator will compute from expression + "details": {} + } +``` + +PySR API Notes (v0.19.0) +------------------------ +- `model.fit(X, y, variable_names=["x1", "x2"])` - use variable_names to match expected output +- `model.sympy()` - returns best expression as sympy object +- `model.predict(X)` - returns predictions array +- `model.equations_` - DataFrame of all discovered equations +- Common parameters: + - `niterations`: number of evolution iterations (more = 
better but slower) + - `populations`: number of parallel populations + - `maxsize`: maximum expression complexity + - `verbosity=0, progress=False`: suppress output + +Expression Format Requirements +------------------------------ +- Must be a valid Python expression string +- Use variable names: `x1`, `x2` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` (NO `np.` prefix) +- Numeric constants are allowed +- The evaluator uses `sympy.sympify()` to parse your expression + +Scoring +------- +``` +MSE = (1/n) Σ (y_i - ŷ_i)² +Score = 100 × clamp((m_base - MSE) / (m_base - m_ref), 0, 1) × 0.99^max(C - C_ref, 0) +``` + +- `m_base`: linear regression baseline MSE +- `m_ref`, `C_ref`: reference solution MSE and complexity +- `C = 2 × (#binary ops) + (#unary ops)` +- Lower MSE and lower complexity yield higher scores + +Environment +----------- +Run `set_up_env.sh` to install dependencies. diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/task.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/task.yaml new file mode 100644 index 00000000..baafcb31 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: symbolic_regression (mccormick) (TTT)" + description: | + Solve the 'symbolic_regression (mccormick)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: symbolic_regression + variant_name: "mccormick" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/symbolic_regression__mccormick/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/eval/grader.py b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/opencode.json b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + 
"input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/solution.py b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/statement.md b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/statement.md new file mode 100644 index 00000000..54c34ea6 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed/statement.md @@ -0,0 +1,167 @@ +Symbolic Regression Benchmark - Mixed PolyExp 4D Dataset +========================================================= + +Problem Setting +--------------- +Learn a closed-form symbolic expression `f(x1, x2, x3, x4)` that predicts the target `y`. + +This is a higher-dimensional dataset (4 input features) combining polynomial interactions with exponential decay. The function involves cross-terms between variables and Gaussian-like damping, making it more challenging than the 2D variants. 
+ +Input Format +------------ +- Your `Solution.solve` receives: + - `X`: numpy.ndarray of shape `(n, 4)` containing feature values + - `y`: numpy.ndarray of shape `(n,)` containing target values +- Dataset columns: `x1, x2, x3, x4, y` + +Output Specification +-------------------- +Implement a `Solution` class in `solution.py`: + +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + """ + Args: + X: Feature matrix of shape (n, 4) + y: Target values of shape (n,) + + Returns: + dict with keys: + - "expression": str, a Python-evaluable expression using x1, x2, x3, x4 + - "predictions": list/array of length n (optional) + - "details": dict with optional "complexity" int + """ + # Example: fit a symbolic expression to the data + expression = "x1 + x2 + x3 + x4" # placeholder + return { + "expression": expression, + "predictions": None, # will be computed from expression if omitted + "details": {} + } +``` + +Expression Requirements: +- Must be a valid Python expression string +- Use variable names: `x1`, `x2`, `x3`, `x4` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` +- Numeric constants are allowed + +Dependencies (pinned versions) +------------------------------ +``` +pysr==0.19.0 +numpy==1.26.4 +pandas==2.2.2 +sympy==1.13.3 +``` + +Minimal Working Examples +------------------------ + +**Example 1: Using PySR (recommended)** +```python +import numpy as np +from pysr import PySRRegressor + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + model = PySRRegressor( + niterations=50, # more iterations for 4D + binary_operators=["+", "-", "*", "/"], + unary_operators=["sin", "cos", "exp", "log"], + populations=20, + population_size=40, + maxsize=30, # larger for 4D complexity + verbosity=0, + progress=False, + random_state=42, + ) + model.fit(X, y, 
variable_names=["x1", "x2", "x3", "x4"]) + + # Get best expression as sympy, convert to string + best_expr = model.sympy() + expression = str(best_expr) + + # Predictions + predictions = model.predict(X) + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +**Example 2: Manual expression (simple baseline)** +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + # Simple linear combination as baseline + x1, x2, x3, x4 = X[:, 0], X[:, 1], X[:, 2], X[:, 3] + + # Fit coefficients via least squares + A = np.column_stack([x1, x2, x3, x4, np.ones_like(x1)]) + coeffs, _, _, _ = np.linalg.lstsq(A, y, rcond=None) + a, b, c, d, e = coeffs + + expression = f"{a:.6f}*x1 + {b:.6f}*x2 + {c:.6f}*x3 + {d:.6f}*x4 + {e:.6f}" + predictions = a * x1 + b * x2 + c * x3 + d * x4 + e + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +PySR API Notes (v0.19.0) +------------------------ +- `model.fit(X, y, variable_names=["x1", "x2", "x3", "x4"])` - use variable_names to match expected output +- `model.sympy()` - returns best expression as sympy object +- `model.predict(X)` - returns predictions array +- `model.equations_` - DataFrame of all discovered equations +- Common parameters: + - `niterations`: number of evolution iterations (more = better but slower) + - `populations`: number of parallel populations + - `maxsize`: maximum expression complexity + - `verbosity=0, progress=False`: suppress output + +Expression Format Requirements +------------------------------ +- Must be a valid Python expression string +- Use variable names: `x1`, `x2`, `x3`, `x4` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` (NO `np.` prefix) +- Numeric constants are allowed +- The evaluator uses `sympy.sympify()` to parse your expression + +Scoring +------- +``` 
+MSE = (1/n) Σ (y_i - ŷ_i)² +Score = 100 × clamp((m_base - MSE) / (m_base - m_ref), 0, 1) × 0.99^max(C - C_ref, 0) +``` + +- `m_base`: linear regression baseline MSE +- `m_ref`, `C_ref`: reference solution MSE and complexity +- `C = 2 × (#binary ops) + (#unary ops)` +- Lower MSE and lower complexity yield higher scores + +Environment +----------- +Run `set_up_env.sh` to install dependencies. diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/task.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/task.yaml new file mode 100644 index 00000000..475c4fae --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: symbolic_regression (mixed_polyexp_4d) (TTT)" + description: | + Solve the 'symbolic_regression (mixed_polyexp_4d)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. 
+ +grader: + timeout: 1800 + direction: maximize + args: + problem_name: symbolic_regression + variant_name: "mixed_polyexp_4d" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/symbolic_regression__mixed_polyexp_4d/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/eval/grader.py b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/opencode.json b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/solution.py b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/statement.md b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/statement.md new file mode 100644 index 00000000..ed3929c0 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed/statement.md @@ -0,0 +1,167 @@ +Symbolic Regression Benchmark - Peaks Dataset +============================================== + +Problem Setting +--------------- +Learn a closed-form symbolic expression `f(x1, x2)` that predicts the target `y`. + +This dataset is based on a peaks-like function, characterized by exponential terms that create localized peaks and valleys across the 2D input space. The underlying function involves interactions between polynomial and exponential components. 
+ +Input Format +------------ +- Your `Solution.solve` receives: + - `X`: numpy.ndarray of shape `(n, 2)` containing feature values + - `y`: numpy.ndarray of shape `(n,)` containing target values +- Dataset columns: `x1, x2, y` + +Output Specification +-------------------- +Implement a `Solution` class in `solution.py`: + +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + """ + Args: + X: Feature matrix of shape (n, 2) + y: Target values of shape (n,) + + Returns: + dict with keys: + - "expression": str, a Python-evaluable expression using x1, x2 + - "predictions": list/array of length n (optional) + - "details": dict with optional "complexity" int + """ + # Example: fit a symbolic expression to the data + expression = "x1 + x2" # placeholder + return { + "expression": expression, + "predictions": None, # will be computed from expression if omitted + "details": {} + } +``` + +Expression Requirements: +- Must be a valid Python expression string +- Use variable names: `x1`, `x2` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` +- Numeric constants are allowed + +Dependencies (pinned versions) +------------------------------ +``` +pysr==0.19.0 +numpy==1.26.4 +pandas==2.2.2 +sympy==1.13.3 +``` + +Minimal Working Examples +------------------------ + +**Example 1: Using PySR (recommended)** +```python +import numpy as np +from pysr import PySRRegressor + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + model = PySRRegressor( + niterations=40, + binary_operators=["+", "-", "*", "/"], + unary_operators=["sin", "cos", "exp", "log"], + populations=15, + population_size=33, + maxsize=25, + verbosity=0, + progress=False, + random_state=42, + ) + model.fit(X, y, variable_names=["x1", "x2"]) + + # Get best expression as sympy, convert to string + best_expr = 
model.sympy() + expression = str(best_expr) + + # Predictions + predictions = model.predict(X) + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +**Example 2: Manual expression (simple baseline)** +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + # Simple linear combination as baseline + x1, x2 = X[:, 0], X[:, 1] + + # Fit coefficients via least squares + A = np.column_stack([x1, x2, np.ones_like(x1)]) + coeffs, _, _, _ = np.linalg.lstsq(A, y, rcond=None) + a, b, c = coeffs + + expression = f"{a:.6f}*x1 + {b:.6f}*x2 + {c:.6f}" + predictions = a * x1 + b * x2 + c + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +PySR API Notes (v0.19.0) +------------------------ +- `model.fit(X, y, variable_names=["x1", "x2"])` - use variable_names to match expected output +- `model.sympy()` - returns best expression as sympy object +- `model.predict(X)` - returns predictions array +- `model.equations_` - DataFrame of all discovered equations +- Common parameters: + - `niterations`: number of evolution iterations (more = better but slower) + - `populations`: number of parallel populations + - `maxsize`: maximum expression complexity + - `verbosity=0, progress=False`: suppress output + +Expression Format Requirements +------------------------------ +- Must be a valid Python expression string +- Use variable names: `x1`, `x2` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` (NO `np.` prefix) +- Numeric constants are allowed +- The evaluator uses `sympy.sympify()` to parse your expression + +Scoring +------- +``` +MSE = (1/n) Σ (y_i - ŷ_i)² +Score = 100 × clamp((m_base - MSE) / (m_base - m_ref), 0, 1) × 0.99^max(C - C_ref, 0) +``` + +- `m_base`: linear regression baseline MSE +- `m_ref`, `C_ref`: reference solution MSE and 
complexity +- `C = 2 × (#binary ops) + (#unary ops)` +- Lower MSE and lower complexity yield higher scores + +Environment +----------- +Run `set_up_env.sh` to install dependencies. diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/task.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/task.yaml new file mode 100644 index 00000000..f6305b83 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: symbolic_regression (peaks) (TTT)" + description: | + Solve the 'symbolic_regression (peaks)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: symbolic_regression + variant_name: "peaks" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/symbolic_regression__peaks/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/eval/grader.py b/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. 
+ +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val 
is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/seed/opencode.json b/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + 
class Solution:
    """Placeholder entry point for a Frontier-CS research problem.

    The full task description, the required `solve` interface, and the
    scoring rules live in the accompanying statement.md.
    """

    def solve(self, *args, **kwargs):
        """Solve the task as specified in statement.md; not yet implemented."""
        raise NotImplementedError("Implement this method")
+ +Input Format +------------ +- Your `Solution.solve` receives: + - `X`: numpy.ndarray of shape `(n, 2)` containing feature values + - `y`: numpy.ndarray of shape `(n,)` containing target values +- Dataset columns: `x1, x2, y` + +Output Specification +-------------------- +Implement a `Solution` class in `solution.py`: + +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + """ + Args: + X: Feature matrix of shape (n, 2) + y: Target values of shape (n,) + + Returns: + dict with keys: + - "expression": str, a Python-evaluable expression using x1, x2 + - "predictions": list/array of length n (optional) + - "details": dict with optional "complexity" int + """ + # Example: fit a symbolic expression to the data + expression = "x1 + x2" # placeholder + return { + "expression": expression, + "predictions": None, # will be computed from expression if omitted + "details": {} + } +``` + +Expression Requirements: +- Must be a valid Python expression string +- Use variable names: `x1`, `x2` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` +- Numeric constants are allowed + +Dependencies (pinned versions) +------------------------------ +``` +pysr==0.19.0 +numpy==1.26.4 +pandas==2.2.2 +sympy==1.13.3 +``` + +Minimal Working Examples +------------------------ + +**Example 1: Using PySR (recommended)** +```python +import numpy as np +from pysr import PySRRegressor + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + model = PySRRegressor( + niterations=40, + binary_operators=["+", "-", "*", "/"], + unary_operators=["sin", "cos", "exp", "log"], + populations=15, + population_size=33, + maxsize=25, + verbosity=0, + progress=False, + random_state=42, + ) + model.fit(X, y, variable_names=["x1", "x2"]) + + # Get best expression as sympy, convert to string + best_expr = 
model.sympy() + expression = str(best_expr) + + # Predictions + predictions = model.predict(X) + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +**Example 2: Manual expression (simple baseline)** +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + # Simple linear combination as baseline + x1, x2 = X[:, 0], X[:, 1] + + # Fit coefficients via least squares + A = np.column_stack([x1, x2, np.ones_like(x1)]) + coeffs, _, _, _ = np.linalg.lstsq(A, y, rcond=None) + a, b, c = coeffs + + expression = f"{a:.6f}*x1 + {b:.6f}*x2 + {c:.6f}" + predictions = a * x1 + b * x2 + c + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +PySR API Notes (v0.19.0) +------------------------ +- `model.fit(X, y, variable_names=["x1", "x2"])` - use variable_names to match expected output +- `model.sympy()` - returns best expression as sympy object +- `model.predict(X)` - returns predictions array +- `model.equations_` - DataFrame of all discovered equations +- Common parameters: + - `niterations`: number of evolution iterations (more = better but slower) + - `populations`: number of parallel populations + - `maxsize`: maximum expression complexity + - `verbosity=0, progress=False`: suppress output + +Expression Format Requirements +------------------------------ +- Must be a valid Python expression string +- Use variable names: `x1`, `x2` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` (NO `np.` prefix) +- Numeric constants are allowed +- The evaluator uses `sympy.sympify()` to parse your expression + +Scoring +------- +``` +MSE = (1/n) Σ (y_i - ŷ_i)² +Score = 100 × clamp((m_base - MSE) / (m_base - m_ref), 0, 1) × 0.99^max(C - C_ref, 0) +``` + +- `m_base`: linear regression baseline MSE +- `m_ref`, `C_ref`: reference solution MSE and 
complexity +- `C = 2 × (#binary ops) + (#unary ops)` +- Lower MSE and lower complexity yield higher scores + +Environment +----------- +Run `set_up_env.sh` to install dependencies. diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/task.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/task.yaml new file mode 100644 index 00000000..59ca6923 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: symbolic_regression (ripple) (TTT)" + description: | + Solve the 'symbolic_regression (ripple)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: symbolic_regression + variant_name: "ripple" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/symbolic_regression__ripple/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/eval/grader.py b/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. 
+ +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. +""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val 
is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/seed/opencode.json b/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + 
class Solution:
    """Placeholder entry point for a Frontier-CS research problem.

    The full task description, the required `solve` interface, and the
    scoring rules live in the accompanying statement.md.
    """

    def solve(self, *args, **kwargs):
        """Solve the task as specified in statement.md; not yet implemented."""
        raise NotImplementedError("Implement this method")
+ +Input Format +------------ +- Your `Solution.solve` receives: + - `X`: numpy.ndarray of shape `(n, 2)` containing feature values + - `y`: numpy.ndarray of shape `(n,)` containing target values +- Dataset columns: `x1, x2, y` + +Output Specification +-------------------- +Implement a `Solution` class in `solution.py`: + +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + """ + Args: + X: Feature matrix of shape (n, 2) + y: Target values of shape (n,) + + Returns: + dict with keys: + - "expression": str, a Python-evaluable expression using x1, x2 + - "predictions": list/array of length n (optional) + - "details": dict with optional "complexity" int + """ + # Example: fit a symbolic expression to the data + expression = "x1 + x2" # placeholder + return { + "expression": expression, + "predictions": None, # will be computed from expression if omitted + "details": {} + } +``` + +Expression Requirements: +- Must be a valid Python expression string +- Use variable names: `x1`, `x2` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` +- Numeric constants are allowed + +Dependencies (pinned versions) +------------------------------ +``` +pysr==0.19.0 +numpy==1.26.4 +pandas==2.2.2 +sympy==1.13.3 +``` + +Minimal Working Examples +------------------------ + +**Example 1: Using PySR (recommended)** +```python +import numpy as np +from pysr import PySRRegressor + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + model = PySRRegressor( + niterations=40, + binary_operators=["+", "-", "*", "/"], + unary_operators=["sin", "cos", "exp", "log"], + populations=15, + population_size=33, + maxsize=25, + verbosity=0, + progress=False, + random_state=42, + ) + model.fit(X, y, variable_names=["x1", "x2"]) + + # Get best expression as sympy, convert to string + best_expr = 
model.sympy() + expression = str(best_expr) + + # Predictions + predictions = model.predict(X) + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +**Example 2: Manual expression (simple baseline)** +```python +import numpy as np + +class Solution: + def __init__(self, **kwargs): + pass + + def solve(self, X: np.ndarray, y: np.ndarray) -> dict: + # Simple linear combination as baseline + x1, x2 = X[:, 0], X[:, 1] + + # Fit coefficients via least squares + A = np.column_stack([x1, x2, np.ones_like(x1)]) + coeffs, _, _, _ = np.linalg.lstsq(A, y, rcond=None) + a, b, c = coeffs + + expression = f"{a:.6f}*x1 + {b:.6f}*x2 + {c:.6f}" + predictions = a * x1 + b * x2 + c + + return { + "expression": expression, + "predictions": predictions.tolist(), + "details": {} + } +``` + +PySR API Notes (v0.19.0) +------------------------ +- `model.fit(X, y, variable_names=["x1", "x2"])` - use variable_names to match expected output +- `model.sympy()` - returns best expression as sympy object +- `model.predict(X)` - returns predictions array +- `model.equations_` - DataFrame of all discovered equations +- Common parameters: + - `niterations`: number of evolution iterations (more = better but slower) + - `populations`: number of parallel populations + - `maxsize`: maximum expression complexity + - `verbosity=0, progress=False`: suppress output + +Expression Format Requirements +------------------------------ +- Must be a valid Python expression string +- Use variable names: `x1`, `x2` +- Allowed operators: `+`, `-`, `*`, `/`, `**` +- Allowed functions: `sin`, `cos`, `exp`, `log` (NO `np.` prefix) +- Numeric constants are allowed +- The evaluator uses `sympy.sympify()` to parse your expression + +Scoring +------- +``` +MSE = (1/n) Σ (y_i - ŷ_i)² +Score = 100 × clamp((m_base - MSE) / (m_base - m_ref), 0, 1) × 0.99^max(C - C_ref, 0) +``` + +- `m_base`: linear regression baseline MSE +- `m_ref`, `C_ref`: reference solution MSE and 
complexity +- `C = 2 × (#binary ops) + (#unary ops)` +- Lower MSE and lower complexity yield higher scores + +Environment +----------- +Run `set_up_env.sh` to install dependencies. diff --git a/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/task.yaml b/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/task.yaml new file mode 100644 index 00000000..d2c2d1aa --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: symbolic_regression (sincos) (TTT)" + description: | + Solve the 'symbolic_regression (sincos)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 1800s. + - Language: python. + +grader: + timeout: 1800 + direction: maximize + args: + problem_name: symbolic_regression + variant_name: "sincos" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/symbolic_regression__sincos/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/eval/grader.py b/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/seed/opencode.json b/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
class Solution:
    """Placeholder entry point for a Frontier-CS research problem.

    The full task description, the required `solve` interface, and the
    scoring rules live in the accompanying statement.md.
    """

    def solve(self, *args, **kwargs):
        """Solve the task as specified in statement.md; not yet implemented."""
        raise NotImplementedError("Implement this method")
+ +**Optimization Goal**: Maximize recall@1 within latency constraint + +$$ +\text{score} = \begin{cases} +0 & \text{if } t_{\text{query}} > t_{\text{max}} \\ +100 & \text{if } t_{\text{query}} \leq t_{\text{max}} \text{ and } r \geq r_{\text{baseline}} \\ +100 \cdot \frac{r - r_{\text{min}}}{r_{\text{baseline}} - r_{\text{min}}} & \text{if } t_{\text{query}} \leq t_{\text{max}} \text{ and } r < r_{\text{baseline}} +\end{cases} +$$ + +Where: +- $r$: Your recall@1 +- $t_{\text{query}}$: Your average query latency (ms) +- $r_{\text{baseline}} = 0.9914$ (baseline recall) +- $r_{\text{min}} = 0.6939$ (minimum acceptable recall, 70% of baseline) +- $t_{\text{max}} = 5.775\text{ms}$ (maximum allowed latency, 150% of baseline 3.85ms) + +**Key Insight**: Latency is a hard constraint. Only recall determines your score within the constraint. + +Baseline Performance +-------------------- +- Recall@1: **0.9914** (99.14%) +- Avg query time: **3.85ms** +- Baseline score: **100** (recall equals baseline within latency constraint) + +Scoring Examples +---------------- +Assuming all solutions meet latency constraint ($t \leq 5.775\text{ms}$): + +| Recall@1 | Latency | Score Calculation | Score | +|----------|---------|-------------------|-------| +| 0.9914 | 3.85ms | $r = r_{\text{baseline}}$ → max score | **100** | +| 0.9950 | 3.00ms | $r > r_{\text{baseline}}$ → max score | **100** | +| 0.9500 | 2.50ms | $\frac{0.95 - 0.6939}{0.9914 - 0.6939} = 0.860$ | **86.0** | +| 0.8500 | 4.00ms | $\frac{0.85 - 0.6939}{0.9914 - 0.6939} = 0.524$ | **52.4** | +| 0.6939 | 5.00ms | $r = r_{\text{min}}$ → minimum score | **0** | +| 0.9900 | **6.00ms** | $t > t_{\text{max}}$ → latency gate fails | **0** | + +**Note**: Faster latency does NOT increase score - only recall matters if constraint is met. 
+ +API Specification +----------------- +Implement a class with the following interface: + +```python +import numpy as np +from typing import Tuple + +class YourIndexClass: + def __init__(self, dim: int, **kwargs): + """ + Initialize the index for vectors of dimension `dim`. + + Args: + dim: Vector dimensionality (e.g., 128 for SIFT1M) + **kwargs: Optional parameters (e.g., M, ef_construction for HNSW) + + Example: + index = YourIndexClass(dim=128, M=16, ef_search=64) + """ + pass + + def add(self, xb: np.ndarray) -> None: + """ + Add vectors to the index. + + Args: + xb: Base vectors, shape (N, dim), dtype float32 + + Notes: + - Can be called multiple times (cumulative) + - Must handle large N (e.g., 1,000,000 vectors) + + Example: + index.add(xb) # xb.shape = (1000000, 128) + """ + pass + + def search(self, xq: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]: + """ + Search for k nearest neighbors of query vectors. + + Args: + xq: Query vectors, shape (nq, dim), dtype float32 + k: Number of nearest neighbors to return + + Returns: + (distances, indices): + - distances: shape (nq, k), dtype float32, L2 distances + - indices: shape (nq, k), dtype int64, indices into base vectors + + Notes: + - Must return exactly k neighbors per query + - Indices should refer to positions in the vectors passed to add() + - Lower distance = more similar + + Example: + D, I = index.search(xq, k=1) # xq.shape = (10000, 128) + # D.shape = (10000, 1), I.shape = (10000, 1) + """ + pass +``` + +**Implementation Requirements**: +- Class can have any name (evaluator auto-discovers classes with `add` and `search` methods) +- Must handle SIFT1M dataset: 1M base vectors, 10K queries, 128 dimensions +- Your `search` must return tuple `(distances, indices)` with shapes `(nq, k)` +- Distances should be L2 (Euclidean) or L2-squared +- No need to handle dataset loading - evaluator provides numpy arrays + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. 
Load Dataset +```python +from faiss.contrib.datasets import DatasetSIFT1M +ds = DatasetSIFT1M() +xb = ds.get_database() # (1000000, 128) float32 +xq = ds.get_queries() # (10000, 128) float32 +gt = ds.get_groundtruth() # (10000, 100) int64 - ground truth indices +``` + +### 2. Build Index +```python +from solution import YourIndexClass # Auto-discovered +d = xb.shape[1] # 128 for SIFT1M +index = YourIndexClass(d) # Pass dimension as first argument +index.add(xb) # Add all 1M base vectors +``` + +### 3. Measure Performance (Batch Queries) +```python +import time +t0 = time.time() +D, I = index.search(xq, k=1) # Search all 10K queries at once +t1 = time.time() + +# Calculate metrics +recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / len(xq) +avg_query_time_ms = (t1 - t0) * 1000.0 / len(xq) +``` + +**Important**: `avg_query_time_ms` from **batch queries** is used for scoring. Batch queries benefit from CPU cache and vectorization, typically faster than single queries. + +### 4. Calculate Score +```python +if avg_query_time_ms > 5.775: + score = 0.0 +elif recall_at_1 >= 0.9914: + score = 100.0 +else: + recall_range = 0.9914 - 0.6939 + recall_proportion = (recall_at_1 - 0.6939) / recall_range + score = max(0.0, min(100.0, 100.0 * recall_proportion)) +``` + +Dataset Details +--------------- +- **Name**: SIFT1M +- **Base vectors**: 1,000,000 vectors of dimension 128 +- **Query vectors**: 10,000 vectors +- **Ground truth**: Precomputed nearest neighbors (k=1) +- **Metric**: L2 (Euclidean distance) +- **Vector type**: float32 + +Runtime Platform +---------------- +- **Infrastructure**: Evaluations run on SkyPilot-managed cloud instances (AWS, GCP, or Azure) +- **Compute**: CPU-only instances (no GPU required) +- **Environment**: Docker containerized execution with Python 3, NumPy ≥1.24, FAISS-CPU ≥1.7.4 + +Constraints +----------- +- **Timeout**: 1 hour for entire evaluation (index construction + queries) +- **Memory**: Use reasonable memory (index should fit in RAM) +- 
**Latency constraint**: avg_query_time_ms ≤ 5.775ms +- **Recall range**: 0.6939 ≤ recall@1 ≤ 1.0 + +Strategy Tips +------------- +1. **Focus on recall**: Latency only needs to meet threshold, doesn't improve score beyond that +2. **Batch optimization is key**: Your `search` should handle batch queries efficiently +3. **Parameter tuning**: Small changes (e.g., HNSW's M, ef_search) significantly affect recall +4. **Don't over-optimize latency**: Meeting 5.775ms is enough; focus energy on recall + +Example: Simple Baseline +------------------------- +```python +import numpy as np + +class SimpleIndex: + def __init__(self, dim: int, **kwargs): + self.dim = dim + self.xb = None + + def add(self, xb: np.ndarray) -> None: + if self.xb is None: + self.xb = xb.copy() + else: + self.xb = np.vstack([self.xb, xb]) + + def search(self, xq: np.ndarray, k: int) -> tuple: + # Compute all pairwise L2 distances + # xq: (nq, dim), xb: (N, dim) + # distances: (nq, N) + distances = np.sqrt(((xq[:, np.newaxis, :] - self.xb[np.newaxis, :, :]) ** 2).sum(axis=2)) + + # Get k nearest neighbors + indices = np.argpartition(distances, k-1, axis=1)[:, :k] + sorted_indices = np.argsort(distances[np.arange(len(xq))[:, None], indices], axis=1) + final_indices = indices[np.arange(len(xq))[:, None], sorted_indices] + final_distances = distances[np.arange(len(xq))[:, None], final_indices] + + return final_distances, final_indices +``` + +**Note**: This baseline achieves perfect recall (100%) but is too slow for large datasets. Use approximate methods like HNSW, IVF, or LSH for better speed-recall tradeoffs. 
+ +Debugging Tips +-------------- +- **Test locally**: Use a subset of data (e.g., 10K vectors) for faster iteration +- **Verify shapes**: Ensure `search` returns `(nq, k)` shaped arrays +- **Check recall calculation**: `(I[:, :1] == gt[:, :1]).sum() / len(xq)` +- **Profile latency**: Measure batch vs single query performance separately +- **Validate before submit**: Run full 1M dataset locally if possible diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/task.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/task.yaml new file mode 100644 index 00000000..3dc5b3fc --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: vdb_pareto (balanced) (TTT)" + description: | + Solve the 'vdb_pareto (balanced)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: vdb_pareto + variant_name: "balanced" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/vdb_pareto__balanced/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/eval/grader.py b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/opencode.json b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/solution.py b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/statement.md b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/statement.md new file mode 100644 index 00000000..38e79428 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed/statement.md @@ -0,0 +1,233 @@ +VDB Design Problem - High Recall Tier +====================================== + +Problem Setting +--------------- +Design a Vector Database index optimized for **recall** subject to a **relaxed latency constraint**. This tier uses latency-gated scoring: solutions exceeding the latency threshold receive zero points, while solutions meeting the constraint are scored purely by recall@1. 
+ +**Optimization Goal**: Maximize recall@1 within latency constraint + +$$ +\text{score} = \begin{cases} +0 & \text{if } t_{\text{query}} > t_{\text{max}} \\ +100 & \text{if } t_{\text{query}} \leq t_{\text{max}} \text{ and } r \geq r_{\text{baseline}} \\ +100 \cdot \frac{r - r_{\text{min}}}{r_{\text{baseline}} - r_{\text{min}}} & \text{if } t_{\text{query}} \leq t_{\text{max}} \text{ and } r < r_{\text{baseline}} +\end{cases} +$$ + +Where: +- $r$: Your recall@1 +- $t_{\text{query}}$: Your average query latency (ms) +- $r_{\text{baseline}} = 0.9914$ (baseline recall) +- $r_{\text{min}} = 0.9409$ (minimum acceptable recall, 95% of baseline) +- $t_{\text{max}} = 7.7\text{ms}$ (maximum allowed latency, 200% of baseline 3.85ms) + +**Key Insight**: This tier provides 2× latency budget compared to balanced tier, allowing more thorough search for higher recall. + +Baseline Performance +-------------------- +- Recall@1: **0.9914** (99.14%) +- Avg query time: **3.85ms** +- Baseline score: **100** (recall equals baseline within latency constraint) + +Scoring Examples +---------------- +Assuming all solutions meet latency constraint ($t \leq 7.7\text{ms}$): + +| Recall@1 | Latency | Score Calculation | Score | +|----------|---------|-------------------|-------| +| 0.9914 | 3.85ms | $r = r_{\text{baseline}}$ → max score | **100** | +| 0.9950 | 5.00ms | $r > r_{\text{baseline}}$ → max score | **100** | +| 0.9700 | 6.00ms | $\frac{0.97 - 0.9409}{0.9914 - 0.9409} = 0.576$ | **57.6** | +| 0.9500 | 4.00ms | $\frac{0.95 - 0.9409}{0.9914 - 0.9409} = 0.180$ | **18.0** | +| 0.9409 | 7.00ms | $r = r_{\text{min}}$ → minimum score | **0** | +| 0.9914 | **8.00ms** | $t > t_{\text{max}}$ → latency gate fails | **0** | + +**Note**: The relaxed latency constraint (7.7ms vs 5.775ms in balanced) allows more aggressive search strategies for higher recall. 
+ +API Specification +----------------- +Implement a class with the following interface: + +```python +import numpy as np +from typing import Tuple + +class YourIndexClass: + def __init__(self, dim: int, **kwargs): + """ + Initialize the index for vectors of dimension `dim`. + + Args: + dim: Vector dimensionality (e.g., 128 for SIFT1M) + **kwargs: Optional parameters (e.g., M, ef_construction for HNSW) + + Example: + index = YourIndexClass(dim=128, M=64, ef_search=800) + """ + pass + + def add(self, xb: np.ndarray) -> None: + """ + Add vectors to the index. + + Args: + xb: Base vectors, shape (N, dim), dtype float32 + + Notes: + - Can be called multiple times (cumulative) + - Must handle large N (e.g., 1,000,000 vectors) + + Example: + index.add(xb) # xb.shape = (1000000, 128) + """ + pass + + def search(self, xq: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]: + """ + Search for k nearest neighbors of query vectors. + + Args: + xq: Query vectors, shape (nq, dim), dtype float32 + k: Number of nearest neighbors to return + + Returns: + (distances, indices): + - distances: shape (nq, k), dtype float32, L2 distances + - indices: shape (nq, k), dtype int64, indices into base vectors + + Notes: + - Must return exactly k neighbors per query + - Indices should refer to positions in the vectors passed to add() + - Lower distance = more similar + + Example: + D, I = index.search(xq, k=1) # xq.shape = (10000, 128) + # D.shape = (10000, 1), I.shape = (10000, 1) + """ + pass +``` + +**Implementation Requirements**: +- Class can have any name (evaluator auto-discovers classes with `add` and `search` methods) +- Must handle SIFT1M dataset: 1M base vectors, 10K queries, 128 dimensions +- Your `search` must return tuple `(distances, indices)` with shapes `(nq, k)` +- Distances should be L2 (Euclidean) or L2-squared +- No need to handle dataset loading - evaluator provides numpy arrays + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. 
Load Dataset +```python +from faiss.contrib.datasets import DatasetSIFT1M +ds = DatasetSIFT1M() +xb = ds.get_database() # (1000000, 128) float32 +xq = ds.get_queries() # (10000, 128) float32 +gt = ds.get_groundtruth() # (10000, 100) int64 - ground truth indices +``` + +### 2. Build Index +```python +from solution import YourIndexClass # Auto-discovered +d = xb.shape[1] # 128 for SIFT1M +index = YourIndexClass(d) # Pass dimension as first argument +index.add(xb) # Add all 1M base vectors +``` + +### 3. Measure Performance (Batch Queries) +```python +import time +t0 = time.time() +D, I = index.search(xq, k=1) # Search all 10K queries at once +t1 = time.time() + +# Calculate metrics +recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / len(xq) +avg_query_time_ms = (t1 - t0) * 1000.0 / len(xq) +``` + +**Important**: `avg_query_time_ms` from **batch queries** is used for scoring. Batch queries benefit from CPU cache and vectorization, typically faster than single queries. + +### 4. Calculate Score +```python +if avg_query_time_ms > 7.7: + score = 0.0 +elif recall_at_1 >= 0.9914: + score = 100.0 +else: + recall_range = 0.9914 - 0.9409 + recall_proportion = (recall_at_1 - 0.9409) / recall_range + score = max(0.0, min(100.0, 100.0 * recall_proportion)) +``` + +Dataset Details +--------------- +- **Name**: SIFT1M +- **Base vectors**: 1,000,000 vectors of dimension 128 +- **Query vectors**: 10,000 vectors +- **Ground truth**: Precomputed nearest neighbors (k=1) +- **Metric**: L2 (Euclidean distance) +- **Vector type**: float32 + +Runtime Platform +---------------- +- **Infrastructure**: Evaluations run on SkyPilot-managed cloud instances (AWS, GCP, or Azure) +- **Compute**: CPU-only instances (no GPU required) +- **Environment**: Docker containerized execution with Python 3, NumPy ≥1.24, FAISS-CPU ≥1.7.4 + +Constraints +----------- +- **Timeout**: 1 hour for entire evaluation (index construction + queries) +- **Memory**: Use reasonable memory (index should fit in RAM) +- **Latency 
constraint**: avg_query_time_ms ≤ 7.7ms +- **Recall range**: 0.9409 ≤ recall@1 ≤ 1.0 + +Strategy Tips +------------- +1. **Maximize recall**: Use 2× latency budget (7.7ms vs 5.775ms balanced) for more thorough search +2. **Batch optimization is key**: Your `search` should handle batch queries efficiently +3. **Parameter tuning for recall**: Higher HNSW efSearch (500-1000) or IVF nprobe (100-200) +4. **Trade latency for accuracy**: Unlike balanced tier, you can afford slower but more accurate search + +Example: Simple Baseline +------------------------- +```python +import numpy as np + +class SimpleIndex: + def __init__(self, dim: int, **kwargs): + self.dim = dim + self.xb = None + + def add(self, xb: np.ndarray) -> None: + if self.xb is None: + self.xb = xb.copy() + else: + self.xb = np.vstack([self.xb, xb]) + + def search(self, xq: np.ndarray, k: int) -> tuple: + # Compute all pairwise L2 distances + # xq: (nq, dim), xb: (N, dim) + # distances: (nq, N) + distances = np.sqrt(((xq[:, np.newaxis, :] - self.xb[np.newaxis, :, :]) ** 2).sum(axis=2)) + + # Get k nearest neighbors + indices = np.argpartition(distances, k-1, axis=1)[:, :k] + sorted_indices = np.argsort(distances[np.arange(len(xq))[:, None], indices], axis=1) + final_indices = indices[np.arange(len(xq))[:, None], sorted_indices] + final_distances = distances[np.arange(len(xq))[:, None], final_indices] + + return final_distances, final_indices +``` + +**Note**: This baseline achieves perfect recall (100%) but is too slow for large datasets. Use approximate methods like HNSW, IVF, or LSH for better speed-recall tradeoffs. 
+ +Debugging Tips +-------------- +- **Test locally**: Use a subset of data (e.g., 10K vectors) for faster iteration +- **Verify shapes**: Ensure `search` returns `(nq, k)` shaped arrays +- **Check recall calculation**: `(I[:, :1] == gt[:, :1]).sum() / len(xq)` +- **Profile latency**: Measure batch vs single query performance separately +- **Validate before submit**: Run full 1M dataset locally if possible diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/task.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/task.yaml new file mode 100644 index 00000000..33400563 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: vdb_pareto (high_recall) (TTT)" + description: | + Solve the 'vdb_pareto (high_recall)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: vdb_pareto + variant_name: "high_recall" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/vdb_pareto__high_recall/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/eval/grader.py b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/opencode.json b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff --git 
a/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/solution.py b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/statement.md b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/statement.md new file mode 100644 index 00000000..b6b53f6f --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed/statement.md @@ -0,0 +1,233 @@ +VDB Design Problem - Low Latency Tier +====================================== + +Problem Setting +--------------- +Design a Vector Database index optimized for **recall** subject to a **strict latency constraint**. This tier uses latency-gated scoring: solutions exceeding the latency threshold receive zero points, while solutions meeting the constraint are scored purely by recall@1. 
+ +**Optimization Goal**: Maximize recall@1 within latency constraint + +$$ +\text{score} = \begin{cases} +0 & \text{if } t_{\text{query}} > t_{\text{max}} \\ +100 & \text{if } t_{\text{query}} \leq t_{\text{max}} \text{ and } r \geq r_{\text{baseline}} \\ +100 \cdot \frac{r - r_{\text{min}}}{r_{\text{baseline}} - r_{\text{min}}} & \text{if } t_{\text{query}} \leq t_{\text{max}} \text{ and } r < r_{\text{baseline}} +\end{cases} +$$ + +Where: +- $r$: Your recall@1 +- $t_{\text{query}}$: Your average query latency (ms) +- $r_{\text{baseline}} = 0.9914$ (baseline recall) +- $r_{\text{min}} = 0.7931$ (minimum acceptable recall, 80% of baseline) +- $t_{\text{max}} = 2.31\text{ms}$ (maximum allowed latency, 60% of baseline 3.85ms) + +**Key Insight**: This tier has a very strict latency constraint (60% of baseline), requiring aggressive approximation while maintaining reasonable recall. + +Baseline Performance +-------------------- +- Recall@1: **0.9914** (99.14%) +- Avg query time: **3.85ms** +- Baseline score: **100** (recall equals baseline within latency constraint) + +Scoring Examples +---------------- +Assuming all solutions meet latency constraint ($t \leq 2.31\text{ms}$): + +| Recall@1 | Latency | Score Calculation | Score | +|----------|---------|-------------------|-------| +| 0.9914 | 2.00ms | $r = r_{\text{baseline}}$ → max score | **100** | +| 0.9500 | 2.00ms | $\frac{0.95 - 0.7931}{0.9914 - 0.7931} = 0.791$ | **79.1** | +| 0.9000 | 1.50ms | $\frac{0.90 - 0.7931}{0.9914 - 0.7931} = 0.539$ | **53.9** | +| 0.8500 | 1.00ms | $\frac{0.85 - 0.7931}{0.9914 - 0.7931} = 0.287$ | **28.7** | +| 0.7931 | 2.00ms | $r = r_{\text{min}}$ → minimum score | **0** | +| 0.9500 | **2.50ms** | $t > t_{\text{max}}$ → latency gate fails | **0** | + +**Note**: The strict latency constraint (2.31ms vs 5.775ms in balanced) requires aggressive approximation, typically resulting in lower recall. 
+ +API Specification +----------------- +Implement a class with the following interface: + +```python +import numpy as np +from typing import Tuple + +class YourIndexClass: + def __init__(self, dim: int, **kwargs): + """ + Initialize the index for vectors of dimension `dim`. + + Args: + dim: Vector dimensionality (e.g., 128 for SIFT1M) + **kwargs: Optional parameters (e.g., M, ef_construction for HNSW) + + Example: + index = YourIndexClass(dim=128, M=16, ef_search=80) + """ + pass + + def add(self, xb: np.ndarray) -> None: + """ + Add vectors to the index. + + Args: + xb: Base vectors, shape (N, dim), dtype float32 + + Notes: + - Can be called multiple times (cumulative) + - Must handle large N (e.g., 1,000,000 vectors) + + Example: + index.add(xb) # xb.shape = (1000000, 128) + """ + pass + + def search(self, xq: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]: + """ + Search for k nearest neighbors of query vectors. + + Args: + xq: Query vectors, shape (nq, dim), dtype float32 + k: Number of nearest neighbors to return + + Returns: + (distances, indices): + - distances: shape (nq, k), dtype float32, L2 distances + - indices: shape (nq, k), dtype int64, indices into base vectors + + Notes: + - Must return exactly k neighbors per query + - Indices should refer to positions in the vectors passed to add() + - Lower distance = more similar + + Example: + D, I = index.search(xq, k=1) # xq.shape = (10000, 128) + # D.shape = (10000, 1), I.shape = (10000, 1) + """ + pass +``` + +**Implementation Requirements**: +- Class can have any name (evaluator auto-discovers classes with `add` and `search` methods) +- Must handle SIFT1M dataset: 1M base vectors, 10K queries, 128 dimensions +- Your `search` must return tuple `(distances, indices)` with shapes `(nq, k)` +- Distances should be L2 (Euclidean) or L2-squared +- No need to handle dataset loading - evaluator provides numpy arrays + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. 
Load Dataset +```python +from faiss.contrib.datasets import DatasetSIFT1M +ds = DatasetSIFT1M() +xb = ds.get_database() # (1000000, 128) float32 +xq = ds.get_queries() # (10000, 128) float32 +gt = ds.get_groundtruth() # (10000, 100) int64 - ground truth indices +``` + +### 2. Build Index +```python +from solution import YourIndexClass # Auto-discovered +d = xb.shape[1] # 128 for SIFT1M +index = YourIndexClass(d) # Pass dimension as first argument +index.add(xb) # Add all 1M base vectors +``` + +### 3. Measure Performance (Batch Queries) +```python +import time +t0 = time.time() +D, I = index.search(xq, k=1) # Search all 10K queries at once +t1 = time.time() + +# Calculate metrics +recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / len(xq) +avg_query_time_ms = (t1 - t0) * 1000.0 / len(xq) +``` + +**Important**: `avg_query_time_ms` from **batch queries** is used for scoring. Batch queries benefit from CPU cache and vectorization, typically faster than single queries. + +### 4. Calculate Score +```python +if avg_query_time_ms > 2.31: + score = 0.0 +elif recall_at_1 >= 0.9914: + score = 100.0 +else: + recall_range = 0.9914 - 0.7931 + recall_proportion = (recall_at_1 - 0.7931) / recall_range + score = max(0.0, min(100.0, 100.0 * recall_proportion)) +``` + +Dataset Details +--------------- +- **Name**: SIFT1M +- **Base vectors**: 1,000,000 vectors of dimension 128 +- **Query vectors**: 10,000 vectors +- **Ground truth**: Precomputed nearest neighbors (k=1) +- **Metric**: L2 (Euclidean distance) +- **Vector type**: float32 + +Runtime Platform +---------------- +- **Infrastructure**: Evaluations run on SkyPilot-managed cloud instances (AWS, GCP, or Azure) +- **Compute**: CPU-only instances (no GPU required) +- **Environment**: Docker containerized execution with Python 3, NumPy ≥1.24, FAISS-CPU ≥1.7.4 + +Constraints +----------- +- **Timeout**: 1 hour for entire evaluation (index construction + queries) +- **Memory**: Use reasonable memory (index should fit in RAM) +- **Latency 
constraint**: avg_query_time_ms ≤ 2.31ms +- **Recall range**: 0.7931 ≤ recall@1 ≤ 1.0 + +Strategy Tips +------------- +1. **Aggressive approximation**: Use very low search budgets (IVF nprobe=2-5, HNSW efSearch=50-100) +2. **Batch optimization is key**: Your `search` should handle batch queries efficiently +3. **Accept recall drops**: 80-90% recall is acceptable if latency is met +4. **Leave safety margin**: Target 1.5-2.0ms to avoid edge cases exceeding 2.31ms + +Example: Simple Baseline +------------------------- +```python +import numpy as np + +class SimpleIndex: + def __init__(self, dim: int, **kwargs): + self.dim = dim + self.xb = None + + def add(self, xb: np.ndarray) -> None: + if self.xb is None: + self.xb = xb.copy() + else: + self.xb = np.vstack([self.xb, xb]) + + def search(self, xq: np.ndarray, k: int) -> tuple: + # Compute all pairwise L2 distances + # xq: (nq, dim), xb: (N, dim) + # distances: (nq, N) + distances = np.sqrt(((xq[:, np.newaxis, :] - self.xb[np.newaxis, :, :]) ** 2).sum(axis=2)) + + # Get k nearest neighbors + indices = np.argpartition(distances, k-1, axis=1)[:, :k] + sorted_indices = np.argsort(distances[np.arange(len(xq))[:, None], indices], axis=1) + final_indices = indices[np.arange(len(xq))[:, None], sorted_indices] + final_distances = distances[np.arange(len(xq))[:, None], final_indices] + + return final_distances, final_indices +``` + +**Note**: This baseline achieves perfect recall (100%) but is too slow for large datasets. Use approximate methods like HNSW, IVF, or LSH for better speed-recall tradeoffs. 
+ +Debugging Tips +-------------- +- **Test locally**: Use a subset of data (e.g., 10K vectors) for faster iteration +- **Verify shapes**: Ensure `search` returns `(nq, k)` shaped arrays +- **Check recall calculation**: `(I[:, :1] == gt[:, :1]).sum() / len(xq)` +- **Profile latency**: Measure batch vs single query performance separately +- **Validate before submit**: Run full 1M dataset locally if possible diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/task.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/task.yaml new file mode 100644 index 00000000..86a8a3a6 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: vdb_pareto (low_latency) (TTT)" + description: | + Solve the 'vdb_pareto (low_latency)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: vdb_pareto + variant_name: "low_latency" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/vdb_pareto__low_latency/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/eval/grader.py b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/opencode.json b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff 
--git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/solution.py b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/statement.md b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/statement.md new file mode 100644 index 00000000..93000e43 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed/statement.md @@ -0,0 +1,234 @@ +VDB Design Problem - Recall80 Latency Tier +=========================================== + +Problem Setting +--------------- +Design a Vector Database index optimized for **latency** subject to a **recall constraint**. This tier uses recall-gated scoring: solutions failing to meet the recall threshold receive zero points, while solutions meeting the constraint are scored purely by latency. 
+ +**Optimization Goal**: Minimize latency within recall constraint + +$$ +\text{score} = \begin{cases} +0 & \text{if } r < r_{\text{gate}} \\ +100 & \text{if } r \geq r_{\text{gate}} \text{ and } t_{\text{query}} \leq t_{\text{min}} \\ +100 \cdot \frac{t_{\text{max}} - t_{\text{query}}}{t_{\text{max}} - t_{\text{min}}} & \text{if } r \geq r_{\text{gate}} \text{ and } t_{\text{min}} < t_{\text{query}} < t_{\text{max}} \\ +0 & \text{if } r \geq r_{\text{gate}} \text{ and } t_{\text{query}} \geq t_{\text{max}} +\end{cases} +$$ + +Where: +- $r$: Your recall@1 +- $t_{\text{query}}$: Your average query latency (ms) +- $r_{\text{gate}} = 0.80$ (minimum required recall) +- $t_{\text{min}} = 0.0\text{ms}$ (best possible latency) +- $t_{\text{max}} = 0.6\text{ms}$ (maximum allowed latency) + +**Key Insight**: Unlike other tiers, this tier gates on recall and scores on latency. You MUST achieve ≥80% recall, then faster is better. + +Baseline Performance +-------------------- +- Recall@1: **0.9914** (99.14%) +- Avg query time: **3.85ms** + +Scoring Examples +---------------- +All examples assume recall constraint is met ($r \geq 0.80$): + +| Recall@1 | Latency | Score Calculation | Score | +|----------|---------|-------------------|-------| +| 0.85 | 0.00ms | $t \leq t_{\text{min}}$ → max score | **100** | +| 0.85 | 0.30ms | $\frac{0.6 - 0.3}{0.6 - 0.0} = 0.50$ | **50** | +| 0.82 | 0.50ms | $\frac{0.6 - 0.5}{0.6 - 0.0} = 0.167$ | **16.7** | +| 0.90 | 0.10ms | $\frac{0.6 - 0.1}{0.6 - 0.0} = 0.833$ | **83.3** | +| **0.75** | 0.20ms | $r < r_{\text{gate}}$ → recall gate fails | **0** | +| 0.95 | **0.70ms** | $t \geq t_{\text{max}}$ → latency too high | **0** | + +**Note**: This is the most aggressive latency requirement (0.6ms max). You must use extreme approximation while maintaining 80% recall. 
+ +API Specification +----------------- +Implement a class with the following interface: + +```python +import numpy as np +from typing import Tuple + +class YourIndexClass: + def __init__(self, dim: int, **kwargs): + """ + Initialize the index for vectors of dimension `dim`. + + Args: + dim: Vector dimensionality (e.g., 128 for SIFT1M) + **kwargs: Optional parameters (e.g., M, ef_construction for HNSW) + + Example: + index = YourIndexClass(dim=128, nlist=256, nprobe=2) + """ + pass + + def add(self, xb: np.ndarray) -> None: + """ + Add vectors to the index. + + Args: + xb: Base vectors, shape (N, dim), dtype float32 + + Notes: + - Can be called multiple times (cumulative) + - Must handle large N (e.g., 1,000,000 vectors) + + Example: + index.add(xb) # xb.shape = (1000000, 128) + """ + pass + + def search(self, xq: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]: + """ + Search for k nearest neighbors of query vectors. + + Args: + xq: Query vectors, shape (nq, dim), dtype float32 + k: Number of nearest neighbors to return + + Returns: + (distances, indices): + - distances: shape (nq, k), dtype float32, L2 distances + - indices: shape (nq, k), dtype int64, indices into base vectors + + Notes: + - Must return exactly k neighbors per query + - Indices should refer to positions in the vectors passed to add() + - Lower distance = more similar + + Example: + D, I = index.search(xq, k=1) # xq.shape = (10000, 128) + # D.shape = (10000, 1), I.shape = (10000, 1) + """ + pass +``` + +**Implementation Requirements**: +- Class can have any name (evaluator auto-discovers classes with `add` and `search` methods) +- Must handle SIFT1M dataset: 1M base vectors, 10K queries, 128 dimensions +- Your `search` must return tuple `(distances, indices)` with shapes `(nq, k)` +- Distances should be L2 (Euclidean) or L2-squared +- No need to handle dataset loading - evaluator provides numpy arrays + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. 
Load Dataset +```python +from faiss.contrib.datasets import DatasetSIFT1M +ds = DatasetSIFT1M() +xb = ds.get_database() # (1000000, 128) float32 +xq = ds.get_queries() # (10000, 128) float32 +gt = ds.get_groundtruth() # (10000, 100) int64 - ground truth indices +``` + +### 2. Build Index +```python +from solution import YourIndexClass # Auto-discovered +d = xb.shape[1] # 128 for SIFT1M +index = YourIndexClass(d) # Pass dimension as first argument +index.add(xb) # Add all 1M base vectors +``` + +### 3. Measure Performance (Batch Queries) +```python +import time +t0 = time.time() +D, I = index.search(xq, k=1) # Search all 10K queries at once +t1 = time.time() + +# Calculate metrics +recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / len(xq) +avg_query_time_ms = (t1 - t0) * 1000.0 / len(xq) +``` + +**Important**: `avg_query_time_ms` from **batch queries** is used for scoring. Batch queries benefit from CPU cache and vectorization, typically faster than single queries. + +### 4. Calculate Score +```python +if recall_at_1 < 0.80: + score = 0.0 +elif avg_query_time_ms <= 0.0: + score = 100.0 +elif avg_query_time_ms >= 0.6: + score = 0.0 +else: + proportion = (avg_query_time_ms - 0.0) / (0.6 - 0.0) + score = 100.0 * (1.0 - proportion) +``` + +Dataset Details +--------------- +- **Name**: SIFT1M +- **Base vectors**: 1,000,000 vectors of dimension 128 +- **Query vectors**: 10,000 vectors +- **Ground truth**: Precomputed nearest neighbors (k=1) +- **Metric**: L2 (Euclidean distance) +- **Vector type**: float32 + +Runtime Platform +---------------- +- **Infrastructure**: Evaluations run on SkyPilot-managed cloud instances (AWS, GCP, or Azure) +- **Compute**: CPU-only instances (no GPU required) +- **Environment**: Docker containerized execution with Python 3, NumPy ≥1.24, FAISS-CPU ≥1.7.4 + +Constraints +----------- +- **Timeout**: 1 hour for entire evaluation (index construction + queries) +- **Memory**: Use reasonable memory (index should fit in RAM) +- **Recall constraint**: 
recall@1 ≥ 0.80 +- **Latency range**: 0.0ms ≤ avg_query_time_ms ≤ 0.6ms + +Strategy Tips +------------- +1. **Meet recall gate first**: Ensure ≥80% recall, otherwise score = 0 +2. **Extreme approximation**: Use minimal search budget (IVF nprobe=1-3) +3. **Batch optimization critical**: 0.6ms is extremely tight, every microsecond counts +4. **Trade recall for speed**: 80-85% recall with ultra-low latency is ideal + +Example: Simple Baseline +------------------------- +```python +import numpy as np + +class SimpleIndex: + def __init__(self, dim: int, **kwargs): + self.dim = dim + self.xb = None + + def add(self, xb: np.ndarray) -> None: + if self.xb is None: + self.xb = xb.copy() + else: + self.xb = np.vstack([self.xb, xb]) + + def search(self, xq: np.ndarray, k: int) -> tuple: + # Compute all pairwise L2 distances + # xq: (nq, dim), xb: (N, dim) + # distances: (nq, N) + distances = np.sqrt(((xq[:, np.newaxis, :] - self.xb[np.newaxis, :, :]) ** 2).sum(axis=2)) + + # Get k nearest neighbors + indices = np.argpartition(distances, k-1, axis=1)[:, :k] + sorted_indices = np.argsort(distances[np.arange(len(xq))[:, None], indices], axis=1) + final_indices = indices[np.arange(len(xq))[:, None], sorted_indices] + final_distances = distances[np.arange(len(xq))[:, None], final_indices] + + return final_distances, final_indices +``` + +**Note**: This baseline achieves perfect recall (100%) but is too slow for large datasets. Use approximate methods like HNSW, IVF, or LSH for better speed-recall tradeoffs. 
+ +Debugging Tips +-------------- +- **Test locally**: Use a subset of data (e.g., 10K vectors) for faster iteration +- **Verify shapes**: Ensure `search` returns `(nq, k)` shaped arrays +- **Check recall calculation**: `(I[:, :1] == gt[:, :1]).sum() / len(xq)` +- **Profile latency**: Measure batch vs single query performance separately +- **Validate before submit**: Run full 1M dataset locally if possible diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/task.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/task.yaml new file mode 100644 index 00000000..2534ae87 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: vdb_pareto (recall80_latency) (TTT)" + description: | + Solve the 'vdb_pareto (recall80_latency)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: vdb_pareto + variant_name: "recall80_latency" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/vdb_pareto__recall80_latency/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/eval/grader.py b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/eval/grader.py new file mode 100644 index 00000000..c438946c --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/eval/grader.py @@ -0,0 +1,67 @@ +"""Frontier-CS Research grader — uses frontier_cs package. + +Delegates evaluation to the frontier_cs SingleEvaluator which handles +Docker-based execution and scoring. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from coral.grader import TaskGrader +from coral.types import ScoreBundle + + +class Grader(TaskGrader): + """Grader for a Frontier-CS research problem via frontier_cs package.""" + + def evaluate(self) -> ScoreBundle: + problem_name = self.args.get("problem_name", "") + variant_name = self.args.get("variant_name", "") + language = self.args.get("language", "python") + + if not problem_name: + return self.fail("grader arg 'problem_name' is required") + + # Build problem_id for frontier_cs API + if variant_name: + problem_id = f"{problem_name}/{variant_name}" + else: + problem_id = problem_name + + # Find solution + sol_file = "solution.cpp" if language == "cpp" else "solution.py" + solution_path = Path(self.codebase_path) / sol_file + if not solution_path.exists(): + return self.score(0.0, feedback=f"No {sol_file} found in workspace.") + + code = solution_path.read_text() + if not code.strip(): + return self.score(0.0, feedback=f"{sol_file} is empty.") + + # Use frontier_cs evaluator + import os + + from frontier_cs import SingleEvaluator + + # base_dir points to the cloned Frontier-CS repo containing research/problems/ + base_dir = os.environ.get("FRONTIER_CS_BASE_DIR", os.path.expanduser("~/Frontier-CS")) + base_dir_path = Path(base_dir) if base_dir else None + + evaluator = SingleEvaluator(backend="docker", base_dir=base_dir_path, register_cleanup=False) + result = evaluator.evaluate("research", problem_id=problem_id, code=code) + + if not result.success: + msg = result.message or "Evaluation failed" + return self.score(0.0, feedback=msg) + + score = result.score if result.score is not None else 0.0 + + feedback_parts = [f"Score: {score:.2f}/100"] + if result.metadata: + for key in ["score_unbounded", "accuracy", "speedup", "avg_runtime"]: + val = result.metadata.get(key) + if val is not None: + feedback_parts.append(f"{key}: {val}") + + return self.score(score, 
feedback="\n".join(feedback_parts)) diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/litellm_config.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/litellm_config.yaml new file mode 100644 index 00000000..0fcbeea4 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/litellm_config.yaml @@ -0,0 +1,22 @@ +model_list: + - model_name: "qwen3-4b" + litellm_params: + model: "hosted_vllm/qwen3-4b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + - model_name: "qwen3-30b-a3b" + litellm_params: + model: "hosted_vllm/qwen3-30b-a3b" + api_base: "http://127.0.0.1:30000/v1" + api_key: "EMPTY" + drop_params: true + max_tokens: 4096 + +litellm_settings: + drop_params: true + modify_params: true + +general_settings: + forward_client_headers_to_llm_api: true diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/opencode.json b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/opencode.json new file mode 100644 index 00000000..3dbee36e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/opencode.json @@ -0,0 +1,45 @@ +{ + "$schema": "https://opencode.ai/config.json", + "permission": { + "external_directory": "allow", + "question": "deny", + "doom_loop": "allow", + "bash": "allow", + "edit": "allow", + "read": "allow", + "write": "allow", + "webfetch": "deny", + "websearch": "deny", + "codesearch": "deny", + "lsp": "deny", + "skill": "deny" + }, + "provider": { + "sglang": { + "npm": "@ai-sdk/openai-compatible", + "name": "sglang", + "options": { + "baseURL": "http://127.0.0.1:4000/v1", + "apiKey": "EMPTY" + }, + "models": { + "qwen3-30b-a3b": { + "name": "qwen3-30b-a3b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + }, + "qwen3-4b": { + "name": "qwen3-4b", + "limit": { + "context": 32768, + "input": 28672, + "output": 4096 + } + } + } + } + } + } diff 
--git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/solution.py b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/solution.py new file mode 100644 index 00000000..06a16353 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/solution.py @@ -0,0 +1,10 @@ +class Solution: + """Solution for Frontier-CS research problem. + + Read the problem statement in statement.md for implementation details + and the expected interface. + """ + + def solve(self, *args, **kwargs): + """Implement the solve method as described in statement.md.""" + raise NotImplementedError("Implement this method") diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/statement.md b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/statement.md new file mode 100644 index 00000000..3a78ad72 --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed/statement.md @@ -0,0 +1,234 @@ +VDB Design Problem - Recall95 Latency Tier +=========================================== + +Problem Setting +--------------- +Design a Vector Database index optimized for **latency** subject to a **high recall constraint**. This tier uses recall-gated scoring: solutions failing to meet the recall threshold receive zero points, while solutions meeting the constraint are scored purely by latency. 
+ +**Optimization Goal**: Minimize latency within recall constraint + +$$ +\text{score} = \begin{cases} +0 & \text{if } r < r_{\text{gate}} \\ +100 & \text{if } r \geq r_{\text{gate}} \text{ and } t_{\text{query}} \leq t_{\text{min}} \\ +100 \cdot \frac{t_{\text{max}} - t_{\text{query}}}{t_{\text{max}} - t_{\text{min}}} & \text{if } r \geq r_{\text{gate}} \text{ and } t_{\text{min}} < t_{\text{query}} < t_{\text{max}} \\ +0 & \text{if } r \geq r_{\text{gate}} \text{ and } t_{\text{query}} \geq t_{\text{max}} +\end{cases} +$$ + +Where: +- $r$: Your recall@1 +- $t_{\text{query}}$: Your average query latency (ms) +- $r_{\text{gate}} = 0.95$ (minimum required recall) +- $t_{\text{min}} = 0.0\text{ms}$ (best possible latency) +- $t_{\text{max}} = 7.7\text{ms}$ (maximum allowed latency) + +**Key Insight**: This tier requires high recall (95%), but provides generous latency budget (7.7ms). Focus on recall first, then optimize latency. + +Baseline Performance +-------------------- +- Recall@1: **0.9914** (99.14%) +- Avg query time: **3.85ms** + +Scoring Examples +---------------- +All examples assume recall constraint is met ($r \geq 0.95$): + +| Recall@1 | Latency | Score Calculation | Score | +|----------|---------|-------------------|-------| +| 0.96 | 0.00ms | $t \leq t_{\text{min}}$ → max score | **100** | +| 0.96 | 3.85ms | $\frac{7.7 - 3.85}{7.7 - 0.0} = 0.50$ | **50.0** | +| 0.97 | 5.00ms | $\frac{7.7 - 5.0}{7.7 - 0.0} = 0.351$ | **35.1** | +| 0.98 | 2.00ms | $\frac{7.7 - 2.0}{7.7 - 0.0} = 0.740$ | **74.0** | +| **0.94** | 2.00ms | $r < r_{\text{gate}}$ → recall gate fails | **0** | +| 0.96 | **8.00ms** | $t \geq t_{\text{max}}$ → latency too high | **0** | + +**Note**: The 95% recall requirement is strict, but the 7.7ms latency budget is generous, allowing thorough search strategies. 
+ +API Specification +----------------- +Implement a class with the following interface: + +```python +import numpy as np +from typing import Tuple + +class YourIndexClass: + def __init__(self, dim: int, **kwargs): + """ + Initialize the index for vectors of dimension `dim`. + + Args: + dim: Vector dimensionality (e.g., 128 for SIFT1M) + **kwargs: Optional parameters (e.g., M, ef_construction for HNSW) + + Example: + index = YourIndexClass(dim=128, M=64, ef_search=400) + """ + pass + + def add(self, xb: np.ndarray) -> None: + """ + Add vectors to the index. + + Args: + xb: Base vectors, shape (N, dim), dtype float32 + + Notes: + - Can be called multiple times (cumulative) + - Must handle large N (e.g., 1,000,000 vectors) + + Example: + index.add(xb) # xb.shape = (1000000, 128) + """ + pass + + def search(self, xq: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray]: + """ + Search for k nearest neighbors of query vectors. + + Args: + xq: Query vectors, shape (nq, dim), dtype float32 + k: Number of nearest neighbors to return + + Returns: + (distances, indices): + - distances: shape (nq, k), dtype float32, L2 distances + - indices: shape (nq, k), dtype int64, indices into base vectors + + Notes: + - Must return exactly k neighbors per query + - Indices should refer to positions in the vectors passed to add() + - Lower distance = more similar + + Example: + D, I = index.search(xq, k=1) # xq.shape = (10000, 128) + # D.shape = (10000, 1), I.shape = (10000, 1) + """ + pass +``` + +**Implementation Requirements**: +- Class can have any name (evaluator auto-discovers classes with `add` and `search` methods) +- Must handle SIFT1M dataset: 1M base vectors, 10K queries, 128 dimensions +- Your `search` must return tuple `(distances, indices)` with shapes `(nq, k)` +- Distances should be L2 (Euclidean) or L2-squared +- No need to handle dataset loading - evaluator provides numpy arrays + +Evaluation Process +------------------ +The evaluator follows these steps: + +### 1. 
Load Dataset +```python +from faiss.contrib.datasets import DatasetSIFT1M +ds = DatasetSIFT1M() +xb = ds.get_database() # (1000000, 128) float32 +xq = ds.get_queries() # (10000, 128) float32 +gt = ds.get_groundtruth() # (10000, 100) int64 - ground truth indices +``` + +### 2. Build Index +```python +from solution import YourIndexClass # Auto-discovered +d = xb.shape[1] # 128 for SIFT1M +index = YourIndexClass(d) # Pass dimension as first argument +index.add(xb) # Add all 1M base vectors +``` + +### 3. Measure Performance (Batch Queries) +```python +import time +t0 = time.time() +D, I = index.search(xq, k=1) # Search all 10K queries at once +t1 = time.time() + +# Calculate metrics +recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / len(xq) +avg_query_time_ms = (t1 - t0) * 1000.0 / len(xq) +``` + +**Important**: `avg_query_time_ms` from **batch queries** is used for scoring. Batch queries benefit from CPU cache and vectorization, typically faster than single queries. + +### 4. Calculate Score +```python +if recall_at_1 < 0.95: + score = 0.0 +elif avg_query_time_ms <= 0.0: + score = 100.0 +elif avg_query_time_ms >= 7.7: + score = 0.0 +else: + proportion = (avg_query_time_ms - 0.0) / (7.7 - 0.0) + score = 100.0 * (1.0 - proportion) +``` + +Dataset Details +--------------- +- **Name**: SIFT1M +- **Base vectors**: 1,000,000 vectors of dimension 128 +- **Query vectors**: 10,000 vectors +- **Ground truth**: Precomputed nearest neighbors (k=1) +- **Metric**: L2 (Euclidean distance) +- **Vector type**: float32 + +Runtime Platform +---------------- +- **Infrastructure**: Evaluations run on SkyPilot-managed cloud instances (AWS, GCP, or Azure) +- **Compute**: CPU-only instances (no GPU required) +- **Environment**: Docker containerized execution with Python 3, NumPy ≥1.24, FAISS-CPU ≥1.7.4 + +Constraints +----------- +- **Timeout**: 1 hour for entire evaluation (index construction + queries) +- **Memory**: Use reasonable memory (index should fit in RAM) +- **Recall constraint**: 
recall@1 ≥ 0.95 +- **Latency range**: 0.0ms ≤ avg_query_time_ms ≤ 7.7ms + +Strategy Tips +------------- +1. **Meet recall gate first**: Ensure ≥95% recall, otherwise score = 0 +2. **Use moderate approximation**: Higher recall requirement means less aggressive approximation +3. **Batch optimization is key**: Your `search` should handle batch queries efficiently +4. **Balance recall and latency**: Aim for 95-99% recall with 3-5ms latency + +Example: Simple Baseline +------------------------- +```python +import numpy as np + +class SimpleIndex: + def __init__(self, dim: int, **kwargs): + self.dim = dim + self.xb = None + + def add(self, xb: np.ndarray) -> None: + if self.xb is None: + self.xb = xb.copy() + else: + self.xb = np.vstack([self.xb, xb]) + + def search(self, xq: np.ndarray, k: int) -> tuple: + # Compute all pairwise L2 distances + # xq: (nq, dim), xb: (N, dim) + # distances: (nq, N) + distances = np.sqrt(((xq[:, np.newaxis, :] - self.xb[np.newaxis, :, :]) ** 2).sum(axis=2)) + + # Get k nearest neighbors + indices = np.argpartition(distances, k-1, axis=1)[:, :k] + sorted_indices = np.argsort(distances[np.arange(len(xq))[:, None], indices], axis=1) + final_indices = indices[np.arange(len(xq))[:, None], sorted_indices] + final_distances = distances[np.arange(len(xq))[:, None], final_indices] + + return final_distances, final_indices +``` + +**Note**: This baseline achieves perfect recall (100%) but is too slow for large datasets. Use approximate methods like HNSW, IVF, or LSH for better speed-recall tradeoffs. 
+ +Debugging Tips +-------------- +- **Test locally**: Use a subset of data (e.g., 10K vectors) for faster iteration +- **Verify shapes**: Ensure `search` returns `(nq, k)` shaped arrays +- **Check recall calculation**: `(I[:, :1] == gt[:, :1]).sum() / len(xq)` +- **Profile latency**: Measure batch vs single query performance separately +- **Validate before submit**: Run full 1M dataset locally if possible diff --git a/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/task.yaml b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/task.yaml new file mode 100644 index 00000000..c2334f6e --- /dev/null +++ b/ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/task.yaml @@ -0,0 +1,46 @@ +task: + name: "Frontier-CS: vdb_pareto (recall95_latency) (TTT)" + description: | + Solve the 'vdb_pareto (recall95_latency)' research problem from the Frontier-CS benchmark. + + Read the full problem statement in `statement.md`. + Write your solution in `solution.py`. + + Your score will be 0-100 based on solution quality. + tips: | + - Read statement.md carefully for the exact interface and scoring formula. + - Evaluation timeout: 3600s. + - Language: python. 
+ +grader: + timeout: 3600 + direction: maximize + args: + problem_name: vdb_pareto + variant_name: "recall95_latency" + language: python + needs_gpu: false + +agents: + count: 1 + runtime: opencode + model: sglang/qwen3-4b + research: false + max_turns: 200 + gateway: + enabled: true + config: "./seed/litellm_config.yaml" + heartbeat: + - name: reflect + every: 5 + - name: diagnose + every: 5 + +workspace: + results_dir: "./results" + repo_path: "./ttt/examples/frontier_cs_tasks/vdb_pareto__recall95_latency/seed" + +run: + verbose: false + ui: false + session: local diff --git a/ttt/run_coral_distill.sh b/ttt/run_coral_distill.sh index 83964123..a7a77885 100755 --- a/ttt/run_coral_distill.sh +++ b/ttt/run_coral_distill.sh @@ -204,18 +204,22 @@ else fi # --- SFT loss (replaces GRPO) --- -# Use KL loss to prevent drift from the reference model. -# No policy gradient, no clipping, no advantage estimation. +# Optionally use KL loss to prevent drift from the reference model. +# Disabled by default because the entropy computation OOMs on small GPU setups. SFT_ARGS=( --advantage-estimator grpo --disable-rewards-normalization - --use-kl-loss - --kl-loss-coef "${KL_LOSS_COEF:-0.01}" - --kl-loss-type low_var_kl --entropy-coef 0.00 --eps-clip 1000.0 --eps-clip-high 1000.0 ) +if [ "${USE_KL_LOSS:-0}" = "1" ]; then + SFT_ARGS+=( + --use-kl-loss + --kl-loss-coef "${KL_LOSS_COEF:-0.01}" + --kl-loss-type low_var_kl + ) +fi # NOTE: eps-clip is set very high (1000) to effectively disable clipping. # With SFT data (all rewards positive, loss_mask=1 for good trajectories), # the loss reduces to standard cross-entropy + KL regularization. 
diff --git a/ttt/slime/slime/backends/fsdp_utils/actor.py b/ttt/slime/slime/backends/fsdp_utils/actor.py index 3c5c2717..6db04e13 100644 --- a/ttt/slime/slime/backends/fsdp_utils/actor.py +++ b/ttt/slime/slime/backends/fsdp_utils/actor.py @@ -377,6 +377,7 @@ def _compute_log_prob( target_tokens=batch["tokens"], allow_compile=not self.args.true_on_policy_mode, temperature=self.args.rollout_temperature, + compute_entropy=(store_prefix == "" and self.args.entropy_coef != 0.0), ) batch[f"{store_prefix}log_probs"] = log_probs_result if store_prefix == "":