Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions packages/cli/src/__tests__/runnerSlug.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import {describe, expect, it} from "vitest";
import {isNativeRunnerSlug} from "../models/nativeRunnerModel.js";
import {isWebRunnerSlug} from "../models/webRunnerModel.js";

// Routing contract under test: a slug is claimed by at most one of the web /
// native runner predicates, and slugs without the kora-app- prefix are claimed
// by neither.
describe("runner slug routing", () => {
  it("routes plain kora-app-* to web", () => {
    const slug = "kora-app-chatgpt";
    expect(isWebRunnerSlug(slug)).toBe(true);
    expect(isNativeRunnerSlug(slug)).toBe(false);
  });

  it("routes -android suffix to native", () => {
    const slug = "kora-app-tiktok-android";
    expect(isNativeRunnerSlug(slug)).toBe(true);
    expect(isWebRunnerSlug(slug)).toBe(false);
  });

  it("ignores non-kora slugs entirely", () => {
    const foreignSlugs = ["custom-something", "anthropic/claude-opus-4-7"];
    for (const slug of foreignSlugs) {
      expect(isWebRunnerSlug(slug)).toBe(false);
      expect(isNativeRunnerSlug(slug)).toBe(false);
    }
  });
});
4 changes: 3 additions & 1 deletion packages/cli/src/commands/shared/buildContext.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import * as R from "remeda";
import {createCustomModel} from "../../models/customModel.js";
import {createGatewayModel} from "../../models/gatewayModel.js";
import {Model} from "../../models/model.js";
import {isNativeRunnerSlug} from "../../models/nativeRunnerModel.js";
import {isWebRunnerSlug} from "../../models/webRunnerModel.js";

export interface BuiltContext {
Expand Down Expand Up @@ -63,7 +64,8 @@ export function resolveTargetGatewayModel(
): Model | undefined {
if (
targetModelSlug.startsWith("custom-") ||
isWebRunnerSlug(targetModelSlug)
isWebRunnerSlug(targetModelSlug) ||
isNativeRunnerSlug(targetModelSlug)
) {
return undefined;
}
Expand Down
11 changes: 11 additions & 0 deletions packages/cli/src/models/customModel.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
import {Scenario} from "@korabench/benchmark";
import {Model} from "./model.js";
import {
createNativeRunnerModel,
isNativeRunnerSlug,
} from "./nativeRunnerModel.js";
import {createWebRunnerModel, isWebRunnerSlug} from "./webRunnerModel.js";

// Fallback base URL for the web runner, used when WEB_RUNNER_URL is not set.
const DEFAULT_WEB_RUNNER_URL = "http://localhost:7100";
// Fallback base URL for the native runner, used when NATIVE_RUNNER_URL is not set.
const DEFAULT_NATIVE_RUNNER_URL = "http://localhost:7200";

export async function createCustomModel(
modelSlug: string,
_scenario: Scenario
): Promise<Model> {
if (isNativeRunnerSlug(modelSlug)) {
const nativeRunnerUrl =
process.env.NATIVE_RUNNER_URL ?? DEFAULT_NATIVE_RUNNER_URL;
const apiKey = process.env.NATIVE_RUNNER_API_KEY;
return createNativeRunnerModel({modelSlug, nativeRunnerUrl, apiKey});
}
if (isWebRunnerSlug(modelSlug)) {
const webRunnerUrl = process.env.WEB_RUNNER_URL ?? DEFAULT_WEB_RUNNER_URL;
const apiKey = process.env.WEB_RUNNER_API_KEY;
Expand Down
152 changes: 152 additions & 0 deletions packages/cli/src/models/nativeRunnerModel.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import {ModelRequest, TypedModelRequest} from "@korabench/core";
import {randomUUID} from "node:crypto";
import {Model} from "./model.js";

// Prefix a slug must carry to be handled here; the remainder of the slug is
// what gets sent to the native-runner as the `app` identifier.
const KORA_APP_PREFIX = "kora-app-";
// Platform suffixes that mark a kora-app slug as a native (on-device) target.
const NATIVE_SUFFIXES = ["-android"] as const;

/**
 * Values the native-runner may place in the `blockedReason` field of a
 * session or turn response.
 */
export type BlockedReason =
  | "device_locked"
  | "device_busy"
  | "login_required"
  | "rate_limit"
  | "unknown_block";

/**
 * Raised when a native-runner response carries a `blockedReason`. The reason
 * is exposed on the instance so callers can branch on it.
 */
export class BlockedNativeAppError extends Error {
  readonly reason: BlockedReason;

  constructor(reason: BlockedReason) {
    super("Native app blocked: " + reason);
    this.reason = reason;
    this.name = "BlockedNativeAppError";
  }
}

/** Options accepted by createNativeRunnerModel. */
interface NativeRunnerModelConfig {
/** Target slug; must start with the `kora-app-` prefix or model creation throws. */
modelSlug: string;
/** Base URL of the native-runner HTTP service; `/sessions…` paths are appended to it. */
nativeRunnerUrl: string;
/** When set, sent as an `authorization: Bearer …` header on every request. */
apiKey?: string;
/** Sent in the session-creation request body; a random UUID is used when omitted. */
runId?: string;
/** Sent in the session-creation request body; a random UUID is used when omitted. */
testKey?: string;
}

/**
 * Derives the native-runner app identifier from a model slug by dropping the
 * leading "kora-app-" prefix; everything after the prefix is returned as-is.
 *
 * @throws Error when the slug does not start with the prefix.
 */
function modelSlugToApp(slug: string): string {
  const hasPrefix = slug.startsWith(KORA_APP_PREFIX);
  if (hasPrefix) {
    return slug.substring(KORA_APP_PREFIX.length);
  }
  throw new Error(
    `NativeRunnerModel expected a slug starting with "${KORA_APP_PREFIX}"; got "${slug}"`
  );
}

/**
 * Returns the content of the most recent message whose role is "user",
 * scanning the transcript from the end towards the start.
 *
 * @throws Error when the transcript contains no user message.
 */
function lastUserContent(messages: ModelRequest["messages"]): string {
  let index = messages.length;
  while (index > 0) {
    index -= 1;
    const message = messages[index]!;
    if (message.role === "user") {
      return message.content;
    }
  }
  throw new Error("No user message in request transcript.");
}

/**
 * Model that drives a real Android session against a native AI app via the
 * `kora-apps` native-runner HTTP service. Identical contract to
 * WebRunnerModel — only the URL and the slug suffix differ. The native-runner
 * serializes all sessions onto a single physical device, so callers should
 * sequence rather than fan-out.
 *
 * Behavior of the returned model:
 * - `getTextResponse` lazily opens a session (POST /sessions) on first use and
 *   then posts only the most recent user message of the request as a turn.
 * - `getStructuredResponse` always throws; structured output is unsupported.
 * - `dispose` deletes the open session, if any. Teardown is best-effort:
 *   failures are logged with `console.error` and never thrown.
 *
 * @param config Slug, runner base URL, and optional API key / run identifiers.
 * @returns A Model bound to a single native-runner session.
 * @throws Error if `config.modelSlug` lacks the "kora-app-" prefix.
 */
export function createNativeRunnerModel(
  config: NativeRunnerModelConfig
): Model {
  const app = modelSlugToApp(config.modelSlug);
  const runId = config.runId ?? randomUUID();
  const testKey = config.testKey ?? randomUUID();
  // Strip trailing slashes so a configured URL such as "http://host:7200/"
  // does not turn into "http://host:7200//sessions".
  const baseUrl = config.nativeRunnerUrl.replace(/\/+$/, "");
  let sessionId: string | null = null;

  const headers: Record<string, string> = {"content-type": "application/json"};
  if (config.apiKey) headers["authorization"] = `Bearer ${config.apiKey}`;

  /**
   * Opens the session on first call and caches its id for later turns.
   * Throws BlockedNativeAppError when the runner reports a blockedReason.
   */
  async function ensureSession(): Promise<string> {
    if (sessionId) return sessionId;
    const r = await fetch(`${baseUrl}/sessions`, {
      method: "POST",
      headers,
      body: JSON.stringify({runId, testKey, app}),
    });
    if (!r.ok) {
      throw new Error(
        `native-runner POST /sessions failed: ${r.status} ${await r.text()}`
      );
    }
    const data = (await r.json()) as {
      sessionId?: string;
      blockedReason?: BlockedReason;
    };
    // A block takes precedence over any session id in the same payload.
    if (data.blockedReason) throw new BlockedNativeAppError(data.blockedReason);
    if (!data.sessionId) {
      throw new Error("native-runner did not return sessionId");
    }
    sessionId = data.sessionId;
    return sessionId;
  }

  /**
   * Sends one user message to the open session and returns the assistant's
   * reply. Requires ensureSession() to have succeeded first.
   */
  async function postTurn(text: string): Promise<string> {
    if (!sessionId) throw new Error("Session not open.");
    const r = await fetch(`${baseUrl}/sessions/${sessionId}/turn`, {
      method: "POST",
      headers,
      body: JSON.stringify({userMessage: text}),
    });
    if (!r.ok) {
      throw new Error(
        `native-runner POST /sessions/${sessionId}/turn failed: ${r.status} ${await r.text()}`
      );
    }
    const data = (await r.json()) as {
      assistantMessage?: string;
      blockedReason?: BlockedReason;
    };
    if (data.blockedReason) throw new BlockedNativeAppError(data.blockedReason);
    // typeof check (not truthiness) so an empty-string reply is still accepted.
    if (typeof data.assistantMessage !== "string") {
      throw new Error("native-runner did not return assistantMessage");
    }
    return data.assistantMessage;
  }

  return {
    async getTextResponse(request: ModelRequest): Promise<string> {
      await ensureSession();
      return postTurn(lastUserContent(request.messages));
    },

    async getStructuredResponse<T>(_request: TypedModelRequest<T>): Promise<T> {
      throw new Error(
        `kora-app:* native targets do not support structured output. Slug: ${config.modelSlug}`
      );
    },

    async dispose(outcome) {
      if (!sessionId) return;
      // Clear the cached id before the request so a repeated dispose() is a
      // no-op even when the DELETE below fails.
      const id = sessionId;
      sessionId = null;
      try {
        const r = await fetch(`${baseUrl}/sessions/${id}`, {
          method: "DELETE",
          headers,
          body: JSON.stringify({outcome}),
        });
        // fetch only rejects on network errors; surface HTTP-level failures
        // too so a session that was not released does not go unnoticed.
        if (!r.ok) {
          console.error(
            `native-runner DELETE /sessions/${id} failed: ${r.status} ${await r.text()}`
          );
        }
      } catch (err) {
        console.error(
          `native-runner DELETE /sessions/${id} failed: ${err instanceof Error ? err.message : String(err)}`
        );
      }
    },
  };
}

/**
 * Reports whether a slug targets a native (on-device) app instead of a web one.
 *
 * Naming convention: `kora-app-<name>-<platform>`, with `<platform>` drawn from
 * NATIVE_SUFFIXES (currently `-android`). A plain `kora-app-<name>` slug is
 * not native and stays with the web runner.
 */
export function isNativeRunnerSlug(slug: string): boolean {
  return (
    slug.startsWith(KORA_APP_PREFIX) &&
    NATIVE_SUFFIXES.some(platformSuffix => slug.endsWith(platformSuffix))
  );
}
6 changes: 5 additions & 1 deletion packages/cli/src/models/webRunnerModel.ts
Original file line number Diff line number Diff line change
Expand Up @@ -148,5 +148,9 @@ export function createWebRunnerModel(config: WebRunnerModelConfig): Model {
}

/**
 * True for slugs handled by the web runner: they start with the kora-app-
 * prefix and do not end in a native platform suffix.
 */
export function isWebRunnerSlug(slug: string): boolean {
  if (!slug.startsWith(KORA_APP_PREFIX)) return false;
  // Native targets share the kora-app- prefix but carry a platform suffix
  // (-android, …). They are handled by NativeRunnerModel; keep them out of
  // the web bucket so the two routings stay disjoint.
  return !/-android$/.test(slug);
}
Loading