// Routing contract between the two runner predicates: a slug is sent to the
// web runner, the native runner, or neither — never both.
describe("runner slug routing", () => {
  it("routes plain kora-app-* to web", () => {
    expect(isWebRunnerSlug("kora-app-chatgpt")).toBe(true);
    expect(isNativeRunnerSlug("kora-app-chatgpt")).toBe(false);
  });

  it("routes -android suffix to native", () => {
    expect(isNativeRunnerSlug("kora-app-tiktok-android")).toBe(true);
    expect(isWebRunnerSlug("kora-app-tiktok-android")).toBe(false);
  });

  it("ignores non-kora slugs entirely", () => {
    expect(isWebRunnerSlug("custom-something")).toBe(false);
    expect(isNativeRunnerSlug("custom-something")).toBe(false);
    expect(isWebRunnerSlug("anthropic/claude-opus-4-7")).toBe(false);
    expect(isNativeRunnerSlug("anthropic/claude-opus-4-7")).toBe(false);
  });

  it("treats -android as native only when it is the final segment", () => {
    // "-android" in the middle of the app name must not flip the routing.
    expect(isNativeRunnerSlug("kora-app-android-auto")).toBe(false);
    expect(isWebRunnerSlug("kora-app-android-auto")).toBe(true);
  });

  it("does not let a non-kora slug become native via the suffix alone", () => {
    expect(isNativeRunnerSlug("custom-android")).toBe(false);
    expect(isWebRunnerSlug("custom-android")).toBe(false);
  });

  it("routes every kora-app-* slug to exactly one runner", () => {
    // The two predicates live in different modules; this pins the invariant
    // that they stay disjoint and jointly cover the kora-app- namespace.
    const slugs = [
      "kora-app-chatgpt",
      "kora-app-tiktok-android",
      "kora-app-android-auto",
      "kora-app-",
    ];
    for (const slug of slugs) {
      expect(isWebRunnerSlug(slug) !== isNativeRunnerSlug(slug)).toBe(true);
    }
  });
});
isNativeRunnerSlug(targetModelSlug) ) { return undefined; } diff --git a/packages/cli/src/models/customModel.ts b/packages/cli/src/models/customModel.ts index 6c678a6..4dbc327 100644 --- a/packages/cli/src/models/customModel.ts +++ b/packages/cli/src/models/customModel.ts @@ -1,13 +1,24 @@ import {Scenario} from "@korabench/benchmark"; import {Model} from "./model.js"; +import { + createNativeRunnerModel, + isNativeRunnerSlug, +} from "./nativeRunnerModel.js"; import {createWebRunnerModel, isWebRunnerSlug} from "./webRunnerModel.js"; const DEFAULT_WEB_RUNNER_URL = "http://localhost:7100"; +const DEFAULT_NATIVE_RUNNER_URL = "http://localhost:7200"; export async function createCustomModel( modelSlug: string, _scenario: Scenario ): Promise { + if (isNativeRunnerSlug(modelSlug)) { + const nativeRunnerUrl = + process.env.NATIVE_RUNNER_URL ?? DEFAULT_NATIVE_RUNNER_URL; + const apiKey = process.env.NATIVE_RUNNER_API_KEY; + return createNativeRunnerModel({modelSlug, nativeRunnerUrl, apiKey}); + } if (isWebRunnerSlug(modelSlug)) { const webRunnerUrl = process.env.WEB_RUNNER_URL ?? 
/** Prefix shared by every slug that targets an app runner. */
const KORA_APP_PREFIX = "kora-app-";
/** Slug suffixes that select the native (on-device) runner. */
const NATIVE_SUFFIXES = ["-android"] as const;

/** Values the runner may put in `blockedReason` on a session or turn reply. */
export type BlockedReason =
  | "device_locked"
  | "device_busy"
  | "login_required"
  | "rate_limit"
  | "unknown_block";

/**
 * Thrown when the native runner answers a request with a `blockedReason`
 * instead of a session id / assistant message.
 */
export class BlockedNativeAppError extends Error {
  constructor(readonly reason: BlockedReason) {
    super(`Native app blocked: ${reason}`);
    this.name = "BlockedNativeAppError";
  }
}

/** Settings accepted by {@link createNativeRunnerModel}. */
interface NativeRunnerModelConfig {
  /** Slug starting with `kora-app-`; the remainder is sent as `app`. */
  modelSlug: string;
  /** Base URL of the native-runner service; a trailing slash is tolerated. */
  nativeRunnerUrl: string;
  /** When set, sent as `authorization: Bearer <apiKey>`. */
  apiKey?: string;
  /** Sent as `runId` when opening the session; random UUID when omitted. */
  runId?: string;
  /** Sent as `testKey` when opening the session; random UUID when omitted. */
  testKey?: string;
}

/**
 * Strips the `kora-app-` prefix from a slug. The platform suffix is kept, so
 * `kora-app-tiktok-android` yields `tiktok-android`.
 *
 * @throws Error when the slug does not start with the prefix.
 */
function modelSlugToApp(slug: string): string {
  if (!slug.startsWith(KORA_APP_PREFIX)) {
    throw new Error(
      `NativeRunnerModel expected a slug starting with "${KORA_APP_PREFIX}"; got "${slug}"`
    );
  }
  return slug.slice(KORA_APP_PREFIX.length);
}

/**
 * Returns the content of the most recent `user` message in the transcript.
 *
 * @throws Error when the transcript contains no user message.
 */
function lastUserContent(messages: ModelRequest["messages"]): string {
  for (let i = messages.length - 1; i >= 0; i--) {
    const message = messages[i];
    if (message?.role === "user") return message.content;
  }
  throw new Error("No user message in request transcript.");
}

/**
 * Model that drives a real Android session against a native AI app via the
 * `kora-apps` native-runner HTTP service. Identical contract to
 * WebRunnerModel — only the URL and the slug suffix differ. The native-runner
 * serializes all sessions onto a single physical device, so callers should
 * sequence rather than fan-out.
 *
 * The session is opened lazily on the first `getTextResponse` call
 * (`POST /sessions`), each call then posts only the latest user message
 * (`POST /sessions/:id/turn`), and `dispose` closes it (`DELETE /sessions/:id`).
 *
 * @param config Target slug, runner base URL and optional credentials/ids.
 * @returns A `Model` bound to one runner session.
 * @throws Error when `config.modelSlug` lacks the `kora-app-` prefix.
 */
export function createNativeRunnerModel(
  config: NativeRunnerModelConfig
): Model {
  const app = modelSlugToApp(config.modelSlug);
  const runId = config.runId ?? randomUUID();
  const testKey = config.testKey ?? randomUUID();
  // The URL usually comes from an env var; without this a trailing slash
  // would yield "//sessions".
  const baseUrl = config.nativeRunnerUrl.replace(/\/+$/, "");
  let sessionId: string | null = null;

  const headers: Record<string, string> = {"content-type": "application/json"};
  if (config.apiKey) headers["authorization"] = `Bearer ${config.apiKey}`;

  // The id is server-provided; encode it so it cannot alter the request path.
  const sessionUrl = (id: string): string =>
    `${baseUrl}/sessions/${encodeURIComponent(id)}`;

  /** Opens the runner session on first use and caches its id. */
  async function ensureSession(): Promise<string> {
    if (sessionId) return sessionId;
    const r = await fetch(`${baseUrl}/sessions`, {
      method: "POST",
      headers,
      body: JSON.stringify({runId, testKey, app}),
    });
    if (!r.ok) {
      throw new Error(
        `native-runner POST /sessions failed: ${r.status} ${await r.text()}`
      );
    }
    const data = (await r.json()) as {
      sessionId?: string;
      blockedReason?: BlockedReason;
    };
    if (data.blockedReason) throw new BlockedNativeAppError(data.blockedReason);
    if (!data.sessionId) {
      throw new Error("native-runner did not return sessionId");
    }
    sessionId = data.sessionId;
    return sessionId;
  }

  /** Sends one user message to the open session and returns the reply text. */
  async function postTurn(text: string): Promise<string> {
    if (!sessionId) throw new Error("Session not open.");
    const r = await fetch(`${sessionUrl(sessionId)}/turn`, {
      method: "POST",
      headers,
      body: JSON.stringify({userMessage: text}),
    });
    if (!r.ok) {
      throw new Error(
        `native-runner POST /sessions/${sessionId}/turn failed: ${r.status} ${await r.text()}`
      );
    }
    const data = (await r.json()) as {
      assistantMessage?: string;
      blockedReason?: BlockedReason;
    };
    if (data.blockedReason) throw new BlockedNativeAppError(data.blockedReason);
    if (typeof data.assistantMessage !== "string") {
      throw new Error("native-runner did not return assistantMessage");
    }
    return data.assistantMessage;
  }

  return {
    async getTextResponse(request: ModelRequest): Promise<string> {
      await ensureSession();
      return postTurn(lastUserContent(request.messages));
    },

    async getStructuredResponse<T>(
      _request: TypedModelRequest<T>
    ): Promise<T> {
      throw new Error(
        `kora-app:* native targets do not support structured output. Slug: ${config.modelSlug}`
      );
    },

    async dispose(outcome) {
      if (!sessionId) return;
      const id = sessionId;
      // Clear first so a repeated dispose() is a no-op even if DELETE fails.
      sessionId = null;
      // Teardown is best-effort: failures are logged, never thrown.
      try {
        const r = await fetch(sessionUrl(id), {
          method: "DELETE",
          headers,
          body: JSON.stringify({outcome}),
        });
        if (!r.ok) {
          // fetch() resolves on HTTP errors; surface them like the POST paths.
          console.error(
            `native-runner DELETE /sessions/${id} failed: ${r.status} ${await r.text()}`
          );
        }
      } catch (err) {
        console.error(
          `native-runner DELETE /sessions/${id} failed: ${err instanceof Error ? err.message : String(err)}`
        );
      }
    },
  };
}

/**
 * True for slugs that map to a native (on-device) target rather than a web
 * one. Convention: `kora-app-<name><suffix>` where `<suffix>` is one of
 * NATIVE_SUFFIXES (currently `-android`). Plain `kora-app-<name>` stays web.
 *
 * NOTE(review): prefix and suffix are tested independently on the whole slug,
 * so `kora-app-android` (the suffix reusing the prefix's trailing hyphen)
 * counts as native. isWebRunnerSlug in webRunnerModel.ts hard-codes the same
 * `-android` test as a regex; change both together when adding a suffix.
 */
export function isNativeRunnerSlug(slug: string): boolean {
  if (!slug.startsWith(KORA_APP_PREFIX)) return false;
  return NATIVE_SUFFIXES.some(suffix => slug.endsWith(suffix));
}