diff --git a/evals/env.ts b/evals/env.ts index 45f877bd1..4f49517e1 100644 --- a/evals/env.ts +++ b/evals/env.ts @@ -4,10 +4,11 @@ * * The environment is read from the EVAL_ENV environment variable. */ -export const env: "BROWSERBASE" | "LOCAL" = - process.env.EVAL_ENV?.toLowerCase() === "browserbase" - ? "BROWSERBASE" - : "LOCAL"; +// export const env: "BROWSERBASE" | "LOCAL" = +// process.env.EVAL_ENV?.toLowerCase() === "browserbase" +// ? "BROWSERBASE" +// : "LOCAL"; +export const env = "BROWSERBASE" as const; /** * Enable or disable caching based on the EVAL_ENABLE_CACHING environment variable. diff --git a/evals/index.eval.ts b/evals/index.eval.ts index 0b265b4d6..bc759f3e6 100644 --- a/evals/index.eval.ts +++ b/evals/index.eval.ts @@ -39,11 +39,11 @@ dotenv.config(); */ const MAX_CONCURRENCY = process.env.EVAL_MAX_CONCURRENCY ? parseInt(process.env.EVAL_MAX_CONCURRENCY, 10) - : 20; + : 5; const TRIAL_COUNT = process.env.EVAL_TRIAL_COUNT ? parseInt(process.env.EVAL_TRIAL_COUNT, 10) - : 5; + : 1; /** * generateSummary: diff --git a/evals/initStagehand.ts b/evals/initStagehand.ts index bb26c9b30..876f10bb8 100644 --- a/evals/initStagehand.ts +++ b/evals/initStagehand.ts @@ -80,6 +80,7 @@ export const initStagehand = async ({ logger.log(logLine); }, ...configOverrides, + useAPI: true, }; const stagehand = new Stagehand(config); diff --git a/evals/taskConfig.ts b/evals/taskConfig.ts index 0031b8a80..231424fe0 100644 --- a/evals/taskConfig.ts +++ b/evals/taskConfig.ts @@ -49,7 +49,7 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) { */ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS ? process.env.EVAL_MODELS.split(",") - : ["gpt-4o", "claude-3-5-sonnet-latest"]; + : ["gpt-4o"]; /** * getModelList: