web-infra-dev · zhoushaw · Jan 26, 2025 · Jan 22, 2025 · Jan 22, 2025 · Jan 23, 2025
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -14,6 +14,7 @@
     "iconfont",
     "qwen",
     "taobao",
+    "targetcreated",
     "Volcengine"
   ]
 }
diff --git a/apps/site/docs/en/API.md b/apps/site/docs/en/API.md
@@ -1,10 +1,29 @@
 # API
 
-These are the main APIs of all kinds of agents in Midscene.
+## Constructor
+
+There are multiple agents in Midscene, each with its own constructor.
+
+* In Puppeteer, use [PuppeteerAgent](./integrate-with-puppeteer)
+* In Bridge mode, use [AgentOverChromeBridge](./bridge-mode-by-chrome-extension#constructor)
+
+Here are the common options for all agents:
+
+* `generateReport: boolean`: If true, the agent will generate a report file. Default is true.
+* `autoPrintReportMsg: boolean`: If true, the agent will print the report message. Default is true.
+* `cacheId: string | undefined`: If set, the agent will use this cacheId to match the cache. Default is undefined.
+
+In addition, the Puppeteer agent has an extra option:
+
+* `trackingActiveTab: boolean`: If true, the agent will track the newly opened tab. Default is false.
+
+## Methods
+
+These are the main methods on all kinds of agents in Midscene.
 
 > In the following documentation, you may see functions called with the `mid.` prefix. If you use destructuring in Playwright, like `async ({ ai, aiQuery }) => { /* ... */}`, you can call the functions without this prefix. It's just a matter of syntax.
 
-## `.aiAction(steps: string)` or `.ai(steps: string)` - Interact with the page
+### `.aiAction(steps: string)` or `.ai(steps: string)` - Interact with the page
 
 You can use `.aiAction` to perform a series of actions. It accepts a `steps: string` as a parameter, which describes the actions. In the prompt, you should clearly describe the steps. Midscene will take care of the rest.
 
@@ -36,7 +55,7 @@ Related Docs:
 * [FAQ: Can Midscene smartly plan the actions according to my one-line goal? Like executing "Tweet 'hello world'](./faq)
 * [Prompting Tips](./prompting-tips)
 
-## `.aiQuery(dataDemand: any)` - extract any data from page
+### `.aiQuery(dataDemand: any)` - extract any data from page
 
 You can extract customized data from the UI. Provided that the multi-modal AI can perform inference, it can return both data directly written on the page and any data based on "understanding". The return value is in JSON format, so it should be valid primitive types, like String, Number, JSON, Array, etc. Just describe it in the `dataDemand`.
 
@@ -61,7 +80,7 @@ const dataB = await mid.aiQuery('string[], task names in the list');
 const dataC = await mid.aiQuery('{name: string, age: string}[], Data Record in the table');
 ```
 
-## `.aiAssert(assertion: string, errorMsg?: string)` - do an assertion
+### `.aiAssert(assertion: string, errorMsg?: string)` - do an assertion
 
 `.aiAssert` works just like the normal `assert` method, except that the condition is a prompt string written in natural language. Midscene will call AI to determine if the `assertion` is true. If the condition is not met, an error will be thrown containing `errorMsg` and a detailed reason generated by AI.
 
@@ -84,7 +103,7 @@ expect(onesieItem.price).toBe(7.99);
 ```
 :::
 
-## `.aiWaitFor(assertion: string, {timeoutMs?: number, checkIntervalMs?: number })` - wait until the assertion is met
+### `.aiWaitFor(assertion: string, {timeoutMs?: number, checkIntervalMs?: number })` - wait until the assertion is met
 
 `.aiWaitFor` will help you check if your assertion has been met or a timeout error occurred. Considering the AI service cost, the check interval will not exceed `checkIntervalMs` milliseconds. The default config sets `timeoutMs` to 15 seconds and `checkIntervalMs` to 3 seconds: i.e. check at most 5 times if all assertions fail and the AI service always responds immediately.
 
@@ -94,7 +113,29 @@ When considering the time required for the AI service, `.aiWaitFor` may not be v
 await mid.aiWaitFor("there is at least one headphone item on page");
 ```
 
-## Debug Config (Optional)
+## Properties
+
+### `.reportFile`
+
+The report file path of the agent.
+
+## More
+
+### Setting environment variables during runtime
+
+You can set environment variables during runtime by calling the `overrideAIConfig` method.
+
+```typescript
+import { overrideAIConfig } from '@midscene/web/puppeteer'; // or other agent
+
+overrideAIConfig({
+  OPENAI_BASE_URL: "...",
+  OPENAI_API_KEY: "...",
+  MIDSCENE_MODEL_NAME: "..."
+});
+
+// ...
+```
 
 ### AI profiling
 

diff --git a/apps/site/docs/en/automate-with-scripts-in-yaml.mdx b/apps/site/docs/en/automate-with-scripts-in-yaml.mdx
@@ -40,7 +40,7 @@ or you can use a `.env` file to store the configuration, Midscene command line t
 OPENAI_API_KEY="sk-abcdefghijklmnopqrstuvwxyz"
 ```
 
-For more details about model and provider, see [customize model and provider](./model-provider)
+For more details about model and provider, see [config model and provider](./model-provider)
 
 ## Start
 
@@ -148,6 +148,9 @@ target:
   # string, the path to save the aiQuery result, optional
   output: <path-to-output-file>
 
+  # boolean, whether to track the newly opened tab, optional, default is true in yaml script
+  trackingActiveTab: <boolean>
+
   # string, the bridge mode to use, optional, default is false, can be 'newTabWithUrl' or 'currentTab'. More details see the following section
   bridgeMode: false | 'newTabWithUrl' | 'currentTab'
 ```

diff --git a/apps/site/docs/en/bridge-mode-by-chrome-extension.mdx b/apps/site/docs/en/bridge-mode-by-chrome-extension.mdx
@@ -14,7 +14,7 @@ you can check the demo project of bridge mode here: [https://github.com/web-infr
 
 ## Preparation
 
-1. Config the OpenAI API key, or [customize model and provider](./model-provider)
+1. Config the OpenAI API key, or [config model and provider](./model-provider)
 
 ```bash
 # replace with your own
@@ -81,7 +81,7 @@ Each of the agent instance can only connect to one tab instance, and it cannot b
 
 ### `connectCurrentTab(options?: { trackingActiveTab?: boolean })`
 
-Connect to the current active tab on Chrome.
+Connect to the current active tab on Chrome as the starting point.
 
 If `trackingActiveTab` is true, the agent will always track the active tab. For example, if you switch to another tab or a new tab is opened, the agent will track the latest active tab. Otherwise, the agent will only track the tab you connected to initially.
 

diff --git a/apps/site/docs/en/integrate-with-playwright.mdx b/apps/site/docs/en/integrate-with-playwright.mdx
@@ -12,7 +12,7 @@ you can check the demo project of Playwright here: [https://github.com/web-infra
 
 ## Preparation
 
-Config the OpenAI API key, or [customize model and provider](./model-provider)
+Config the OpenAI API key, or [config model and provider](./model-provider)
 
 ```bash
 # replace with your own

diff --git a/apps/site/docs/en/integrate-with-puppeteer.mdx b/apps/site/docs/en/integrate-with-puppeteer.mdx
@@ -13,7 +13,7 @@ There is also a demo of Puppeteer with Vitest: [https://github.com/web-infra-dev
 
 ## Preparation
 
-Config the OpenAI API key, or [customize model and provider](./model-provider)
+Config the OpenAI API key, or [config model and provider](./model-provider)
 
 ```bash
 # replace with your own
@@ -98,6 +98,18 @@ For the agent's more APIs, please refer to [API](./API).
 
 After the above command executes successfully, the console will output: `Midscene - report file updated: /path/to/report/some_id.html`. You can open this file in a browser to view the report.
 
+## More options in PuppeteerAgent constructor
+
+### Tracking a newly opened tab
+
+If you want to track a newly opened tab (for example, one opened by clicking a link with `target="_blank"`), you can set the `trackingActiveTab` option to `true`:
+
+```typescript
+const mid = new PuppeteerAgent(page, {
+  trackingActiveTab: true,
+});
+```
+
 ## More
 
-You may also be interested in [Prompting Tips](./prompting-tips)
+You may also be interested in [Prompting Tips](./prompting-tips)
diff --git a/apps/site/docs/en/quick-experience.mdx b/apps/site/docs/en/quick-experience.mdx
@@ -20,7 +20,7 @@ Start the extension (may be folded by Chrome extension icon), setup the config b
 OPENAI_API_KEY="sk-replace-by-your-own"
 ```
 
-You can also paste the configuration as described in [customize model and provider](./model-provider) here.
+You can also paste the configuration as described in [config model and provider](./model-provider) here.
 
 ## Start experiencing
 

diff --git a/apps/site/docs/zh/API.md b/apps/site/docs/zh/API.md
@@ -1,10 +1,29 @@
 # API 参考
 
+## 构造器
+
+Midscene 中每个 Agent 都有自己的构造函数。
+
+* 在 Puppeteer 中，使用 [PuppeteerAgent](./integrate-with-puppeteer)
+* 在桥接模式（Bridge Mode）中，使用 [AgentOverChromeBridge](./bridge-mode-by-chrome-extension#constructor)
+
+这些 Agent 有一些相同的构造参数：
+
+* `generateReport: boolean`: 如果为 true，则生成报告文件。默认值为 true。
+* `autoPrintReportMsg: boolean`: 如果为 true，则打印报告消息。默认值为 true。
+* `cacheId: string | undefined`: 如果配置，则使用此 cacheId 匹配缓存。默认值为 undefined。
+
+在 puppeteer 中，还有一个额外的参数：
+
+* `trackingActiveTab: boolean`: 如果为 true，则跟踪新打开的标签页。默认值为 false。
+
+## 方法
+
 这些是 Midscene 中各类 Agent 的主要 API。
 
 > 在以下文档中，你可能会看到带有 `mid.` 前缀的函数调用。如果你在 Playwright 中使用了解构赋值（object destructuring），如 `async ({ ai, aiQuery }) => { /* ... */}`，你可以不带这个前缀进行调用。这只是语法的区别。
 
-## `.aiAction(steps: string)` 或 `.ai(steps: string)` - 控制界面
+### `.aiAction(steps: string)` 或 `.ai(steps: string)` - 控制界面
 
 你可以使用 `.aiAction` 来执行一系列操作。它接受一个参数 `steps: string` 用于描述这些操作。在这个参数中，你应该清楚地描述每一个步骤，然后 Midscene 会自动为你分析并执行。
 
@@ -36,7 +55,7 @@ await mid.ai('点击任务列表下方的 "completed" 状态按钮');
 * [FAQ: Midscene 能否根据一句话指令实现智能操作？比如执行 "发一条微博"'](./faq)
 * [编写提示词的技巧](./prompting-tips)
 
-## `.aiQuery(dataShape: any)` - 从页面提取数据
+### `.aiQuery(dataDemand: any)` - 从页面提取数据
 
 这个方法可以从 UI 提取自定义数据。它不仅能返回页面上直接书写的数据，还能基于“理解”返回数据（前提是多模态 AI 能够推理）。返回值可以是任何合法的基本类型，比如字符串、数字、JSON、数组等。你只需在 `dataDemand` 中描述它，Midscene 就会给你满足格式的返回。
 
@@ -59,7 +78,7 @@ const dataB = await mid.aiQuery('string[]，列表中的任务名称');
 const dataC = await mid.aiQuery('{name: string, age: string}[], 表格中的数据记录');
 ```
 
-## `.aiAssert(assertion: string, errorMsg?: string)` - 进行断言
+### `.aiAssert(assertion: string, errorMsg?: string)` - 进行断言
 
 `.aiAssert` 的功能类似于一般的断言（assert）方法，但可以用自然语言编写条件参数 `assertion`。Midscene 会调用 AI 来判断条件是否为真。若条件不满足，SDK 会抛出一个错误并在 `errorMsg` 后附上 AI 生成的错误原因。
 
@@ -82,7 +101,7 @@ expect(onesieItem.price).toBe(7.99);
 ```
 :::
 
-## `.aiWaitFor(assertion: string, {timeoutMs?: number, checkIntervalMs?: number })` - 等待断言执行成功
+### `.aiWaitFor(assertion: string, {timeoutMs?: number, checkIntervalMs?: number })` - 等待断言执行成功
 
 `.aiWaitFor` 帮助你检查你的断言是否满足，或是是否发生了超时错误。考虑到 AI 服务的成本，检查间隔不会超过 `checkIntervalMs` 毫秒。默认配置将 `timeoutMs` 设为 15 秒，`checkIntervalMs` 设为 3 秒：也就是说，如果所有断言都失败，并且 AI 服务总是立即响应，则最多检查 5 次。
 
@@ -92,8 +111,27 @@ expect(onesieItem.price).toBe(7.99);
 await mid.aiWaitFor("界面上至少有一个耳机的信息");
 ```
 
+## 属性
+
+### `.reportFile`
 
-## 调试配置（可选）
+报告文件的路径。
+
+## 更多配置
+
+### 在运行时设置环境变量
+
+你可以通过 `overrideAIConfig` 方法在运行时设置环境变量。
+
+```typescript
+import { overrideAIConfig } from '@midscene/web/puppeteer'; // 或其他的 Agent
+
+overrideAIConfig({
+  OPENAI_BASE_URL: "...",
+  OPENAI_API_KEY: "...",
+  MIDSCENE_MODEL_NAME: "..."
+});
+```
 
 ### 打印 AI 性能信息
 

diff --git a/apps/site/docs/zh/automate-with-scripts-in-yaml.mdx b/apps/site/docs/zh/automate-with-scripts-in-yaml.mdx
@@ -148,6 +148,9 @@ target:
   # 输出 aiQuery 结果的 JSON 文件路径，可选
   output: <path-to-output-file>
 
+  # 是否跟踪新打开的标签页，可选，默认 true
+  trackingActiveTab: <boolean>
+
   # 桥接模式，可选，默认 false，可以为 'newTabWithUrl' 或 'currentTab'。更多详情请参阅后文
   bridgeMode: false | 'newTabWithUrl' | 'currentTab'
 ```

diff --git a/apps/site/docs/zh/integrate-with-puppeteer.mdx b/apps/site/docs/zh/integrate-with-puppeteer.mdx
@@ -98,6 +98,16 @@ npx tsx demo.ts
 
 当上面的命令执行成功后，会在控制台输出：`Midscene - report file updated: /path/to/report/some_id.html`， 通过浏览器打开该文件即可看到报告。
 
+## 如何跟踪新打开的标签页
+
+如果你想要跟踪新打开的标签页（比如点击一个带有 `target="_blank"` 属性的链接），你可以设置 `trackingActiveTab` 选项为 `true`：
+
+```typescript
+const mid = new PuppeteerAgent(page, {
+  trackingActiveTab: true,
+});
+```
+
 ## 更多
 
 你可能还想了解 [提示词技巧](./prompting-tips)

diff --git a/packages/midscene/src/yaml.d.ts b/packages/midscene/src/yaml.d.ts
@@ -24,6 +24,7 @@ export interface MidsceneYamlScriptEnv {
   };
   cookie?: string;
   output?: string;
+  trackingActiveTab?: boolean; // whether to track the newly opened tab; defaults to true in yaml scripts
 
   // bridge mode config
   bridgeMode?: false | 'newTabWithUrl' | 'currentTab';

diff --git a/packages/web-integration/src/bridge-mode/agent-cli-side.ts b/packages/web-integration/src/bridge-mode/agent-cli-side.ts
@@ -1,5 +1,5 @@
 import assert from 'node:assert';
-import { PageAgent } from '@/common/agent';
+import { PageAgent, PageAgentOpt } from '@/common/agent';
 import type { KeyboardAction, MouseAction } from '@/page';
 import {
   type BridgeConnectTabOptions,
@@ -93,13 +93,16 @@ export const getBridgePageInCliSide = (): ChromeExtensionPageCliSide => {
 };
 
 export class AgentOverChromeBridge extends PageAgent<ChromeExtensionPageCliSide> {
-  constructor() {
+  constructor(opts?: PageAgentOpt) {
     const page = getBridgePageInCliSide();
-    super(page, {
-      onTaskStartTip: (tip: string) => {
-        this.page.showStatusMessage(tip);
-      },
-    });
+    super(
+      page,
+      Object.assign(opts || {}, {
+        onTaskStartTip: (tip: string) => {
+          this.page.showStatusMessage(tip);
+        },
+      }),
+    );
   }
 
   async connectNewTabWithUrl(url: string, options?: BridgeConnectTabOptions) {

diff --git a/packages/web-integration/src/bridge-mode/index.ts b/packages/web-integration/src/bridge-mode/index.ts
@@ -1,3 +1,5 @@
 import { AgentOverChromeBridge } from './agent-cli-side';
 
 export { AgentOverChromeBridge };
+
+export { overrideAIConfig } from '@midscene/core/env';
diff --git a/packages/web-integration/src/common/agent.ts b/packages/web-integration/src/common/agent.ts
@@ -26,6 +26,7 @@ import { printReportMsg, reportFileName } from './utils';
 import { type WebUIContext, parseContextFromWebPage } from './utils';
 
 export interface PageAgentOpt {
+  trackingActiveTab?: boolean /* whether to track the newly created tab, default false */;
   testId?: string;
   cacheId?: string;
   groupName?: string;
@@ -69,6 +70,8 @@ export class PageAgent<PageType extends WebPage = WebPage> {
       },
       opts || {},
     );
+    // get the parent browser of the puppeteer page
+    // const browser = (this.page as PuppeteerWebPage).browser();
 
     this.insight = new Insight<WebElementInfo, WebUIContext>(
       async (action: InsightAction) => {