From 325264d5e863c9802e060c3a8b817d883e05d9d4 Mon Sep 17 00:00:00 2001 From: yutao Date: Mon, 23 Dec 2024 17:43:14 +0800 Subject: [PATCH 1/4] feat: allow adding custom description in dom --- .../docs/en/describe-elements-by-your-own.mdx | 37 +++++++ .../docs/zh/describe-elements-by-your-own.mdx | 37 +++++++ apps/site/rspress.config.ts | 8 ++ .../web-integration/src/extractor/dom-util.ts | 11 ++ .../src/extractor/web-extractor.ts | 95 +++++++++++++++++- .../__snapshots__/web-extractor.test.ts.snap | 38 ++++--- .../fixtures/web-extractor/index.html | 72 +++++++++---- .../fixtures/web-extractor/input.png | Bin 192868 -> 261309 bytes .../fixtures/web-extractor/output.png | Bin 213905 -> 284938 bytes .../fixtures/web-extractor/scroll/input.png | Bin 28288 -> 37034 bytes .../fixtures/web-extractor/scroll/output.png | Bin 31970 -> 40796 bytes 11 files changed, 266 insertions(+), 32 deletions(-) create mode 100644 apps/site/docs/en/describe-elements-by-your-own.mdx create mode 100644 apps/site/docs/zh/describe-elements-by-your-own.mdx diff --git a/apps/site/docs/en/describe-elements-by-your-own.mdx b/apps/site/docs/en/describe-elements-by-your-own.mdx new file mode 100644 index 000000000..2f2a7d0b1 --- /dev/null +++ b/apps/site/docs/en/describe-elements-by-your-own.mdx @@ -0,0 +1,37 @@ +# Describe Elements by Your Own + +In some cases, you may want to describe elements by yourself. For example, you may want to describe the content inside a `<canvas>` element which cannot be extracted by Midscene. Or you just want to "be friendly" to the LLM. + +To do this, you can put an identifier in the `midscene-description-ref` attribute, and put the description JSON in a `<script type="application/json">` tag whose `midscene-description-id` attribute holds the same identifier. The JSON must be an array with one item per piece of content. + +```html +<canvas midscene-description-ref="my-canvas"></canvas> +<script type="application/json" midscene-description-id="my-canvas"> +[ + { + "rect": { "x": 20, "y": 20, "width": 120, "height": 40 }, + "text": "Submit", + "description": "the submit button drawn on the canvas" + } +] +</script> +``` + +The core fields are `rect`, `text`, and `description`. + +- `rect`: The rectangle of this content. Required. All the coordinates are relative to the upper left corner of the element. +- `text`: The text of this content. Optional. Should be the same as the text on screen. 
+- `description`: A short description of this content. Optional. diff --git a/apps/site/docs/zh/describe-elements-by-your-own.mdx b/apps/site/docs/zh/describe-elements-by-your-own.mdx new file mode 100644 index 000000000..e2b781dc3 --- /dev/null +++ b/apps/site/docs/zh/describe-elements-by-your-own.mdx @@ -0,0 +1,37 @@ +# 自定义页面元素的描述信息 + +在某些情况下,你可能需要自定义页面元素的描述。例如,你可能需要描述页面中无法被 Midscene 提取的 `<canvas>` 元素的内容。或者你只是单纯想对 LLM 示好。 + +为此,你可以在元素上添加 `midscene-description-ref` 属性,然后在 `<script type="application/json">` 标签上添加取值相同的 `midscene-description-id` 属性,并将描述信息的 JSON 数组写在该标签内。 + +```html +<canvas midscene-description-ref="my-canvas"></canvas> +<script type="application/json" midscene-description-id="my-canvas"> +[ + { + "rect": { "x": 20, "y": 20, "width": 120, "height": 40 }, + "text": "提交", + "description": "画在 canvas 上的提交按钮" + } +] +</script> +``` + +这里的关键字段是 `rect`、`text` 和 `description`。 + +- `rect`: 该内容的坐标区域。必选。所有坐标都是相对于此元素的左上角。 +- `text`: 该内容的文本内容。可选。应与屏幕上的文本一致。 +- `description`: 该内容的简短描述。可选。 diff --git a/apps/site/rspress.config.ts b/apps/site/rspress.config.ts index 7d31150fd..7ef3070f6 100644 --- a/apps/site/rspress.config.ts +++ b/apps/site/rspress.config.ts @@ -84,6 +84,10 @@ export default defineConfig({ text: 'Customize Model and Provider', link: '/model-provider', }, + { + text: 'Describe Elements by Your Own', + link: '/describe-elements-by-your-own', + }, ], }, { @@ -141,6 +145,10 @@ export default defineConfig({ text: '自定义模型和服务商', link: '/zh/model-provider', }, + { + text: '自定义页面元素的描述信息', + link: '/zh/describe-elements-by-your-own', + }, ], }, { diff --git a/packages/web-integration/src/extractor/dom-util.ts b/packages/web-integration/src/extractor/dom-util.ts index 42b08e54b..c26db5539 100644 --- a/packages/web-integration/src/extractor/dom-util.ts +++ b/packages/web-integration/src/extractor/dom-util.ts @@ -1,3 +1,14 @@ +export const USER_DESCRIBED_ELEMENT_ATTRIBUTE_REF = 'midscene-description-ref'; +export const USER_DESCRIBED_ELEMENT_ATTRIBUTE_ID = 'midscene-description-id'; + +export function isUserDescribedElement(node: Node): boolean { + if (node instanceof Element) { + return node.hasAttribute(USER_DESCRIBED_ELEMENT_ATTRIBUTE_REF); + } + + return false; +} + export function isFormElement(node: Node) { return ( node instanceof HTMLElement && diff --git a/packages/web-integration/src/extractor/web-extractor.ts 
b/packages/web-integration/src/extractor/web-extractor.ts index 70f7fffa2..ec2279bc9 100644 --- a/packages/web-integration/src/extractor/web-extractor.ts +++ b/packages/web-integration/src/extractor/web-extractor.ts @@ -5,11 +5,14 @@ import { } from '@midscene/shared/constants'; import type { ElementInfo } from '.'; import { + USER_DESCRIBED_ELEMENT_ATTRIBUTE_ID, + USER_DESCRIBED_ELEMENT_ATTRIBUTE_REF, isButtonElement, isContainerElement, isFormElement, isImgElement, isTextElement, + isUserDescribedElement, } from './dom-util'; import { getDocument, @@ -40,7 +43,7 @@ function tagNameOfNode(node: Node): string { } const parentElement = node.parentElement; - if (parentElement && parentElement instanceof HTMLElement) { + if (!tagName && parentElement && parentElement instanceof HTMLElement) { tagName = parentElement.tagName.toLowerCase(); } @@ -51,7 +54,7 @@ function collectElementInfo( node: Node, nodePath: string, baseZoom = 1, -): WebElementInfo | null { +): WebElementInfo | WebElementInfo[] | null { const rect = visibleRect(node, baseZoom); if ( !rect || @@ -60,6 +63,88 @@ function collectElementInfo( ) { return null; } + + if (isUserDescribedElement(node)) { + console.log('isUserDescribedElement', node); + const element = node as Element; + const descriptionId = element.getAttribute( + USER_DESCRIBED_ELEMENT_ATTRIBUTE_REF, + ); + const targetSelector = `[${USER_DESCRIBED_ELEMENT_ATTRIBUTE_ID}="${descriptionId}"]`; + const targetElement = document.querySelectorAll(targetSelector); + if (targetElement.length === 0) { + console.warn( + 'cannot find element for Midscene description', + targetSelector, + ); + return null; + } + + if (targetElement.length > 1) { + console.warn( + 'multiple elements found for Midscene description', + targetSelector, + ); + } + + const descriptionElement = targetElement[0] as HTMLElement; + const description = descriptionElement.innerText; + try { + const descriptionJson = JSON.parse(description); + if (!Array.isArray(descriptionJson)) { 
+ console.warn('description is not a valid JSON', description); + return null; + } + + const infoList = descriptionJson.map((item, index) => { + const overallRect = { + left: item.rect.x + rect.left, + top: item.rect.y + rect.top, + width: item.rect.width, + height: item.rect.height, + }; + const center: [number, number] = [ + Math.round(overallRect.left + overallRect.width / 2), + Math.round(overallRect.top + overallRect.height / 2), + ]; + const nodeType = item.text ? NodeType.TEXT : NodeType.CONTAINER; + const nodeHashId = midsceneGenerateHash( + null, + `${index}-${item.text || ''}`, + overallRect, + ); + return { + id: nodeHashId, + indexId: indexId++, + zoom: 1, + locator: '', + nodeType, + nodePath, + nodeHashId, + attributes: { + ...(item.description ? { description: item.description } : {}), + nodeType, + }, + content: item.text || '', + rect: overallRect, + center, + screenWidth: window.innerWidth, + screenHeight: window.innerHeight, + }; + }); + + return infoList; + } catch (e) { + console.error(e); + console.warn( + 'description is not a valid JSON', + targetSelector, + description, + ); + return null; + } + } + if (isFormElement(node)) { const attributes = getNodeAttributes(node); let valueContent = @@ -266,6 +351,12 @@ export function extractTextWithPosition( } const elementInfo = collectElementInfo(node, nodePath, baseZoom); + + if (Array.isArray(elementInfo)) { + elementInfoArray.push(...elementInfo); + return null; + } + // stop collecting if the node is a Button or Image if ( elementInfo?.nodeType === NodeType.BUTTON || diff --git a/packages/web-integration/tests/unit-test/__snapshots__/web-extractor.test.ts.snap b/packages/web-integration/tests/unit-test/__snapshots__/web-extractor.test.ts.snap index d373445fc..f7f35be93 100644 --- a/packages/web-integration/tests/unit-test/__snapshots__/web-extractor.test.ts.snap +++ b/packages/web-integration/tests/unit-test/__snapshots__/web-extractor.test.ts.snap @@ -268,7 +268,7 @@ exports[`extractor > basic 
1`] = ` }, { "attributes": { - "htmlTagName": "
", + "htmlTagName": "