diff --git a/apps/site/docs/en/API.mdx b/apps/site/docs/en/API.mdx index c51442daf..c0190218b 100644 --- a/apps/site/docs/en/API.mdx +++ b/apps/site/docs/en/API.mdx @@ -162,16 +162,16 @@ await agent.aiInput('Hello World', 'The search input box'); ### `agent.aiKeyboardPress()` -Press a keyboard key. +Press a keyboard key or key combination. * Type ```typescript -function aiKeyboardPress(key: string, locate?: string, options?: Object): Promise; +function aiKeyboardPress(key: string | string[], locate?: string, options?: Object): Promise; ``` * Parameters: - * `key: string` - The web key to press, e.g. 'Enter', 'Tab', 'Escape', etc. Key Combination is not supported. + * `key: string | string[]` - The web key(s) to press. Can be a single key like 'Enter', 'Tab', 'Escape', or an array of keys for combinations like ['Ctrl', 'Shift'] or ['Meta', 'a']. * `locate?: string` - Optional, a natural language description of the element to press the key on. * `options?: Object` - Optional, a configuration object containing: * `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element. 
@@ -183,7 +183,12 @@ function aiKeyboardPress(key: string, locate?: string, options?: Object): Promis * Examples: ```typescript +// Single key press await agent.aiKeyboardPress('Enter', 'The search input box'); + +// Key combinations +await agent.aiKeyboardPress(['Ctrl', 'a']); // Select all +await agent.aiKeyboardPress(['Ctrl', 'Shift', 'I']); // Open developer tools ``` ### `agent.aiScroll()` diff --git a/apps/site/docs/zh/API.mdx b/apps/site/docs/zh/API.mdx index 0889486af..dcb4ea5e1 100644 --- a/apps/site/docs/zh/API.mdx +++ b/apps/site/docs/zh/API.mdx @@ -159,16 +159,16 @@ await agent.aiInput('Hello World', '搜索框'); ### `agent.aiKeyboardPress()` -按下键盘上的某个键。 +按下键盘上的某个键或组合键。 * 类型 ```typescript -function aiKeyboardPress(key: string, locate?: string, options?: Object): Promise; +function aiKeyboardPress(key: string | string[], locate?: string, options?: Object): Promise; ``` * 参数: - * `key: string` - 要按下的键,如 `Enter`、`Tab`、`Escape` 等。不支持组合键。 + * `key: string | string[]` - 要按下的键,可以是单个键如 `Enter`、`Tab`、`Escape` 等,或者是组合键数组如 `['Ctrl', 'Shift']`。 * `locate?: string` - 用自然语言描述的元素定位。 * `options?: Object` - 可选,一个配置对象,包含: * `deepThink?: boolean` - 是否开启深度思考。如果为 true,Midscene 会调用 AI 模型两次以精确定位元素。 @@ -180,7 +180,12 @@ function aiKeyboardPress(key: string, locate?: string, options?: Object): Promis * 示例: ```typescript +// 单个按键 await agent.aiKeyboardPress('Enter', '搜索框'); + +// 组合键 +await agent.aiKeyboardPress(['Ctrl', 'a']); // 全选 +await agent.aiKeyboardPress(['Ctrl', 'Shift', 'I']); // 打开开发者工具 ``` ### `agent.aiScroll()` diff --git a/packages/core/src/ai-model/common.ts b/packages/core/src/ai-model/common.ts index 362d8a62c..184550a50 100644 --- a/packages/core/src/ai-model/common.ts +++ b/packages/core/src/ai-model/common.ts @@ -4,7 +4,8 @@ import type { ElementTreeNode, MidsceneYamlFlowItem, PlanningAction, - PlanningActionParamInputOrKeyPress, + PlanningActionParamInput, + PlanningActionParamKeyPress, PlanningActionParamScroll, PlanningActionParamSleep, Rect, @@ -348,13 +349,13 
@@ export function buildYamlFlowFromPlans( aiHover: locate!, }); } else if (type === 'Input') { - const param = plan.param as PlanningActionParamInputOrKeyPress; + const param = plan.param as PlanningActionParamInput; flow.push({ aiInput: param.value, locate, }); } else if (type === 'KeyboardPress') { - const param = plan.param as PlanningActionParamInputOrKeyPress; + const param = plan.param as PlanningActionParamKeyPress; flow.push({ aiKeyboardPress: param.value, locate, diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 2ea8043a1..cb6ef5705 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -306,10 +306,14 @@ export interface PlanningAIResponse { export type PlanningActionParamTap = null; export type PlanningActionParamHover = null; export type PlanningActionParamRightClick = null; -export interface PlanningActionParamInputOrKeyPress { +export interface PlanningActionParamInput { value: string; } +export interface PlanningActionParamKeyPress { + value: string | string[]; +} + export type PlanningActionParamScroll = scrollParam; export interface PlanningActionParamAssert { diff --git a/packages/core/src/yaml.ts b/packages/core/src/yaml.ts index 4ecaab1d3..44bcf0eab 100644 --- a/packages/core/src/yaml.ts +++ b/packages/core/src/yaml.ts @@ -142,7 +142,7 @@ export interface MidsceneYamlFlowItemAIInput extends LocateOption { } export interface MidsceneYamlFlowItemAIKeyboardPress extends LocateOption { - aiKeyboardPress: string; + aiKeyboardPress: string | string[]; locate?: string; // where to press, optional } diff --git a/packages/mcp/src/midscene.ts b/packages/mcp/src/midscene.ts index 1aa5db305..a91732a0c 100644 --- a/packages/mcp/src/midscene.ts +++ b/packages/mcp/src/midscene.ts @@ -241,9 +241,9 @@ export class MidsceneManager { tools.midscene_aiKeyboardPress.description, { key: z - .string() + .union([z.string(), z.array(z.string())]) .describe( - "The web key to press, e.g. 
'Enter', 'Tab', 'Escape', etc.", + "The web key(s) to press. Can be a single key like 'Enter', 'Tab', 'Escape', or an array of keys for combinations like ['Ctrl', 'Shift'] or ['Meta', 'a'].", ), locate: z .string() @@ -264,11 +264,12 @@ export class MidsceneManager { const options = deepThink ? { deepThink } : undefined; await agent.aiKeyboardPress(key, locate, options); + const keyDesc = Array.isArray(key) ? key.join('+') : key; const targetDesc = locate ? ` on element "${locate}"` : ''; return { content: [ - { type: 'text', text: `Pressed key '${key}'${targetDesc}` }, + { type: 'text', text: `Pressed key(s) '${keyDesc}'${targetDesc}` }, { type: 'text', text: `report file: ${agent.reportFile}` }, ], isError: false, diff --git a/packages/web-integration/src/common/agent.ts b/packages/web-integration/src/common/agent.ts index fec3e25e6..a68284798 100644 --- a/packages/web-integration/src/common/agent.ts +++ b/packages/web-integration/src/common/agent.ts @@ -326,7 +326,7 @@ export class PageAgent { } async aiKeyboardPress( - keyName: string, + keyName: string | string[], locatePrompt?: string, opt?: LocateOption, ) { diff --git a/packages/web-integration/src/common/plan-builder.ts b/packages/web-integration/src/common/plan-builder.ts index ade992b7d..b18129365 100644 --- a/packages/web-integration/src/common/plan-builder.ts +++ b/packages/web-integration/src/common/plan-builder.ts @@ -2,7 +2,8 @@ import type { DetailedLocateParam, MidsceneYamlFlowItem, PlanningAction, - PlanningActionParamInputOrKeyPress, + PlanningActionParamInput, + PlanningActionParamKeyPress, PlanningActionParamScroll, PlanningActionParamSleep, PlanningActionParamTap, @@ -17,7 +18,8 @@ export function buildPlans( type: PlanningAction['type'], locateParam?: DetailedLocateParam, param?: - | PlanningActionParamInputOrKeyPress + | PlanningActionParamInput + | PlanningActionParamKeyPress | PlanningActionParamScroll | PlanningActionParamSleep, ): PlanningAction[] { @@ -42,23 +44,33 @@ export function 
buildPlans( returnPlans = [locatePlan, tapPlan]; } - if (type === 'Input' || type === 'KeyboardPress') { - if (type === 'Input') { - assert(locateParam, `missing locate info for action "${type}"`); - } + if (type === 'Input') { + assert(locateParam, `missing locate info for action "${type}"`); assert(param, `missing param for action "${type}"`); - const inputPlan: PlanningAction<PlanningActionParamInputOrKeyPress> = { + const inputPlan: PlanningAction<PlanningActionParamInput> = { type, - param: param as PlanningActionParamInputOrKeyPress, + param: param as PlanningActionParamInput, thought: '', - locate: locateParam!, + locate: locateParam, + }; + + returnPlans = [locatePlan!, inputPlan]; + } + if (type === 'KeyboardPress') { + assert(param, `missing param for action "${type}"`); + + const keyboardPressPlan: PlanningAction<PlanningActionParamKeyPress> = { + type, + param: param as PlanningActionParamKeyPress, + thought: '', + locate: locateParam, }; if (locatePlan) { - returnPlans = [locatePlan, inputPlan]; + returnPlans = [locatePlan, keyboardPressPlan]; } else { - returnPlans = [inputPlan]; + returnPlans = [keyboardPressPlan]; } } diff --git a/packages/web-integration/src/common/tasks.ts b/packages/web-integration/src/common/tasks.ts index a33e019c6..15385c821 100644 --- a/packages/web-integration/src/common/tasks.ts +++ b/packages/web-integration/src/common/tasks.ts @@ -26,7 +26,8 @@ import { type PlanningActionParamAssert, type PlanningActionParamError, type PlanningActionParamHover, - type PlanningActionParamInputOrKeyPress, + type PlanningActionParamInput, + type PlanningActionParamKeyPress, type PlanningActionParamScroll, type PlanningActionParamSleep, type PlanningActionParamTap, @@ -379,7 +380,7 @@ export class PageTaskExecutor { }; tasks.push(taskAssert); } else if (plan.type === 'Input') { - const taskActionInput: ExecutionTaskActionApply<PlanningActionParamInputOrKeyPress> = + const taskActionInput: ExecutionTaskActionApply<PlanningActionParamInput> = { type: 'Action', subType: 'Input', @@ -402,7 +403,7 @@ export class PageTaskExecutor { }; tasks.push(taskActionInput); } else if (plan.type === 
'KeyboardPress') { - const taskActionKeyboardPress: ExecutionTaskActionApply<PlanningActionParamInputOrKeyPress> = + const taskActionKeyboardPress: ExecutionTaskActionApply<PlanningActionParamKeyPress> = { type: 'Action', subType: 'KeyboardPress', diff --git a/packages/web-integration/src/common/ui-utils.ts b/packages/web-integration/src/common/ui-utils.ts index a632ed65a..3343206ab 100644 --- a/packages/web-integration/src/common/ui-utils.ts +++ b/packages/web-integration/src/common/ui-utils.ts @@ -19,7 +19,7 @@ export function getKeyCommands( value: string | string[], ): Array<{ key: string; command?: string }> { // Ensure value is an array of keys - const keys = Array.isArray(value) ? value : [value]; + const keys = Array.isArray(value) ? value : value.split('+'); // Compatible with input format 'Meta+A'; // Process each key to attach a corresponding command if needed, based on the presence of 'Meta' or 'Control' in the keys array. // ref: https://github.com/puppeteer/puppeteer/pull/9357/files#diff-32cf475237b000f980eb214a0a823e45a902bddb7d2426d677cae96397aa0ae4R94 diff --git a/packages/web-integration/tests/unit-test/__snapshots__/plan-builder.test.ts.snap b/packages/web-integration/tests/unit-test/__snapshots__/plan-builder.test.ts.snap index d8d78e904..765c43b93 100644 --- a/packages/web-integration/tests/unit-test/__snapshots__/plan-builder.test.ts.snap +++ b/packages/web-integration/tests/unit-test/__snapshots__/plan-builder.test.ts.snap @@ -61,6 +61,23 @@ exports[`build plans > keyboardPress 1`] = ` ] `; +exports[`build plans > keyboardPress with combination keys 1`] = ` +[ + { + "locate": undefined, + "param": { + "value": [ + "Ctrl", + "Shift", + "I", + ], + }, + "thought": "", + "type": "KeyboardPress", + }, +] +`; + exports[`build plans > rightClick 1`] = ` [ { diff --git a/packages/web-integration/tests/unit-test/plan-builder.test.ts b/packages/web-integration/tests/unit-test/plan-builder.test.ts index 1fa35a89b..8dec8630e 100644 --- a/packages/web-integration/tests/unit-test/plan-builder.test.ts +++ 
b/packages/web-integration/tests/unit-test/plan-builder.test.ts @@ -51,6 +51,13 @@ describe('build plans', () => { expect(result).toMatchSnapshot(); }); + it('keyboardPress with combination keys', async () => { + const result = await buildPlans('KeyboardPress', undefined, { + value: ['Ctrl', 'Shift', 'I'], + }); + expect(result).toMatchSnapshot(); + }); + it('scroll', async () => { const result = await buildPlans('Scroll', undefined, { direction: 'down', diff --git a/packages/web-integration/tests/unit-test/util.test.ts b/packages/web-integration/tests/unit-test/util.test.ts index d5798e103..0e98b0116 100644 --- a/packages/web-integration/tests/unit-test/util.test.ts +++ b/packages/web-integration/tests/unit-test/util.test.ts @@ -68,4 +68,27 @@ describe('getKeyCommands', () => { { key: 'V', command: 'Paste' }, ]); }); + + it('should handle combination keys like Ctrl+Shift', () => { + const result = getKeyCommands(['Control', 'Shift']); + expect(result).toEqual([{ key: 'Control' }, { key: 'Shift' }]); + }); + + it('should handle complex combinations like Ctrl+Shift+A', () => { + const result = getKeyCommands(['Control', 'Shift', 'A']); + expect(result).toEqual([ + { key: 'Control' }, + { key: 'Shift' }, + { key: 'A', command: 'SelectAll' }, + ]); + }); + + it('should handle Meta+Shift+V combination', () => { + const result = getKeyCommands(['Meta', 'Shift', 'V']); + expect(result).toEqual([ + { key: 'Meta' }, + { key: 'Shift' }, + { key: 'V', command: 'Paste' }, + ]); + }); });