Skip to content

Added Hybrid Mode that combines Snapshot and Vision modes #611

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ Playwright MCP server supports following arguments. They can be provided in the
example "1280, 720"
--vision Run server that uses screenshots (Aria snapshots
are used by default)
--hybrid Run server that combines Snapshot and Vision Modes
```

<!--- End of options generated section -->
Expand Down Expand Up @@ -313,6 +314,9 @@ npx @playwright/mcp@latest --config path/to/config.json
// Enable vision mode (screenshots instead of accessibility snapshots)
vision?: boolean;

// Enable hybrid mode (default snapshot and vision capabilities combined)
hybrid?: boolean;

// Directory for output files
outputDir?: string;

Expand All @@ -324,7 +328,7 @@ npx @playwright/mcp@latest --config path/to/config.json
// List of origins to block the browser to request. Origins matching both `allowedOrigins` and `blockedOrigins` will be blocked.
blockedOrigins?: string[];
};

/**
* Do not send image responses to the client.
*/
Expand Down Expand Up @@ -425,6 +429,26 @@ To use Vision Mode, add the `--vision` flag when starting the server:
Vision Mode works best with the computer use models that are able to interact with elements using
X Y coordinate space, based on the provided screenshot.


3. **Hybrid Mode**: Combines Snapshot Mode and Vision Mode for scenarios where coordinate-based
interaction is also required.

To use Hybrid Mode, add the `--hybrid` flag when starting the server:

```js
{
"mcpServers": {
"playwright": {
"command": "npx",
"args": [
"@playwright/mcp@latest",
"--hybrid"
]
}
}
}
```

<!--- Tools generated by update-readme.js -->

<details>
Expand Down
5 changes: 5 additions & 0 deletions config.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ export type Config = {
*/
vision?: boolean;

/**
* Run server in hybrid mode, which exposes both the snapshot (Aria) tools and the vision (screenshot) tools.
*/
hybrid?: boolean;

/**
* Whether to save the Playwright trace of the session into the output directory.
*/
Expand Down
2 changes: 2 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ export type CLIOptions = {
userDataDir?: string;
viewportSize?: string;
vision?: boolean;
hybrid?: boolean;
extension?: boolean;
};

Expand Down Expand Up @@ -204,6 +205,7 @@ export async function configFromCLIOptions(cliOptions: CLIOptions): Promise<Conf
},
capabilities: cliOptions.caps?.split(',').map((c: string) => c.trim() as ToolCapability),
vision: !!cliOptions.vision,
hybrid: !!cliOptions.hybrid,
extension: !!cliOptions.extension,
network: {
allowedOrigins: cliOptions.allowedOrigins,
Expand Down
4 changes: 2 additions & 2 deletions src/connection.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ import { CallToolRequestSchema, ListToolsRequestSchema, Tool as McpTool } from '
import { zodToJsonSchema } from 'zod-to-json-schema';

import { Context } from './context.js';
import { snapshotTools, visionTools } from './tools.js';
import { snapshotTools, visionTools , hybridTools } from './tools.js';
import { packageJSON } from './package.js';

import { FullConfig, validateConfig } from './config.js';

import type { BrowserContextFactory } from './browserContextFactory.js';

export function createConnection(config: FullConfig, browserContextFactory: BrowserContextFactory): Connection {
const allTools = config.vision ? visionTools : snapshotTools;
const allTools = config.vision ? visionTools : config.hybrid ? hybridTools: snapshotTools;
const tools = allTools.filter(tool => !config.capabilities || tool.capability === 'core' || config.capabilities.includes(tool.capability));
validateConfig(config);
const context = new Context(tools, config, browserContextFactory);
Expand Down
1 change: 1 addition & 0 deletions src/program.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ program
.option('--user-data-dir <path>', 'path to the user data directory. If not specified, a temporary directory will be created.')
.option('--viewport-size <size>', 'specify browser viewport size in pixels, for example "1280, 720"')
.option('--vision', 'Run server that uses screenshots (Aria snapshots are used by default)')
.option('--hybrid', 'Run server that combines Snapshot and Vision Modes')
.addOption(new Option('--extension', 'Allow connecting to a running browser instance (Edge/Chrome only). Requires the \'Playwright MCP\' browser extension to be installed.').hideHelp())
.action(async options => {
const config = await resolveCLIConfig(options);
Expand Down
17 changes: 17 additions & 0 deletions src/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,20 @@ export const visionTools: Tool<any>[] = [
...vision,
...wait(false),
];
/**
 * Tool list for hybrid mode: createConnection picks it when `config.hybrid`
 * is set and `config.vision` is not.
 *
 * It includes both the `snapshot` and the `vision` tool groups, along with
 * `screenshot`, so a single server exposes both kinds of tools.
 *
 * NOTE(review): every factory helper is called with `true` here, while the
 * visionTools entry above passes `false` to `wait`. Presumably the flag
 * selects snapshot-aware behaviour; confirm against the helper definitions.
 */
export const hybridTools: Tool<any>[] = [
...common(true),
...console,
...dialogs(true),
...files(true),
...install,
...keyboard(true),
...navigate(true),
...network,
...pdf,
...screenshot,
...snapshot,
...tabs(true),
...testing,
// Coordinate-based tools, listed in addition to the snapshot group above.
...vision,
...wait(true),
];
37 changes: 37 additions & 0 deletions tests/capabilities.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,43 @@ test('test vision tool list', async ({ visionClient }) => {
]));
});

test('test hybrid tool list', async ({ hybridClient }) => {
  // Tool names the server is expected to advertise when started with --hybrid.
  const expectedNames = [
    'browser_click',
    'browser_console_messages',
    'browser_drag',
    'browser_file_upload',
    'browser_generate_playwright_test',
    'browser_handle_dialog',
    'browser_hover',
    'browser_select_option',
    'browser_type',
    'browser_close',
    'browser_install',
    'browser_navigate_back',
    'browser_navigate_forward',
    'browser_navigate',
    'browser_network_requests',
    'browser_pdf_save',
    'browser_press_key',
    'browser_resize',
    'browser_snapshot',
    'browser_tab_close',
    'browser_tab_list',
    'browser_tab_new',
    'browser_tab_select',
    'browser_take_screenshot',
    'browser_wait_for',
    'browser_screen_capture',
    'browser_screen_click',
    'browser_screen_drag',
    'browser_screen_move_mouse',
    'browser_screen_type',
  ];

  const listing = await hybridClient.listTools();
  const advertisedNames = listing.tools.map(tool => tool.name);

  // Compare as sets so the assertion does not depend on registration order.
  expect(new Set(advertisedNames)).toEqual(new Set(expectedNames));
});


test('test capabilities', async ({ startClient }) => {
const { client } = await startClient({
args: ['--caps="core"'],
Expand Down
6 changes: 6 additions & 0 deletions tests/fixtures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ type CDPServer = {
type TestFixtures = {
client: Client;
visionClient: Client;
hybridClient: Client;
startClient: (options?: { clientName?: string, args?: string[], config?: Config }) => Promise<{ client: Client, stderr: () => string }>;
wsEndpoint: string;
cdpServer: CDPServer;
Expand All @@ -71,6 +72,11 @@ export const test = baseTest.extend<TestFixtures & TestOptions, WorkerFixtures>(
await use(client);
},

// Fixture: a client connected to a server that was launched with the --hybrid flag.
hybridClient: async ({ startClient }, use) => {
  const started = await startClient({ args: ['--hybrid'] });
  await use(started.client);
},

startClient: async ({ mcpHeadless, mcpBrowser, mcpMode, startMcpExtension }, use, testInfo) => {
const userDataDir = mcpMode !== 'docker' ? testInfo.outputPath('user-data-dir') : undefined;
const configDir = path.dirname(test.info().config.configFile!);
Expand Down
9 changes: 8 additions & 1 deletion utils/update-readme.js
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,20 @@ const categories = {
...filesTools(false),
...dialogsTools(false),
],
'Hybrid mode': [
...visionTools,
...keyboardTools(),
...waitTools(true),
...filesTools(true),
...dialogsTools(true),
],
};

// NOTE: Can be removed when we drop Node.js 18 support and changed to import.meta.filename.
const __filename = url.fileURLToPath(import.meta.url);

/**
* @param {import('../src/tools/tool.js').ToolSchema<any>} tool
* @param {import('../src/tools/tool.js').ToolSchema<any>} tool
* @returns {string[]}
*/
function formatToolForReadme(tool) {
Expand Down