Skip to content

Commit bde2444

Browse files
authored
refactor(gui-agent): refactor browser operators architecture and fix deployment issues (#1682)
1 parent 4eb7ac7 commit bde2444

File tree

19 files changed

+1640
-1453
lines changed

19 files changed

+1640
-1453
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Quick real-environment runner for AgentTARS
3+
* - Creates and initializes an AgentTARS instance, then prints its available tools
4+
* - Runs a fixed list of browser/GUI tasks one after another
5+
* - Cleans up the agent and exits the process
6+
*/
7+
import { AgentTARS } from '../src';
8+
9+
/**
 * Boots an AgentTARS instance, prints its tool list, runs the demo tasks one
 * after another, then cleans up and exits the process.
 *
 * Exit status: 0 when every task resolved, 1 when at least one task threw.
 * Errors raised outside the per-task try/catch (for example by `initialize()`)
 * propagate to the caller, but only after cleanup has been attempted.
 */
async function main() {
  const localAgent = new AgentTARS({
    model: {
      provider: 'volcengine',
      id: 'ep-20250510145437-5sxhs',
      apiKey: process.env.ARK_API_KEY,
      displayName: 'doubao-1.5-thinking-vision-pro',
    },
    toolCallEngine: 'structured_outputs',
  });

  // Counts tasks whose run() rejected, so the exit code can reflect failures.
  let failedTaskCount = 0;

  try {
    await localAgent.initialize();

    const tools = localAgent.getTools();
    console.log('\n📋 Available Tools:');
    console.log('─'.repeat(80));
    tools.forEach((tool, index) => {
      const num = (index + 1).toString().padStart(2, ' ');
      const name = tool.name.padEnd(30, ' ');
      // Keep each row on one line: truncate first, then flatten newlines.
      const desc = (tool.description || 'No description').substring(0, 45).replace(/\n/g, ' ');
      console.log(`${num}. ${name}${desc}`);
    });
    console.log('─'.repeat(80));
    console.log(`Total: ${tools.length} tools\n`);

    // Test tasks to run
    const tasks = [
      'Open https://seed-tars.com',
      'Use gui to go to https://www.producthunt.com/, search the top products for "AI", from the results, identify the top-listed product (the top 3 result). Collect the following information from that product\'s card: 1. Product name 2. Short description 3. Number of upvotes summarize it and report to me.',
      'Use gui action, go to https://sample-files.com/documents/pdf/, find the 65KB pdf file, preview it, scroll the file from top to bottom.',
    ];

    // Execute tasks sequentially; a failing task is reported and does not stop the rest.
    for (const [i, task] of tasks.entries()) {
      console.log(`\n🚀 Executing Task ${i + 1}:`);
      console.log(`📝 ${task}`);
      console.log('─'.repeat(80));

      try {
        const response = await localAgent.run(task);
        console.log(`✅ Task ${i + 1} Response:`, response);
      } catch (error) {
        failedTaskCount += 1;
        console.error(`❌ Task ${i + 1} Failed:`, error);
      }

      console.log('─'.repeat(80));
    }

    if (failedTaskCount === 0) {
      console.log('🎉 All tasks completed. Exiting...');
    } else {
      console.error(`⚠️ ${failedTaskCount} of ${tasks.length} tasks failed. Exiting...`);
    }
  } finally {
    // Always attempt cleanup, including when initialize() or the tool listing threw.
    console.log('\n🧹 Cleaning up resources...');
    try {
      await localAgent.cleanup();
      console.log('✅ Cleanup completed');
    } catch (error) {
      console.error('❌ Cleanup failed:', error);
    }
  }

  process.exit(failedTaskCount === 0 ? 0 : 1);
}
67+
68+
// Script entry point: any rejection escaping main() is reported and turned
// into a non-zero exit status.
const reportRunnerFailure = (reason: unknown) => {
  console.error('Runner failed:', reason);
  process.exit(1);
};

main().catch(reportRunnerFailure);

multimodal/agent-tars/core/package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
},
3030
"devDependencies": {
3131
"@gui-agent/operator-browser": "workspace:*",
32+
"@gui-agent/action-parser": "workspace:*",
33+
"@gui-agent/shared": "workspace:*",
3234
"@agent-infra/mcp-server-browser": "1.1.10",
3335
"@agent-infra/mcp-server-commands": "1.1.10",
3436
"@agent-infra/mcp-server-filesystem": "1.1.10",

multimodal/agent-tars/core/src/environments/local/browser/browser-gui-agent.ts

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,12 @@ import { BrowserOperator } from '@gui-agent/operator-browser';
99
import { ConsoleLogger, AgentEventStream, Tool, z } from '@tarko/mcp-agent';
1010
import { ImageCompressor, formatBytes } from '@tarko/shared-media-utils';
1111
import { ActionInputs, PredictionParsed } from '@agent-tars/interface';
12+
import { ActionParserHelper } from '@gui-agent/action-parser';
13+
import { Coordinates, NormalizeCoordinates } from '@gui-agent/shared/types';
14+
import { normalizeActionCoords } from '@gui-agent/shared/utils';
1215
import {
1316
convertToGUIResponse,
17+
convertToAgentUIAction,
1418
createGUIErrorResponse,
1519
GUIExecuteResult,
1620
} from '@tarko/shared-utils';
@@ -37,6 +41,22 @@ export interface GUIAgentOptions {
3741
eventStream?: AgentEventStream.Processor;
3842
}
3943

44+
// Module-level parser instance; the tool handler below calls
// parseActionCallString() on it to turn an action string into a parsed action.
const actionParserHelper = new ActionParserHelper();
45+
46+
/**
 * Default coordinate normalizer handed to `normalizeActionCoords`.
 *
 * When the input carries `raw` coordinates, returns a copy of the input whose
 * `normalized` field holds `raw.x / 1000` and `raw.y / 1000`. When `raw` is
 * absent the input object itself is returned untouched. In both cases the
 * result is wrapped as `{ normalized: ... }`.
 *
 * NOTE(review): the divisor suggests raw coordinates live on a 0-1000 grid;
 * confirm against the model's output format.
 */
const defaultNormalizeCoords: NormalizeCoordinates = (rawCoords: Coordinates) => {
  const source = rawCoords.raw;

  // Nothing to scale: hand the caller's object back as-is.
  if (!source) {
    return { normalized: rawCoords };
  }

  const scaled = {
    x: source.x / 1000,
    y: source.y / 1000,
  };
  return { normalized: { ...rawCoords, normalized: scaled } };
};
59+
4060
/**
4161
* Browser GUI Agent for visual browser automation
4262
*/
@@ -109,30 +129,39 @@ wait() - Wait 5 seconds and take a scree
109129
}),
110130
function: async ({ thought, step, action }) => {
111131
try {
112-
const parsed = this.parseAction(action);
113-
parsed.thought = thought;
132+
const parsedAction = actionParserHelper.parseActionCallString(action);
133+
if (!parsedAction) {
134+
return createGUIErrorResponse(action, 'Invalid action format');
135+
}
136+
const normalizedCoordsAction = normalizeActionCoords(
137+
parsedAction,
138+
defaultNormalizeCoords,
139+
);
114140

115141
this.logger.debug({
116142
thought,
117143
step,
118144
action,
119-
parsedAction: JSON.stringify(parsed, null, 2),
145+
normalizedCoordsAction: JSON.stringify(normalizedCoordsAction, null, 2),
120146
screenDimensions: {
121147
width: this.screenWidth,
122148
height: this.screenHeight,
123149
},
124150
});
125151

126-
const operatorResult: GUIExecuteResult = await this.browserOperator.execute({
127-
parsedPrediction: parsed,
128-
screenWidth: this.screenWidth || 1920,
129-
screenHeight: this.screenHeight || 1080,
152+
const operatorResult = await this.browserOperator.doExecute({
153+
actions: [normalizedCoordsAction],
130154
});
155+
this.logger.debug('Browser action completed', operatorResult);
131156

132157
await sleep(500);
133158

134-
const guiResponse = convertToGUIResponse(action, parsed, operatorResult);
135-
return guiResponse;
159+
return {
160+
success: true,
161+
action: action,
162+
normalizedAction: convertToAgentUIAction(normalizedCoordsAction),
163+
observation: undefined, // Reserved for future implementation
164+
};
136165
} catch (error) {
137166
this.logger.error(
138167
`Browser action failed: ${error instanceof Error ? error.message : String(error)}`,
@@ -164,7 +193,7 @@ wait() - Wait 5 seconds and take a scree
164193
// Record screenshot start time
165194
const startTime = performance.now();
166195

167-
const output = await this.browserOperator.screenshot();
196+
const output = await this.browserOperator.doScreenshot();
168197

169198
// Calculate screenshot time
170199
const endTime = performance.now();
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
snapshots
2+
snapshot

multimodal/gui-agent/agent-sdk/examples/configs/browser-ve-15vp.config.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@ import 'dotenv/config';
66
import path from 'path';
77

88
import { defineConfig } from '@tarko/agent-cli';
9-
import { browserOperator } from './operators';
9+
import { browserOperator, remoteBrowserOperator } from './operators';
1010
import { doubao_1_5_vp } from './models';
11-
import { systemPromptTemplate1 } from './promptTemps';
11+
import { systemPromptTemplate2 } from './promptTemps';
1212

1313
export default defineConfig({
14-
operator: browserOperator,
14+
// operator: browserOperator,
15+
operator: remoteBrowserOperator,
1516
model: doubao_1_5_vp,
16-
systemPrompt: systemPromptTemplate1,
17+
systemPrompt: systemPromptTemplate2,
1718
snapshot: {
1819
enable: true,
1920
storageDirectory: path.join(__dirname, '../snapshots/browser-ve-15vp'),

multimodal/gui-agent/agent-sdk/examples/configs/operators.ts

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,35 +4,27 @@
44
*/
55
import { AdbOperator } from '@gui-agent/operator-adb';
66
import { NutJSOperator } from '@gui-agent/operator-nutjs';
7-
import { Operator, ScreenContext } from '@gui-agent/shared/base';
87
import {
9-
SupportedActionType,
10-
ScreenshotOutput,
11-
ExecuteParams,
12-
ExecuteOutput,
13-
} from 'gui-agent/shared/src/types';
8+
LocalBrowserOperator,
9+
RemoteBrowserOperator,
10+
SearchEngine,
11+
} from '@gui-agent/operator-browser';
1412

1513
const computerOperator = new NutJSOperator();
1614
const androidOperator = new AdbOperator();
15+
const browserOperator = new LocalBrowserOperator({
16+
searchEngine: SearchEngine.GOOGLE,
17+
showActionInfo: false,
18+
showWaterFlow: false,
19+
highlightClickableElements: false,
20+
});
1721

18-
class MockedBrowserOperator extends Operator {
19-
protected initialize(): Promise<void> {
20-
throw new Error('Method not implemented.');
21-
}
22-
protected supportedActions(): Array<SupportedActionType> {
23-
throw new Error('Method not implemented.');
24-
}
25-
protected screenContext(): ScreenContext {
26-
throw new Error('Method not implemented.');
27-
}
28-
protected screenshot(): Promise<ScreenshotOutput> {
29-
throw new Error('Method not implemented.');
30-
}
31-
protected execute(params: ExecuteParams): Promise<ExecuteOutput> {
32-
throw new Error('Method not implemented.');
33-
}
34-
}
22+
const remoteBrowserOperator = new RemoteBrowserOperator({
23+
wsEndpoint: 'ws://localhost:9222/devtools/browser/<id>',
24+
searchEngine: SearchEngine.GOOGLE,
25+
showActionInfo: true,
26+
showWaterFlow: true,
27+
highlightClickableElements: true,
28+
});
3529

36-
const browserOperator = new MockedBrowserOperator();
37-
38-
export { computerOperator, androidOperator, browserOperator };
30+
export { computerOperator, androidOperator, browserOperator, remoteBrowserOperator };

multimodal/gui-agent/agent-sdk/examples/configs/promptTemps.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,48 @@ export const systemPromptTemplate1: SystemPromptTemplate = {
4949
.join('\n');
5050
},
5151
};
52+
53+
/**
 * Prompt-facing signature line for each action name known to
 * systemPromptTemplate2. Action names missing from this table produce no line.
 */
const TEMPLATE_2_ACTION_SIGNATURES = new Map<string, string>([
  ['click', `click(point='<point>x1 y1</point>')`],
  ['right_click', `right_single(point='<point>x1 y1</point>')`],
  ['double_click', `left_double(point='<point>x1 y1</point>')`],
  ['navigate', `navigate(url='xxx') # Navigate to the given url.`],
  ['navigate_back', `navigate_back() # Navigate back to the previous page.`],
  [
    'drag',
    `drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>') # Swipe/Drag to show more information or select elements. The direction of the page movement is opposite to the finger's movement`,
  ],
  [
    'hotkey',
    `hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.`,
  ],
  [
    'type',
    `type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content. `,
  ],
  [
    'scroll',
    `scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \`direction\` side.`,
  ],
  ['long_press', `long_press(point='<point>x1 y1</point>')`],
  [
    'press_back',
    `press_back() # Press the back button. 如果你想切换应用不需要press_back,直接open_app。`,
  ],
  [
    'press_home',
    `press_home() # Press the home button. 如果你想切换应用不需要press_home,直接open_app。`,
  ],
  [
    'open_app',
    `open_app(app_name='xxx') # Open the app with the given name. You can only use the apps in the app_list.`,
  ],
  ['wait', `wait() #Sleep for 5s and take a screenshot to check for any changes.`],
  [
    'finished',
    `finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.`,
  ],
]);

/**
 * System prompt template built on SYSTEM_PROMPT_1.
 *
 * `actionsToString` renders the given action names as one signature line per
 * recognized action, joined with newlines and kept in input order; names
 * without a known signature are dropped.
 */
export const systemPromptTemplate2: SystemPromptTemplate = {
  template: SYSTEM_PROMPT_1,
  actionsToString: (actions) => {
    const signatureLines: string[] = [];
    for (const action of actions) {
      const signature = TEMPLATE_2_ACTION_SIGNATURES.get(action);
      if (signature !== undefined) {
        signatureLines.push(signature);
      }
    }
    return signatureLines.join('\n');
  },
};

multimodal/gui-agent/operator-browser/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
"@agent-infra/browser": "0.1.1",
3838
"@agent-infra/logger": "0.0.2-beta.2",
3939
"@agent-infra/puppeteer-enhance": "0.1.6",
40-
"@ui-tars/sdk": "1.2.3"
40+
"@gui-agent/shared": "workspace:*"
4141
},
4242
"devDependencies": {
4343
"@rslib/core": "0.10.0",
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
import { ConsoleLogger, LogLevel } from '@agent-infra/logger';
6+
import { LocalBrowser, BrowserFinder, BrowserType } from '@agent-infra/browser';
7+
8+
import { LocalBrowserOperatorOptions, SearchEngine, searchEngineUrlMap } from './types';
9+
import { BrowserOperator } from './browser-operator';
10+
11+
/**
 * Browser operator backed by a browser installed on the local machine.
 *
 * The constructor locates a browser executable via BrowserFinder and passes a
 * LocalBrowser to the base BrowserOperator; the browser is only launched in
 * `initialize()`.
 */
export class LocalBrowserOperator extends BrowserOperator {
  /** Executable path reported by BrowserFinder; used when launching. */
  private readonly browserPath: string;
  /** Browser type reported by BrowserFinder. */
  private readonly browserType: BrowserType;
  /** Optional search engine whose URL is opened right after launch. */
  private readonly searchEngine?: SearchEngine;

  /**
   * @param options Optional settings. The three display flags default to
   *   `false`; `logger` defaults to a DEBUG-level ConsoleLogger; when
   *   `searchEngine` is omitted no start page is opened.
   */
  constructor(options?: LocalBrowserOperatorOptions) {
    const {
      highlightClickableElements = false,
      showActionInfo = false,
      showWaterFlow = false,
      searchEngine,
    } = options || {};

    // Create logger with LocalBrowserOperator prefix
    const logger = (options?.logger || new ConsoleLogger(undefined, LogLevel.DEBUG)).spawn(
      '[Local]',
    );

    // Everything before super() must avoid `this`, so the browser lookup
    // results are held in locals and stored on the instance afterwards.
    const browserFinder = new BrowserFinder(logger.spawn('[BrowserFinder]'));
    const { path, type } = browserFinder.findBrowser();
    logger.debug('ctor: browserData: ', { path, type });

    const browser = new LocalBrowser({ logger: logger.spawn('[Browser]') });
    const browserOptions = {
      browser,
      browserType: type,
      logger,
      highlightClickableElements,
      showActionInfo,
      showWaterFlow,
    };
    super(browserOptions);
    logger.debug('super ctor done');

    this.browserPath = path;
    this.browserType = type;
    this.searchEngine = searchEngine;
  }

  /**
   * Launches the local browser, opens the configured search engine page when
   * one is set, then runs the base class initialization.
   */
  protected async initialize(): Promise<void> {
    this.logger.debug('initialize: start');
    await this.browser.launch({
      executablePath: this.browserPath,
      browserType: this.browserType,
    });
    this.logger.debug('initialize: browser launched');

    const targetUrl = this.searchEngine ? searchEngineUrlMap[this.searchEngine] : undefined;
    if (!targetUrl) {
      this.logger.debug('initialize: no search engine configured, skipping start page');
    } else {
      // `this.browser` was already dereferenced by launch() above, so no
      // optional chaining is needed on it here.
      const openingPage = await this.browser.getActivePage();
      if (openingPage) {
        await openingPage.goto(targetUrl, {
          waitUntil: 'networkidle2',
        });
        this.logger.debug('initialize: search engine opened');
      } else {
        // Only claim the page was opened when a navigation actually happened.
        this.logger.debug('initialize: no active page, search engine not opened');
      }
    }

    await super.initialize();
  }
}

0 commit comments

Comments
 (0)