Skip to content

Commit 4539b80

Browse files
authored
feat(o-gui-agent): support agent mode and game operator with enhanced error handling (#1656)
1 parent 6f595f4 commit 4539b80

File tree

13 files changed

+332
-126
lines changed

13 files changed

+332
-126
lines changed

multimodal/gui-agent/operator-aio/src/AIOComputer.ts

Lines changed: 98 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,21 @@ export class AIOComputer {
6262
};
6363
} catch (error) {
6464
logger.error('[AIOComputer] Action failed:', error);
65-
return {
66-
success: false,
67-
message: error instanceof Error ? error.message : 'Unknown error',
68-
};
65+
// Extract key error message from HTTP 422 responses
66+
let errorMessage = 'Unknown error';
67+
if (error instanceof Error) {
68+
const message = error.message;
69+
logger.error('[AIOComputer] Action failed message:', message);
70+
// Try to extract the key error message from HTTP 422 responses
71+
const detailMatch = message.match(/"msg":"([^"]+)"/);
72+
if (detailMatch && detailMatch[1]) {
73+
errorMessage = detailMatch[1];
74+
} else {
75+
errorMessage = message;
76+
}
77+
}
78+
// Throw a new error with the extracted key message
79+
throw new Error(errorMessage);
6980
}
7081
}
7182

@@ -287,3 +298,86 @@ export class AIOComputer {
287298
return this.request(action);
288299
}
289300
}
301+
302+
// Comprehensive key name mapping for common key variations.
// Keys are lowercase alias names; values are the key names handed to the
// computer API in place of the alias.
const keyNameAliases = {
  // Arrow keys
  arrowup: 'up',
  arrowdown: 'down',
  arrowleft: 'left',
  arrowright: 'right',
  // Common key aliases
  space: ' ',
  spacebar: ' ',
  enter: 'enter',
  return: 'return',
  tab: 'tab',
  escape: 'esc',
  back: 'backspace',
  backspace: 'backspace',
  delete: 'del',
  insert: 'insert',
  home: 'home',
  end: 'end',
  pageup: 'pageup',
  pagedown: 'pagedown',
  // Function keys
  // f1: 'f1', f2: 'f2', f3: 'f3', f4: 'f4', f5: 'f5', f6: 'f6',
  // f7: 'f7', f8: 'f8', f9: 'f9', f10: 'f10', f11: 'f11', f12: 'f12',
  // Modifier keys
  ctrl: 'ctrl',
  control: 'ctrl',
  alt: 'alt',
  shift: 'shift',
  cmd: 'command',
  command: 'command',
  meta: 'command',
  win: 'win',
  windows: 'win',

  // Number pad
  numpad0: 'num0',
  numpad1: 'num1',
  numpad2: 'num2',
  numpad3: 'num3',
  numpad4: 'num4',
  numpad5: 'num5',
  numpad6: 'num6',
  numpad7: 'num7',
  numpad8: 'num8',
  numpad9: 'num9',

  // Special characters and punctuation
  comma: ',',
  period: '.',
  semicolon: ';',
  quote: "'",
  doublequote: '"',
  backquote: '`',
  tilde: '~',
  exclamation: '!',
  at: '@',
  hash: '#',
  dollar: '$',
  percent: '%',
  caret: '^',
  ampersand: '&',
  asterisk: '*',
  leftparen: '(',
  rightparen: ')',
  underscore: '_',
  plus: '+',
  minus: '-',
  equal: '=',
  leftbracket: '[',
  rightbracket: ']',
  backslash: '\\',
  pipe: '|',
  leftbrace: '{',
  rightbrace: '}',
  colon: ':',
  less: '<',
  greater: '>',
  question: '?',
  slash: '/',
};

/**
 * Alias table exported for key-name normalization.
 *
 * The entries are copied onto a prototype-less object so that a dynamic
 * lookup such as `keyNameMap[name] || name` can only ever hit the aliases
 * declared above. With a plain object literal, names like `constructor` or
 * `__proto__` would resolve to inherited `Object.prototype` members and yield
 * a non-string value instead of falling back to the original name.
 */
export const keyNameMap: typeof keyNameAliases = Object.assign(
  Object.create(null) as object,
  keyNameAliases,
);
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
import { ConsoleLogger, LogLevel } from '@agent-infra/logger';
7+
import { AIOHybridOperator } from './AIOHybridOperator';
8+
import { AIOGameOptions } from './types';
9+
import { SupportedActionType } from '@gui-agent/shared/types';
10+
11+
const defaultLogger = new ConsoleLogger(undefined, LogLevel.DEBUG);
12+
13+
/**
 * Hybrid operator variant for game environments.
 *
 * Behaves like {@link AIOHybridOperator} but, once the parent has finished
 * initializing, navigates the browser to the configured `targetUrl`, and
 * reports its own list of supported actions.
 */
export class AIOGameOperator extends AIOHybridOperator {
  readonly name = 'aio-game';
  readonly description = 'Operator for game environment';
  /** URL the browser is navigated to during initialization. */
  private targetUrl: string;

  /**
   * @param options Hybrid operator options plus the `targetUrl` to open.
   * @param logger Base logger; a child logger tagged `[Game]` is spawned from
   *   it and handed to the parent operator.
   */
  constructor(options: AIOGameOptions, logger: ConsoleLogger = defaultLogger) {
    const operatorLogger = logger.spawn('[Game]');
    super(options, operatorLogger);
    this.targetUrl = options.targetUrl;
  }

  /**
   * @returns The action types this operator declares as supported.
   */
  protected supportedActions(): Array<SupportedActionType> {
    return [
      'call_user',
      'finished',
      'wait',
      'mouse_down',
      'mouse_up',
      'mouse_move',
      'click',
      'double_click',
      'right_click',
      'middle_click',
      'drag',
      'type',
      'hotkey',
      'press',
      'release',
      'scroll',
    ];
  }

  /**
   * Runs the parent initialization and then opens `targetUrl` in the browser.
   *
   * `aioBrowser` is nullable, so navigation is skipped (with a warning) when
   * no browser is available; success is only logged after a real navigation.
   */
  protected async initialize(): Promise<void> {
    await super.initialize();

    const browser = this.aioBrowser;
    if (!browser) {
      // Previously this path fell through optional chaining and still logged
      // a success message although nothing had been navigated.
      this.logger.warn(
        `initialize: browser is not available, skipped goto ${this.targetUrl}`,
      );
      return;
    }

    await browser.handleNavigate({
      url: this.targetUrl,
    });
    this.logger.info(`initialize: goto ${this.targetUrl} successfully`);
  }
}

multimodal/gui-agent/operator-aio/src/AIOHybridOperator.ts

Lines changed: 30 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -14,38 +14,23 @@ import { Operator, ScreenContext } from '@gui-agent/shared/base';
1414
import { ConsoleLogger, LogLevel } from '@agent-infra/logger';
1515
import { Base64ImageParser } from '@agent-infra/media-utils';
1616

17-
import { AIOComputer } from './AIOComputer';
17+
import { AIOComputer, keyNameMap } from './AIOComputer';
1818
import { AIOBrowser } from './AIOBrowser';
1919
import type { AIOHybridOptions } from './types';
2020

2121
const defaultLogger = new ConsoleLogger(undefined, LogLevel.DEBUG);
2222

23-
const arrowKeyMap = {
24-
arrowup: 'up',
25-
arrowdown: 'down',
26-
arrowleft: 'left',
27-
arrowright: 'right',
28-
};
29-
3023
export class AIOHybridOperator extends Operator {
31-
private static currentInstance: AIOHybridOperator | null = null;
32-
public static async create(options: AIOHybridOptions): Promise<AIOHybridOperator> {
33-
defaultLogger.info('[AioHybridOperator.create]:', options.baseURL);
34-
const instance = new AIOHybridOperator(options);
35-
// await instance.initialize(options);
36-
this.currentInstance = instance;
37-
return instance;
38-
}
39-
4024
private options: AIOHybridOptions;
41-
private logger: ConsoleLogger;
42-
private aioBrowser: AIOBrowser | null = null;
43-
private aioComputer: AIOComputer;
25+
26+
protected logger: ConsoleLogger;
27+
protected aioBrowser: AIOBrowser | null = null;
28+
protected aioComputer: AIOComputer;
4429

4530
private screenshotWidth = 1280;
4631
private screenshotHeight = 1024;
4732

48-
private constructor(options: AIOHybridOptions, logger: ConsoleLogger = defaultLogger) {
33+
constructor(options: AIOHybridOptions, logger: ConsoleLogger = defaultLogger) {
4934
super();
5035
this.options = options;
5136
this.logger = logger.spawn('[AIOHybridOperator]');
@@ -184,30 +169,30 @@ export class AIOHybridOperator extends Operator {
184169
case 'click':
185170
case 'left_click':
186171
case 'left_single': {
187-
this.handleClick(actionInputs, 'left');
172+
await this.handleClick(actionInputs, 'left');
188173
const { point } = actionInputs;
189174
startXPercent = (point as Coordinates)?.normalized?.x;
190175
startYPercent = (point as Coordinates)?.normalized?.y;
191176
break;
192177
}
193178
case 'left_double':
194179
case 'double_click': {
195-
this.handleClick(actionInputs, 'left', 2);
180+
await this.handleClick(actionInputs, 'left', 2);
196181
const { point } = actionInputs;
197182
startXPercent = (point as Coordinates)?.normalized?.x;
198183
startYPercent = (point as Coordinates)?.normalized?.y;
199184
break;
200185
}
201186
case 'right_click':
202187
case 'right_single': {
203-
this.handleClick(actionInputs, 'right');
188+
await this.handleClick(actionInputs, 'right');
204189
const { point } = actionInputs;
205190
startXPercent = (point as Coordinates)?.normalized?.x;
206191
startYPercent = (point as Coordinates)?.normalized?.y;
207192
break;
208193
}
209194
case 'middle_click': {
210-
this.handleClick(actionInputs, 'middle');
195+
await this.handleClick(actionInputs, 'middle');
211196
const { point } = actionInputs;
212197
startXPercent = (point as Coordinates)?.normalized?.x;
213198
startYPercent = (point as Coordinates)?.normalized?.y;
@@ -216,7 +201,7 @@ export class AIOHybridOperator extends Operator {
216201
case 'left_click_drag':
217202
case 'drag':
218203
case 'select': {
219-
this.handleDrag(actionInputs);
204+
await this.handleDrag(actionInputs);
220205
break;
221206
}
222207
case 'type': {
@@ -230,18 +215,26 @@ export class AIOHybridOperator extends Operator {
230215
}
231216
case 'hotkey':
232217
case 'press': {
233-
let keyStr = actionInputs?.key || actionInputs?.hotkey;
218+
const keyStr = actionInputs?.key || actionInputs?.hotkey;
234219
if (typeof keyStr !== 'string') {
235220
throw new Error('key string is required when press or hotkey');
236221
}
237-
keyStr = keyStr.toLowerCase();
238-
const keys = (keyStr as string).split(/[\s+]/).filter((k) => k.length > 0);
239-
if (keys.length > 1) {
240-
await this.aioComputer.hotkey(keys);
222+
const lowerKeyStr: string = keyStr.toLowerCase();
223+
const keys = lowerKeyStr.split(/\s+/).filter((k) => k.length > 0);
224+
225+
// Validate and map each key in the hotkey combination
226+
const mappedKeys = keys.map((key) => {
227+
return keyNameMap[key as keyof typeof keyNameMap] || key;
228+
});
229+
if (mappedKeys.length === 0) {
230+
throw new Error('key string is required when press or hotkey');
231+
}
232+
233+
this.logger.info('Press/hotkey action mappedKeys:', mappedKeys.join('+'));
234+
if (mappedKeys.length > 1) {
235+
await this.aioComputer.hotkey(mappedKeys);
241236
} else {
242-
// Check if the key can be mapped using arrowKeyMap
243-
const mappedKey = arrowKeyMap[keyStr as keyof typeof arrowKeyMap] || keyStr;
244-
await this.aioComputer.press(mappedKey);
237+
await this.aioComputer.press(mappedKeys[0]);
245238
}
246239
break;
247240
}
@@ -343,9 +336,9 @@ export class AIOHybridOperator extends Operator {
343336
}
344337
const { realX: startX, realY: startY } = await this.calculateRealCoords(startPoint);
345338
const { realX: endX, realY: endY } = await this.calculateRealCoords(endPoint);
346-
if (startX > endX || startY > endY) {
347-
throw new Error('start point must be top left of end point');
348-
}
339+
// if (startX > endX || startY > endY) {
340+
// throw new Error('start point must be top left of end point');
341+
// }
349342
// Move to start position, press mouse, drag to end position, release mouse
350343
await this.aioComputer.moveTo(startX, startY);
351344
await this.aioComputer.mouseDown();

multimodal/gui-agent/operator-aio/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
*/
55
export { AIOComputer } from './AIOComputer';
66
export { AIOHybridOperator } from './AIOHybridOperator';
7+
export { AIOGameOperator } from './AIOGameOperator';
78
export * from './types';

multimodal/gui-agent/operator-aio/src/types.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,7 @@ export interface AIOHybridOptions {
125125
timeout?: number;
126126
headers?: Record<string, string>;
127127
}
128+
129+
export interface AIOGameOptions extends AIOHybridOptions {
130+
targetUrl: string;
131+
}

multimodal/gui-agent/operator-aio/test/aioComputer.test.ts

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -435,28 +435,19 @@ describe('AIOComputer', () => {
435435
it('should handle HTTP errors', async () => {
436436
mockAioClient.browserActions.mockRejectedValue(new Error('HTTP 404: Not Found'));
437437

438-
const result = await aioComputer.click(100, 200);
439-
440-
expect(result.success).toBe(false);
441-
expect(result.message).toBe('HTTP 404: Not Found');
438+
await expect(aioComputer.click(100, 200)).rejects.toThrow('HTTP 404: Not Found');
442439
});
443440

444441
it('should handle network errors', async () => {
445442
mockAioClient.browserActions.mockRejectedValue(new Error('Network timeout'));
446443

447-
const result = await aioComputer.click(100, 200);
448-
449-
expect(result.success).toBe(false);
450-
expect(result.message).toBe('Network timeout');
444+
await expect(aioComputer.click(100, 200)).rejects.toThrow('Network timeout');
451445
});
452446

453447
it('should handle unknown errors', async () => {
454448
mockAioClient.browserActions.mockRejectedValue('Unknown error');
455449

456-
const result = await aioComputer.click(100, 200);
457-
458-
expect(result.success).toBe(false);
459-
expect(result.message).toBe('Unknown error');
450+
await expect(aioComputer.click(100, 200)).rejects.toThrow('Unknown error');
460451
});
461452
});
462453
});

multimodal/gui-agent/operator-nutjs/src/NutJSOperator.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,9 @@ export class NutJSOperator extends Operator {
174174
const { realX: startX, realY: startY } = await this.calculateRealCoords(startPoint);
175175
const { realX: endX, realY: endY } = await this.calculateRealCoords(endPoint);
176176
this.logger.info(`drag: start(${startX},${startY}) -> end(${endX},${endY})`);
177-
if (startX > endX || startY > endY) {
178-
throw new Error('start point must be top left of end point');
179-
}
177+
// if (startX > endX || startY > endY) {
178+
// throw new Error('start point must be top left of end point');
179+
// }
180180

181181
await this.moveStraightTo(startX, startY);
182182
await sleep(100);

0 commit comments

Comments
 (0)