Skip to content

Commit f3ec7d4

Browse files
authored
feat(omni-agent): agentMode structure update; add game mode support (#1649)
1 parent b8e3d68 commit f3ec7d4

File tree

6 files changed

+85
-61
lines changed

6 files changed

+85
-61
lines changed

multimodal/omni-tars/core/src/environments/prompt_t5.ts

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

6+
import { getLogger } from '@tarko/agent';
67
import { AgentMode } from '../types';
78
import { getTimeString } from '../utils/hepler';
89
import { HOME_INSTRUCTION, PROXY_INSTRUCTION } from './code';
@@ -11,6 +12,7 @@ export const think_token = process.env.THINK_TOKEN || 'thinkt';
1112
export const use_native_thinking = process.env.NATIVE_THINKING === 'true';
1213
export const bypass_native_thinking = process.env.NATIVE_THINKING === 'bypass';
1314

15+
const logger = getLogger('prompt_t5');
1416
const think_budget = '\n';
1517

1618
const task_description = `\nCurrent time is: ${getTimeString()}\n
@@ -19,6 +21,17 @@ As a professional personal assistant (Doubao) capable of solving various user pr
1921
const gui_task_description = `\nCurrent time is: ${getTimeString()}\n
2022
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n`;
2123

24+
const game_task_description = `You should begin by detailing the internal reasoning process, and then present the answer to the user. The reasoning process should be enclosed within <${think_token}> </${think_token}> tags, as follows:
25+
<${think_token}> reasoning process here </${think_token}> answer here.
26+
You have different modes of thinking:
27+
Unrestricted think mode: Engage in an internal thinking process with thorough reasoning and reflections. You have an unlimited budget for thinking tokens and can continue thinking until you fully solve the problem.
28+
Efficient think mode: Provide a concise internal thinking process with efficient reasoning and reflections. You don't have a strict token budget but be less verbose and more direct in your thinking.
29+
No think mode: Respond directly to the question without any internal reasoning process or extra thinking tokens. Still follow the template with the minimum required thinking tokens to justify the answer.
30+
Budgeted think mode: Limit your internal reasoning and reflections to stay within the specified token budget
31+
Based on the complexity of the problem, select the appropriate mode for reasoning among the provided options listed below.
32+
Provided Mode(s):
33+
Unrestricted think.
34+
You are provided with a task description, a history of previous actions, and corresponding screenshots. Your goal is to perform the next action to complete the task. Please note that if performing the same action multiple times results in a static screen with no changes, you should attempt a modified or alternative action.`;
2235

2336
//Mixed scenarios use this additional_notes
2437
const omni_additional_notes = `- Use english in your reasoning process. \n
@@ -55,13 +68,11 @@ export const gui_functions = `
5568
{"type": "function", "name": "wait", "parameters": {"type": "object", "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}}, "required": []}, "description": "Wait for a while."}
5669
`;
5770

58-
59-
6071
const createPROMPT2 = (description: string) => {
61-
return `You are an agent designed to accomplish tasks.
72+
return `You are an agent designed to accomplish tasks.
6273
${description}
6374
<seed:cot_budget_reflect>${think_budget}</seed:cot_budget_reflect>`;
64-
}
75+
};
6576

6677
/** 3.1 Think Prompt */
6778
const PROMPT1 = use_native_thinking
@@ -70,7 +81,6 @@ const PROMPT1 = use_native_thinking
7081

7182
/** 3.2 Role/Task Prompt */
7283

73-
7484
/** 3.3 Action/Function Definition Prompt (this prompt is not needed when there are no functions) */
7585
const createPROMPT3 = (functions: string[], additionalNotes: string) => `## Function Definition
7686
@@ -99,29 +109,37 @@ multiple lines
99109
${additionalNotes}
100110
`;
101111

102-
103112
// Default SYSTEM_PROMPT_GROUP for backwards compatibility (omni mode)
104113
export const SYSTEM_PROMPT_GROUP = [
105-
PROMPT1,
114+
PROMPT1,
106115
createPROMPT2(task_description),
107-
createPROMPT3([mcp_functions, code_functions, gui_functions], omni_additional_notes)
116+
createPROMPT3([mcp_functions, code_functions, gui_functions], omni_additional_notes),
108117
];
109118

110119
/**
111120
* Create system prompt group based on agent mode
112121
* @param agentMode - The agent mode object; its `id` ('omni', 'gui', or 'game') selects which prompt group is built
113122
* @returns Array of prompt strings
114123
*/
115-
export const createSystemPromptGroup = (agentMode: AgentMode = 'omni'): string[] => {
116-
if (agentMode === 'omni') {
117-
return SYSTEM_PROMPT_GROUP;
118-
}else if(agentMode === 'gui') {
119-
return [
120-
PROMPT1,
121-
createPROMPT2(gui_task_description),
122-
createPROMPT3([gui_functions], gui_additional_notes)
123-
]
124+
export const createSystemPromptGroup = (agentMode: AgentMode): string[] => {
125+
logger.info('agentMode: ', agentMode);
126+
127+
switch (agentMode.id) {
128+
case 'omni':
129+
return SYSTEM_PROMPT_GROUP;
130+
case 'gui':
131+
return [
132+
PROMPT1,
133+
createPROMPT2(gui_task_description),
134+
createPROMPT3([gui_functions], gui_additional_notes),
135+
];
136+
case 'game':
137+
return [
138+
PROMPT1,
139+
createPROMPT2(game_task_description),
140+
createPROMPT3([gui_functions], gui_additional_notes),
141+
];
124142
}
125143

126-
return []
144+
return [];
127145
};

multimodal/omni-tars/core/src/types.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,5 +62,8 @@ export interface ToolCallEngineCompositionConfig {
6262
defaultEngine?: ToolCallEngineProvider;
6363
}
6464

65-
66-
export type AgentMode = 'omni' | 'gui';
65+
export type AgentMode = {
66+
id: 'omni' | 'gui' | 'game';
67+
link?: string;
68+
browserMode?: 'dom' | 'visual-grounding' | 'hybrid';
69+
};

multimodal/omni-tars/gui-agent/src/GUIAgentToolCallEngine.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ export class GUIAgentToolCallEngine extends ToolCallEngine {
3838

3939
constructor(...args: unknown[]) {
4040
super();
41-
const agentMode = args[0] as AgentMode | undefined;
42-
this.agentMode = agentMode || 'gui';
41+
const agentMode = args[0] as AgentMode;
42+
this.agentMode = agentMode;
4343
}
4444

4545
/**

multimodal/omni-tars/gui-agent/src/GuiToolCallEngineProvider.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,9 @@ export class GuiToolCallEngineProvider extends ToolCallEngineProvider<GUIAgentTo
1111
readonly priority = 90; // High priority for GUI tasks
1212
readonly description =
1313
'Tool call engine optimized for GUI automation, computer use, and visual interface interactions';
14-
1514
private agentMode: AgentMode;
1615

17-
constructor(agentMode: AgentMode = 'gui') {
16+
constructor(agentMode: AgentMode) {
1817
super();
1918
this.agentMode = agentMode;
2019
}

multimodal/omni-tars/omni-agent/src/options.ts

Lines changed: 30 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,26 +21,10 @@ import {
2121
} from '@omni-tars/mcp-agent';
2222
import { AgentAppConfig } from '@tarko/interface';
2323

24-
const mcpToolCallEngine = new McpToolCallEngineProvider();
25-
26-
const omniToolCallEngine = createComposableToolCallEngineFactory({
27-
engines: [
28-
new GuiToolCallEngineProvider('omni'),
29-
mcpToolCallEngine,
30-
new CodeToolCallEngineProvider(),
31-
],
32-
defaultEngine: mcpToolCallEngine,
33-
});
34-
35-
const guiToolCallEngine = createComposableToolCallEngineFactory({
36-
engines: [new GuiToolCallEngineProvider('gui')],
37-
});
38-
3924
export type OmniTarsOption = AgentAppConfig &
4025
MCPTarsExtraOption &
4126
CodeAgentExtraOption & {
4227
agentMode: AgentMode;
43-
browserMode: 'dom' | 'visual-grounding' | 'hybrid';
4428
};
4529

4630
export function getComposableOption(options: OmniTarsOption) {
@@ -52,8 +36,7 @@ export function getComposableOption(options: OmniTarsOption) {
5236
ignoreSandboxCheck,
5337
linkReaderAK,
5438
linkReaderMcpUrl,
55-
agentMode = 'omni',
56-
browserMode = 'hybrid',
39+
agentMode = { id: 'omni' },
5740
...restOptions
5841
} = options;
5942

@@ -67,22 +50,35 @@ export function getComposableOption(options: OmniTarsOption) {
6750
operatorManager: OperatorManager.createHybird(options.sandboxUrl),
6851
});
6952

70-
if (agentMode === 'gui') {
71-
baseOptions.toolCallEngine = guiToolCallEngine;
72-
baseOptions.plugins = [guiPlugin];
73-
} else if (agentMode === 'omni') {
74-
baseOptions.toolCallEngine = omniToolCallEngine;
75-
baseOptions.plugins = [
76-
mcpPluginBuilder({
77-
tavilyApiKey,
78-
googleApiKey,
79-
googleMcpUrl,
80-
linkReaderAK,
81-
linkReaderMcpUrl,
82-
}),
83-
codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
84-
guiPlugin,
85-
];
53+
switch (agentMode.id) {
54+
case 'game':
55+
case 'gui':
56+
baseOptions.toolCallEngine = createComposableToolCallEngineFactory({
57+
engines: [new GuiToolCallEngineProvider(agentMode)],
58+
});
59+
baseOptions.plugins = [guiPlugin];
60+
break;
61+
case 'omni':
62+
default:
63+
baseOptions.toolCallEngine = createComposableToolCallEngineFactory({
64+
engines: [
65+
new GuiToolCallEngineProvider(agentMode),
66+
new McpToolCallEngineProvider(),
67+
new CodeToolCallEngineProvider(),
68+
],
69+
});
70+
baseOptions.plugins = [
71+
mcpPluginBuilder({
72+
tavilyApiKey,
73+
googleApiKey,
74+
googleMcpUrl,
75+
linkReaderAK,
76+
linkReaderMcpUrl,
77+
}),
78+
codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
79+
guiPlugin,
80+
];
81+
break;
8682
}
8783

8884
return baseOptions as ComposableAgentOptions;

multimodal/tarko/agent-server-next/examples/bootstrap.ts

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,11 @@ const server = new AgentServer({
4141
agentMode: {
4242
type: 'string',
4343
title: 'Agent Mode',
44-
enum: ['omni', 'gui'],
45-
enumLabels: ['Omni', 'GUI'],
44+
enum: ['omni', 'gui', 'game'],
45+
enumLabels: ['Omni', 'GUI', 'Game'],
4646
default: 'omni',
4747
placement: 'chat-bottom',
4848
},
49-
5049
browserMode: {
5150
type: 'string',
5251
title: 'Browser Control',
@@ -61,6 +60,15 @@ const server = new AgentServer({
6160
},
6261
},
6362
},
63+
transform: (runtimeSettings: Record<string, unknown>) => {
64+
return {
65+
agentMode: {
66+
id: runtimeSettings.agentMode,
67+
browserMode: runtimeSettings.browserMode,
68+
link: 'http://example.com',
69+
},
70+
};
71+
},
6472
},
6573
storage: {
6674
type: 'mongodb',

0 commit comments

Comments
 (0)