Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 36 additions & 18 deletions multimodal/omni-tars/core/src/environments/prompt_t5.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* SPDX-License-Identifier: Apache-2.0
*/

import { getLogger } from '@tarko/agent';
import { AgentMode } from '../types';
import { getTimeString } from '../utils/hepler';
import { HOME_INSTRUCTION, PROXY_INSTRUCTION } from './code';
Expand All @@ -11,6 +12,7 @@ export const think_token = process.env.THINK_TOKEN || 'thinkt';
/** True only when the NATIVE_THINKING environment variable is exactly 'true'. */
export const use_native_thinking = process.env.NATIVE_THINKING === 'true';
/** True only when the NATIVE_THINKING environment variable is exactly 'bypass'. */
export const bypass_native_thinking = process.env.NATIVE_THINKING === 'bypass';

// Module-scoped logger registered under the name 'prompt_t5'.
const logger = getLogger('prompt_t5');
// Text placed inside the <seed:cot_budget_reflect> tag by createPROMPT2; currently a single newline.
const think_budget = '\n';

const task_description = `\nCurrent time is: ${getTimeString()}\n
Expand All @@ -19,6 +21,17 @@ As a professional personal assistant (Doubao) capable of solving various user pr
/**
 * Task description used for the GUI agent prompt: a "Current time" line
 * followed by the GUI-agent role statement.
 *
 * NOTE(review): `getTimeString()` is interpolated when this module-level
 * constant is initialised, so the time is captured once rather than per
 * request — confirm that is acceptable for long-running processes.
 */
const gui_task_description = `\nCurrent time is: ${getTimeString()}\n
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n`;

/**
 * Task description used for the 'game' agent mode.
 *
 * The text asks the model to wrap its reasoning in `think_token` tags before
 * answering, lists the available think modes, offers "Unrestricted think" as
 * the provided mode, and then states the screenshot-driven next-action task.
 * Unlike `gui_task_description`, it contains no "Current time" line.
 */
const game_task_description = `You should begin by detailing the internal reasoning process, and then present the answer to the user. The reasoning process should be enclosed within <${think_token}> </${think_token}> tags, as follows:
<${think_token}> reasoning process here </${think_token}> answer here.
You have different modes of thinking:
Unrestricted think mode: Engage in an internal thinking process with thorough reasoning and reflections. You have an unlimited budget for thinking tokens and can continue thinking until you fully solve the problem.
Efficient think mode: Provide a concise internal thinking process with efficient reasoning and reflections. You don't have a strict token budget but be less verbose and more direct in your thinking.
No think mode: Respond directly to the question without any internal reasoning process or extra thinking tokens. Still follow the template with the minimum required thinking tokens to justify the answer.
Budgeted think mode: Limit your internal reasoning and reflections to stay within the specified token budget
Based on the complexity of the problem, select the appropriate mode for reasoning among the provided options listed below.
Provided Mode(s):
Unrestricted think.
You are provided with a task description, a history of previous actions, and corresponding screenshots. Your goal is to perform the next action to complete the task. Please note that if performing the same action multiple times results in a static screen with no changes, you should attempt a modified or alternative action.`;

//Mixed scenarios use this additional_notes
const omni_additional_notes = `- Use english in your reasoning process. \n
Expand Down Expand Up @@ -55,13 +68,11 @@ export const gui_functions = `
{"type": "function", "name": "wait", "parameters": {"type": "object", "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}}, "required": []}, "description": "Wait for a while."}
`;



/**
 * Builds one prompt section: a fixed "agent designed to accomplish tasks"
 * preamble, the supplied description, and a `<seed:cot_budget_reflect>` tag
 * wrapping `think_budget`.
 * @param description - Task description text inserted after the preamble
 * @returns The assembled prompt string
 */
const createPROMPT2 = (description: string) => {
return `You are an agent designed to accomplish tasks.
return `You are an agent designed to accomplish tasks.
${description}
<seed:cot_budget_reflect>${think_budget}</seed:cot_budget_reflect>`;
}
};

/** 3.1 Think Prompt */
const PROMPT1 = use_native_thinking
Expand All @@ -70,7 +81,6 @@ const PROMPT1 = use_native_thinking

/** 3.2 Role/Task Prompt */


/** 3.3 Action/Function Definition Prompt (this prompt is not needed when there are no functions) */
const createPROMPT3 = (functions: string[], additionalNotes: string) => `## Function Definition

Expand Down Expand Up @@ -99,29 +109,37 @@ multiple lines
${additionalNotes}
`;


// Default SYSTEM_PROMPT_GROUP for backwards compatibility (omni mode).
// Built once at module initialisation from PROMPT1, the general task
// description, and the function-definition prompt covering the MCP, code and
// GUI function lists with the omni additional notes.
export const SYSTEM_PROMPT_GROUP = [
PROMPT1,
PROMPT1,
createPROMPT2(task_description),
createPROMPT3([mcp_functions, code_functions, gui_functions], omni_additional_notes)
createPROMPT3([mcp_functions, code_functions, gui_functions], omni_additional_notes),
];

/**
 * Create system prompt group based on agent mode
 * @param agentMode - The agent mode; the `switch` dispatches on `agentMode.id`
 *   ('omni', 'gui' or 'game') and the value is logged before dispatch
 * @returns Array of prompt strings; an empty array when no branch matches
 *
 * NOTE(review): an unmatched mode silently yields `[]` (no system prompt) —
 * confirm callers handle that, or consider failing loudly on unknown ids.
 */
export const createSystemPromptGroup = (agentMode: AgentMode = 'omni'): string[] => {
if (agentMode === 'omni') {
return SYSTEM_PROMPT_GROUP;
}else if(agentMode === 'gui') {
return [
PROMPT1,
createPROMPT2(gui_task_description),
createPROMPT3([gui_functions], gui_additional_notes)
]
export const createSystemPromptGroup = (agentMode: AgentMode): string[] => {
logger.info('agentMode: ', agentMode);

switch (agentMode.id) {
case 'omni':
return SYSTEM_PROMPT_GROUP;
case 'gui':
return [
PROMPT1,
createPROMPT2(gui_task_description),
createPROMPT3([gui_functions], gui_additional_notes),
];
case 'game':
return [
PROMPT1,
createPROMPT2(game_task_description),
createPROMPT3([gui_functions], gui_additional_notes),
];
}

return []
return [];
};
7 changes: 5 additions & 2 deletions multimodal/omni-tars/core/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,5 +62,8 @@ export interface ToolCallEngineCompositionConfig {
defaultEngine?: ToolCallEngineProvider;
}


export type AgentMode = 'omni' | 'gui';
/**
 * Describes which agent configuration to run.
 *
 * - `id`: the mode selector; one of 'omni', 'gui' or 'game'.
 * - `link`: optional string — NOTE(review): its purpose is not visible here;
 *   presumably a URL associated with the mode, confirm against callers.
 * - `browserMode`: optional browser control strategy ('dom',
 *   'visual-grounding' or 'hybrid').
 */
export type AgentMode = {
id: 'omni' | 'gui' | 'game';
link?: string;
browserMode?: 'dom' | 'visual-grounding' | 'hybrid';
};
4 changes: 2 additions & 2 deletions multimodal/omni-tars/gui-agent/src/GUIAgentToolCallEngine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ export class GUIAgentToolCallEngine extends ToolCallEngine {

/**
 * Reads the agent mode from the first constructor argument and stores it on
 * the instance.
 *
 * NOTE(review): `args[0]` is only type-asserted, never checked at runtime.
 * The variant that assigns it directly (without the `|| 'gui'` fallback)
 * leaves `this.agentMode` undefined when no argument is supplied — confirm
 * every construction site passes a mode.
 */
constructor(...args: unknown[]) {
super();
const agentMode = args[0] as AgentMode | undefined;
this.agentMode = agentMode || 'gui';
const agentMode = args[0] as AgentMode;
this.agentMode = agentMode;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,9 @@ export class GuiToolCallEngineProvider extends ToolCallEngineProvider<GUIAgentTo
readonly priority = 90; // High priority for GUI tasks
readonly description =
'Tool call engine optimized for GUI automation, computer use, and visual interface interactions';

private agentMode: AgentMode;

/**
 * Stores the supplied agent mode on this provider.
 * @param agentMode - Agent mode kept in the private `agentMode` field
 */
constructor(agentMode: AgentMode = 'gui') {
constructor(agentMode: AgentMode) {
super();
this.agentMode = agentMode;
}
Expand Down
64 changes: 30 additions & 34 deletions multimodal/omni-tars/omni-agent/src/options.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,10 @@ import {
} from '@omni-tars/mcp-agent';
import { AgentAppConfig } from '@tarko/interface';

const mcpToolCallEngine = new McpToolCallEngineProvider();

const omniToolCallEngine = createComposableToolCallEngineFactory({
engines: [
new GuiToolCallEngineProvider('omni'),
mcpToolCallEngine,
new CodeToolCallEngineProvider(),
],
defaultEngine: mcpToolCallEngine,
});

const guiToolCallEngine = createComposableToolCallEngineFactory({
engines: [new GuiToolCallEngineProvider('gui')],
});

/**
 * Options accepted by `getComposableOption`: the base `AgentAppConfig`
 * intersected with the MCP and code agent extra options, plus the agent mode
 * selection.
 */
export type OmniTarsOption = AgentAppConfig &
MCPTarsExtraOption &
CodeAgentExtraOption & {
agentMode: AgentMode;
browserMode: 'dom' | 'visual-grounding' | 'hybrid';
};

export function getComposableOption(options: OmniTarsOption) {
Expand All @@ -52,8 +36,7 @@ export function getComposableOption(options: OmniTarsOption) {
ignoreSandboxCheck,
linkReaderAK,
linkReaderMcpUrl,
agentMode = 'omni',
browserMode = 'hybrid',
agentMode = { id: 'omni' },
...restOptions
} = options;

Expand All @@ -67,22 +50,35 @@ export function getComposableOption(options: OmniTarsOption) {
operatorManager: OperatorManager.createHybird(options.sandboxUrl),
});

if (agentMode === 'gui') {
baseOptions.toolCallEngine = guiToolCallEngine;
baseOptions.plugins = [guiPlugin];
} else if (agentMode === 'omni') {
baseOptions.toolCallEngine = omniToolCallEngine;
baseOptions.plugins = [
mcpPluginBuilder({
tavilyApiKey,
googleApiKey,
googleMcpUrl,
linkReaderAK,
linkReaderMcpUrl,
}),
codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
guiPlugin,
];
switch (agentMode.id) {
case 'game':
case 'gui':
baseOptions.toolCallEngine = createComposableToolCallEngineFactory({
engines: [new GuiToolCallEngineProvider(agentMode)],
});
baseOptions.plugins = [guiPlugin];
break;
case 'omni':
default:
baseOptions.toolCallEngine = createComposableToolCallEngineFactory({
engines: [
new GuiToolCallEngineProvider(agentMode),
new McpToolCallEngineProvider(),
new CodeToolCallEngineProvider(),
],
});
baseOptions.plugins = [
mcpPluginBuilder({
tavilyApiKey,
googleApiKey,
googleMcpUrl,
linkReaderAK,
linkReaderMcpUrl,
}),
codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
guiPlugin,
];
break;
}

return baseOptions as ComposableAgentOptions;
Expand Down
14 changes: 11 additions & 3 deletions multimodal/tarko/agent-server-next/examples/bootstrap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,11 @@ const server = new AgentServer({
agentMode: {
type: 'string',
title: 'Agent Mode',
enum: ['omni', 'gui'],
enumLabels: ['Omni', 'GUI'],
enum: ['omni', 'gui', 'game'],
enumLabels: ['Omni', 'GUI', 'Game'],
default: 'omni',
placement: 'chat-bottom',
},

browserMode: {
type: 'string',
title: 'Browser Control',
Expand All @@ -61,6 +60,15 @@ const server = new AgentServer({
},
},
},
// Reshapes the flat runtime settings into the nested `agentMode` object:
// `agentMode` becomes `id` and `browserMode` is copied alongside it.
// NOTE(review): both values are typed `unknown` and are forwarded without
// validation, and `link` is a hard-coded URL that looks like a placeholder —
// confirm before reusing this example outside of demos.
transform: (runtimeSettings: Record<string, unknown>) => {
return {
agentMode: {
id: runtimeSettings.agentMode,
browserMode: runtimeSettings.browserMode,
link: 'http://example.com',
},
};
},
},
storage: {
type: 'mongodb',
Expand Down
Loading