Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 52 additions & 15 deletions multimodal/omni-tars/core/src/environments/prompt_t5.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* SPDX-License-Identifier: Apache-2.0
*/

import { AgentMode } from '../types';
import { getTimeString } from '../utils/hepler';
import { HOME_INSTRUCTION, PROXY_INSTRUCTION } from './code';

Expand All @@ -11,24 +12,32 @@ export const use_native_thinking = process.env.NATIVE_THINKING === 'true';
export const bypass_native_thinking = process.env.NATIVE_THINKING === 'bypass';

const think_budget = '\n';

const task_description = `\nCurrent time is: ${getTimeString()}\n
As a professional personal assistant (Doubao) capable of solving various user problems, you will first reason through a user's problem to devise a solution, flexibly using a series of tools in combination with your thinking to accomplish the task and provide an accurate, reliable answer. While thinking and using tools, you may continuously and flexibly adjust your solution approach based on the results of tool calls. \n`;

const gui_task_description = `\nCurrent time is: ${getTimeString()}\n
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n`;


//Mixed scenarios use this additional_notes
const additional_notes = '- Use english in your reasoning process.';
const omni_additional_notes = `- Use english in your reasoning process. \n
${HOME_INSTRUCTION}
${PROXY_INSTRUCTION}
`;
//Pure GUI scenarios use this additional_notes_gui
const additional_notes_gui = `- You can execute multiple actions within a single tool call. For example:\n<seed:tool_call>\n<function=example_function_1>\n<parameter=example_parameter_1>value_1</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n\n<function=example_function_2>\n<parameter=example_parameter_3>value_4</parameter>\n</function>\n</seed:tool_call>`;
const gui_additional_notes = `- You can execute multiple actions within a single tool call. For example:\n<seed:tool_call>\n<function=example_function_1>\n<parameter=example_parameter_1>value_1</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n\n<function=example_function_2>\n<parameter=example_parameter_3>value_4</parameter>\n</function>\n</seed:tool_call>`;

const mcp_functions = `
export const mcp_functions = `
{"type": "function", "name": "LinkReader", "description": "这是一个链接浏览工具,可以打开链接(可以是网页、pdf等)并根据需求描述汇总页面上的所有相关信息。建议对所有有价值的链接都调用该工具来获取信息,有价值的链接包括但不限于如下几种:1.任务中明确提供的网址,2.搜索结果提供的带有相关摘要的网址,3. 之前调用LinkReader返回的内容中包含的且判断可能含有有用信息的网址。请尽量避免自己凭空构造链接。", "parameters": {"properties": {"url": {"type": "string", "description": "目标链接,应该是一个完整的url(以 http 开头)"}, "description": {"type": "string", "description": "需求描述文本,详细描述在当前url内想要获取的内容"}}, "required": ["url", "description"]}}
{"type": "function", "name": "Search", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "搜索问题"}}, "required": ["query"]}, "description": "这是一个联网搜索工具,输入搜索问题,返回网页列表与对应的摘要信息。搜索问题应该简洁清晰,复杂问题应该拆解成多步并一步一步搜索。如果没有搜索到有用的页面,可以调整问题描述(如减少限定词、更换搜索思路)后再次搜索。搜索结果质量和语种有关,对于中文资源可以尝试输入中文问题,非中资源可以尝试使用英文或对应语种。"}
`;
const code_functions = `
export const code_functions = `
{"type": "function", "name": "execute_bash", "description": "Execute a bash command in the terminal.\n* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = \`python3 app.py > server.log 2>&1 &\`.\n* One command at a time: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use \`&&\` or \`;\` to chain them together.\n", "parameters": {"type": "object", "properties": {"command": {"type": "string", "description": "The bash command to execute. Can be empty string to view additional logs when previous exit code is \`-1\`. Can be \`C-c\` (Ctrl+C) to interrupt the currently running process. Note: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use \`&&\` or \`;\` to chain them together."}}, "required": ["command"]}}
{"type": "function", "name": "str_replace_editor", "description": "Custom editing tool for viewing, creating and editing files in plain-text format\n* State is persistent across command calls and discussions with the user\n* If \`path\` is a file, \`view\` displays the result of applying \`cat -n\`. If \`path\` is a directory, \`view\` lists non-hidden files and directories up to 2 levels deep\n* The \`create\` command cannot be used if the specified \`path\` already exists as a file\n* If a \`command\` generates a long output, it will be truncated and marked with \`<response clipped>\`\n* The \`undo_edit\` command will revert the last edit made to the file at \`path\`\n\nNotes for using the \`str_replace\` command:\n* The \`old_str\` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!\n* If the \`old_str\` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in \`old_str\` to make it unique\n* The \`new_str\` parameter should contain the edited lines that should replace the \`old_str\`\n", "parameters": {"type": "object", "properties": {"command": {"description": "The commands to run. Allowed options are: \`view\`, \`create\`, \`str_replace\`, \`insert\`, \`undo_edit\`.", "enum": ["view", "create", "str_replace", "insert", "undo_edit"], "type": "string"}, "path": {"description": "Absolute path to file or directory, e.g. \`/workspace/file.py\` or \`/workspace\`.", "type": "string"}, "file_text": {"description": "Required parameter of \`create\` command, with the content of the file to be created.", "type": "string"}, "old_str": {"description": "Required parameter of \`str_replace\` command containing the string in \`path\` to replace.", "type": "string"}, "new_str": {"description": "Optional parameter of \`str_replace\` command containing the new string (if not given, no string will be added). 
Required parameter of \`insert\` command containing the string to insert.", "type": "string"}, "insert_line": {"description": "Required parameter of \`insert\` command. The \`new_str\` will be inserted AFTER the line \`insert_line\` of \`path\`.", "type": "integer"}, "view_range": {"description": "Optional parameter of \`view\` command when \`path\` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting \`[start_line, -1]\` shows all lines from \`start_line\` to the end of the file.", "items": {"type": "integer"}, "type": "array"}}, "required": ["command", "path"]}}
{"type": "function", "name": "JupyterCI", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": "code"}, "timeout": {"type": "integer", "description": "timeout in seconds"}}, "required": ["code"]}, "description": " JupyterCI 一个保留状态的代码沙盒工具。你可以在此工具中运行python代码"}
`;
const gui_functions = `
export const gui_functions = `
{"type": "function", "name": "navigate", "parameters": {"type": "object", "properties": {"content": {"type": "string", "description": "The url to navigate to."}}, "required": ["content"]}, "description": "Navigate to a url."}
{"type": "function", "name": "navigate_back", "parameters": {"type": "object", "properties": {}, "required": []}, "description": "Navigate back to the previous page."}
{"type": "function", "name": "click", "parameters": {"type": "object", "properties": {"point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}}, "required": ["point"]}, "description": "Mouse left single click action."}
Expand All @@ -46,21 +55,27 @@ const gui_functions = `
{"type": "function", "name": "wait", "parameters": {"type": "object", "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}}, "required": []}, "description": "Wait for a while."}
`;



/**
 * Builds the role/task section of the system prompt.
 *
 * The result is three newline-separated parts: a fixed agent preamble, the
 * supplied task description, and a `<seed:cot_budget_reflect>` tag wrapping the
 * module-level `think_budget` value.
 *
 * @param description - Task description inserted verbatim after the preamble.
 * @returns The assembled prompt section.
 */
const createPROMPT2 = (description: string) => {
  const preamble = 'You are an agent designed to accomplish tasks.';
  const budgetTag = `<seed:cot_budget_reflect>${think_budget}</seed:cot_budget_reflect>`;
  return `${preamble}\n${description}\n${budgetTag}`;
};

/**
 * 3.1 Think prompt.
 *
 * Empty when `use_native_thinking` is set. Otherwise it tells the model to
 * reason first, wrapping that reasoning in `<think_token>` tags, and then answer.
 */
const PROMPT1 = use_native_thinking
  ? ''
  : [
      'You should first think about the reasoning process in the mind and then provide the user with the answer.',
      `The reasoning process is enclosed within <${think_token}> </${think_token}> tags, i.e. <${think_token}> reasoning process here </${think_token}> answer here`,
    ].join(' ');

/** 3.2 Role/Task Prompt */
const PROMPT2 = `You are an agent designed to accomplish tasks.
${task_description}
<seed:cot_budget_reflect>${think_budget}</seed:cot_budget_reflect>`;


/** 3.3 Action/Function Definition Prompt (this section is not needed when there are no functions) */
const PROMPT3 = `## Function Definition
const createPROMPT3 = (functions: string[], additionalNotes: string) => `## Function Definition

- You have access to the following functions:
${[mcp_functions, code_functions, gui_functions].join('')}
${functions.join('')}

- To call a function, use the following structure without any suffix:

Expand All @@ -81,10 +96,32 @@ multiple lines
- All required parameters must be explicitly provided.

## Additional Notes
${additional_notes}

${HOME_INSTRUCTION}
${PROXY_INSTRUCTION}
${additionalNotes}
`;

export const SYSTEM_PROMPT_GROUP = [PROMPT1, PROMPT2, PROMPT3].filter(Boolean);

// Default SYSTEM_PROMPT_GROUP for backwards compatibility (omni mode)
export const SYSTEM_PROMPT_GROUP = [
PROMPT1,
createPROMPT2(task_description),
createPROMPT3([mcp_functions, code_functions, gui_functions], omni_additional_notes)
];

/**
 * Selects the system prompt sections for an agent mode.
 *
 * - `'omni'` returns the shared `SYSTEM_PROMPT_GROUP` array itself (not a copy).
 * - `'gui'` returns a newly built group: the think prompt, the GUI task
 *   description, and a function section limited to the GUI functions and notes.
 * - Any other runtime value yields an empty array.
 *
 * @param agentMode - The agent mode; defaults to `'omni'`.
 * @returns Array of prompt strings for that mode.
 */
export const createSystemPromptGroup = (agentMode: AgentMode = 'omni'): string[] => {
  switch (agentMode) {
    case 'omni':
      return SYSTEM_PROMPT_GROUP;
    case 'gui':
      return [
        PROMPT1,
        createPROMPT2(gui_task_description),
        createPROMPT3([gui_functions], gui_additional_notes),
      ];
    default:
      return [];
  }
};
4 changes: 2 additions & 2 deletions multimodal/omni-tars/core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ export { ComposableAgent } from './ComposableAgent';
export type { ComposableAgentOptions } from './ComposableAgent';
export { createComposableToolCallEngineFactory } from './ComposableToolCallEngineFactory';
export { ToolCallEngineProvider } from './types';
export type { ToolCallEngineContext } from './types';
export type { ToolCallEngineContext, AgentMode } from './types';

export { CODE_ENVIRONMENT } from './environments/code';
export { MCP_ENVIRONMENT } from './environments/mcp';
Expand All @@ -18,5 +18,5 @@ export { AgentPlugin } from './AgentPlugin';
export { parseCodeContent, parseComputerContent, parseMcpContent } from './utils/parser';
export * from './utils/streamingParser';
export * from './utils/streamingParserT5';
export { SYSTEM_PROMPT_GROUP, think_token } from './environments/prompt_t5';
export { SYSTEM_PROMPT_GROUP, createSystemPromptGroup, think_token } from './environments/prompt_t5';
export { getAioUrl, extractAioPort } from './utils/hepler';
3 changes: 3 additions & 0 deletions multimodal/omni-tars/core/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,6 @@ export interface ToolCallEngineCompositionConfig {
/** Default engine to use when no specific engine matches */
defaultEngine?: ToolCallEngineProvider;
}


/**
 * Operating mode of the agent.
 *
 * - `'omni'`: combined mode using the MCP, code and GUI capabilities together.
 * - `'gui'`: GUI capabilities only.
 */
export type AgentMode = 'omni' | 'gui';
18 changes: 11 additions & 7 deletions multimodal/omni-tars/gui-agent/src/GUIAgentToolCallEngine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,15 @@ import {
MultimodalToolCallResult,
AgentEventStream,
ChatCompletionMessageParam,
ChatCompletionMessageToolCall,
ParsedModelResponse,
StreamProcessingState,
StreamChunkResult,
} from '@tarko/agent-interface';
import { actionParser, actionStringParser } from '@gui-agent/action-parser';
import { getScreenInfo } from './shared';
import {
processT5StreamingChunk as omniProcessStreamingChunk,
T5StreamProcessingState as OmniStreamProcessingState,
createT5InitState as createInitState,
SYSTEM_PROMPT_GROUP,
createSystemPromptGroup,
AgentMode,
} from '@omni-tars/core';
import { getLogger } from '@tarko/agent';
import { GUIAgentT5Adapter } from './GUIAgentT5Adapter';
Expand All @@ -37,12 +34,19 @@ import { GUIAgentT5Adapter } from './GUIAgentT5Adapter';
export class GUIAgentToolCallEngine extends ToolCallEngine {
private logger = getLogger('GUIAgentToolCallEngine');
private t5Adapter = new GUIAgentT5Adapter(this.logger);
private agentMode: AgentMode;

/**
 * Creates the engine for a given agent mode.
 *
 * The mode is taken from the first positional argument. Because arguments
 * arrive as `unknown`, the value is validated at runtime: only `'omni'` and
 * `'gui'` are accepted, and anything else (including a missing argument)
 * falls back to `'gui'`.
 *
 * @param args - Positional arguments; only `args[0]` is read.
 */
constructor(...args: unknown[]) {
  super();
  const [candidate] = args;
  // Narrow with an equality check rather than `as AgentMode`, so an invalid
  // value can never be stored and later produce an empty prompt group.
  this.agentMode = candidate === 'omni' || candidate === 'gui' ? candidate : 'gui';
}

/**
* Prepare system prompt with tool information and instructions
* Prepare system prompt with tool information and instructions based on agent mode
*/
preparePrompt(instructions: string, tools: Tool[]) {
return SYSTEM_PROMPT_GROUP;
return createSystemPromptGroup(this.agentMode);
}

/**
Expand Down
11 changes: 9 additions & 2 deletions multimodal/omni-tars/gui-agent/src/GuiToolCallEngineProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,24 @@
* Provides optimized tool call engine for GUI automation and computer use tasks
*/

import { ToolCallEngineProvider, ToolCallEngineContext } from '@omni-tars/core';
import { ToolCallEngineProvider, ToolCallEngineContext, AgentMode } from '@omni-tars/core';
import { GUIAgentToolCallEngine } from './GUIAgentToolCallEngine';

export class GuiToolCallEngineProvider extends ToolCallEngineProvider<GUIAgentToolCallEngine> {
readonly name = 'gui-tool-call-engine';
readonly priority = 90; // High priority for GUI tasks
readonly description =
'Tool call engine optimized for GUI automation, computer use, and visual interface interactions';

private agentMode: AgentMode;

/**
 * @param agentMode - Agent mode stored on the provider and handed to the
 *   `GUIAgentToolCallEngine` it creates; defaults to `'gui'`.
 */
constructor(agentMode: AgentMode = 'gui') {
super();
this.agentMode = agentMode;
}

protected createEngine(): GUIAgentToolCallEngine {
return new GUIAgentToolCallEngine();
return new GUIAgentToolCallEngine(this.agentMode);
}

canHandle(context: ToolCallEngineContext): boolean {
Expand Down
56 changes: 5 additions & 51 deletions multimodal/omni-tars/omni-agent/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,12 @@
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import {
codePluginBuilder,
CodeToolCallEngineProvider,
CodeAgentExtraOption,
} from '@omni-tars/code-agent';
import {
mcpPluginBuilder,
McpToolCallEngineProvider,
MCPTarsExtraOption,
} from '@omni-tars/mcp-agent';
import { GuiAgentPlugin, GuiToolCallEngineProvider, OperatorManager } from '@omni-tars/gui-agent';
import { ComposableAgent, createComposableToolCallEngineFactory } from '@omni-tars/core';
import { AgentOptions } from '@tarko/agent';
import { AgentWebUIImplementation } from '@tarko/interface';

const mcpToolCallEngine = new McpToolCallEngineProvider();

const toolCallEngine = createComposableToolCallEngineFactory({
engines: [new GuiToolCallEngineProvider(), mcpToolCallEngine, new CodeToolCallEngineProvider()],
defaultEngine: mcpToolCallEngine,
});
import { ComposableAgent } from '@omni-tars/core';
import { AgentWebUIImplementation } from '@tarko/interface';
import { getComposableOption, OmniTarsOption } from './options';

const sandboxBaseUrl = process.env.AIO_SANDBOX_URL ?? '.';

type OmniTarsOption = AgentOptions & MCPTarsExtraOption & CodeAgentExtraOption;

export default class OmniTARSAgent extends ComposableAgent {
static label = 'Omni Agent';

Expand Down Expand Up @@ -76,33 +56,7 @@ export default class OmniTARSAgent extends ComposableAgent {
},
};

constructor(options: OmniTarsOption) {
const {
tavilyApiKey,
googleApiKey,
googleMcpUrl,
sandboxUrl,
ignoreSandboxCheck,
linkReaderAK,
linkReaderMcpUrl,
...restOptions
} = options;
super({
...restOptions,
plugins: [
mcpPluginBuilder({
tavilyApiKey,
googleApiKey,
googleMcpUrl,
linkReaderAK,
linkReaderMcpUrl,
}),
codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
new GuiAgentPlugin({ operatorManager: OperatorManager.createHybird(options.sandboxUrl) }),
],
toolCallEngine,
maxTokens: 32768,
enableStreamingToolCallEvents: true,
});
/**
 * @param option - Omni-TARS options; converted by `getComposableOption` and the
 *   result is passed to the `ComposableAgent` constructor.
 */
constructor(option: OmniTarsOption) {
super(getComposableOption(option));
}
}
68 changes: 68 additions & 0 deletions multimodal/omni-tars/omni-agent/src/options.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/

import { CodeAgentExtraOption, codePluginBuilder, CodeToolCallEngineProvider } from "@omni-tars/code-agent";
import { AgentMode, ComposableAgentOptions, createComposableToolCallEngineFactory } from "@omni-tars/core";
import { GuiAgentPlugin, GuiToolCallEngineProvider, OperatorManager } from "@omni-tars/gui-agent";
import { mcpPluginBuilder, MCPTarsExtraOption, McpToolCallEngineProvider } from "@omni-tars/mcp-agent";
import { AgentAppConfig } from "@tarko/interface";


// Single MCP provider instance: it is both a member of the omni engine list and that factory's default engine.
const mcpToolCallEngine = new McpToolCallEngineProvider();

// Engine factory for 'omni' mode: GUI provider (constructed with 'omni'), MCP provider and code provider,
// with the MCP provider as the default engine.
const omniToolCallEngine = createComposableToolCallEngineFactory({
engines: [new GuiToolCallEngineProvider('omni'), mcpToolCallEngine, new CodeToolCallEngineProvider()],
defaultEngine: mcpToolCallEngine,
});

// Engine factory for 'gui' mode: only the GUI provider (constructed with 'gui'); no default engine is configured.
const guiToolCallEngine = createComposableToolCallEngineFactory({ engines: [new GuiToolCallEngineProvider('gui')] });


/**
 * Options accepted by the Omni-TARS agent: the base `AgentAppConfig` combined
 * with the MCP and code-agent extra options, plus the agent mode.
 */
export type OmniTarsOption = AgentAppConfig & MCPTarsExtraOption & CodeAgentExtraOption & {
// Selects which tool call engine and plugins are configured.
// NOTE(review): declared required, yet getComposableOption defaults a missing value to 'omni' —
// confirm whether this should be optional.
agentMode: AgentMode
};


/**
 * Converts Omni-TARS options into the options object passed to `ComposableAgent`.
 *
 * The MCP/code-specific fields and `agentMode` are stripped from the input; all
 * remaining fields are forwarded unchanged. `maxTokens` is fixed to 32768 and
 * streaming tool-call events are enabled. The tool call engine and plugins
 * depend on the mode:
 *
 * - `'gui'`: the GUI-only engine and a single GUI plugin.
 * - anything else: the omni engine with MCP, code and GUI plugins. This covers
 *   `'omni'`, a missing `agentMode`, and any unrecognised runtime value, so the
 *   result always carries an engine and plugins.
 *
 * @param options - User-facing agent options.
 * @returns Options ready to be handed to `ComposableAgent`.
 */
export function getComposableOption(options: OmniTarsOption) {
  const {
    tavilyApiKey,
    googleApiKey,
    googleMcpUrl,
    sandboxUrl,
    ignoreSandboxCheck,
    linkReaderAK,
    linkReaderMcpUrl,
    agentMode = 'omni',
    ...restOptions
  } = options;

  // Factory rather than an eager instance, so the GUI plugin is still
  // constructed at its position in the plugin list (after MCP and code in omni mode).
  const createGuiPlugin = () =>
    new GuiAgentPlugin({ operatorManager: OperatorManager.createHybird(sandboxUrl) });

  const baseOptions: Partial<ComposableAgentOptions> = {
    ...restOptions,
    maxTokens: 32768,
    enableStreamingToolCallEvents: true,
  };

  if (agentMode === 'gui') {
    baseOptions.toolCallEngine = guiToolCallEngine;
    baseOptions.plugins = [createGuiPlugin()];
  } else {
    // Plain `else` instead of `else if (agentMode === 'omni')`: the mode can come
    // from runtime settings, and an unexpected value must not leave the agent
    // without an engine or plugins.
    baseOptions.toolCallEngine = omniToolCallEngine;
    baseOptions.plugins = [
      mcpPluginBuilder({
        tavilyApiKey,
        googleApiKey,
        googleMcpUrl,
        linkReaderAK,
        linkReaderMcpUrl,
      }),
      codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
      createGuiPlugin(),
    ];
  }

  return baseOptions as ComposableAgentOptions;
}
15 changes: 14 additions & 1 deletion multimodal/tarko/agent-server-next/examples/bootstrap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,19 @@ const server = new AgentServer({
type: process.env.NATIVE_THINKING === 'true' ? 'enabled' : 'disabled',
},
server: {
runtimeSettings: {
schema: {
type: 'object',
properties: {
agentMode: {
type: 'string',
title: 'Agent Mode',
enum: ['omni', 'gui'],
default: 'omni',
},
},
},
},
storage: {
type: 'mongodb',
uri: process.env.MONGO_URI,
Expand Down Expand Up @@ -80,7 +93,7 @@ const server = new AgentServer({
webui: {
type: 'remote',
remoteUrl: process.env.WEBUI_REMOTE_URL,
}
},
},
});

Expand Down
2 changes: 1 addition & 1 deletion multimodal/tarko/agent-server-next/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@
"typescript": "^5.5.3",
"vitest": "3.2.4"
}
}
}
Loading