Skip to content

Commit 8d6b08e

Browse files
authored
feat(tarko): add runtime settings api in server-next (#1634)
1 parent 7d9ca1f commit 8d6b08e

File tree

15 files changed

+313
-104
lines changed

15 files changed

+313
-104
lines changed

multimodal/omni-tars/core/src/environments/prompt_t5.ts

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

6+
import { AgentMode } from '../types';
67
import { getTimeString } from '../utils/hepler';
78
import { HOME_INSTRUCTION, PROXY_INSTRUCTION } from './code';
89

@@ -11,24 +12,32 @@ export const use_native_thinking = process.env.NATIVE_THINKING === 'true';
1112
export const bypass_native_thinking = process.env.NATIVE_THINKING === 'bypass';
1213

1314
// Text placed inside the <seed:cot_budget_reflect> tag of the role prompt; currently a single newline.
const think_budget = '\n';
15+
1416
// Role description for the mixed (omni) prompt group.
// NOTE(review): getTimeString() is invoked once, when this constant is initialized, so the
// embedded "Current time" is not refreshed afterwards — confirm this is intended.
const task_description = `\nCurrent time is: ${getTimeString()}\n
1517
As a professional personal assistant (Doubao) capable of solving various user problems, you will first reason through a user's problem to devise a solution, flexibly using a series of tools in combination with your thinking to accomplish the task and provide an accurate, reliable answer. While thinking and using tools, you may continuously and flexibly adjust your solution approach based on the results of tool calls. \n`;
1618

19+
// Role description for the GUI-only prompt group.
// NOTE(review): getTimeString() is invoked once, when this constant is initialized, so the
// embedded "Current time" is not refreshed afterwards — confirm this is intended.
const gui_task_description = `\nCurrent time is: ${getTimeString()}\n
20+
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n`;
21+
22+
1723
// Mixed (omni) scenarios use omni_additional_notes
18-
const additional_notes = '- Use english in your reasoning process.';
24+
const omni_additional_notes = `- Use english in your reasoning process. \n
25+
${HOME_INSTRUCTION}
26+
${PROXY_INSTRUCTION}
27+
`;
1928
// Pure GUI scenarios use gui_additional_notes
20-
const additional_notes_gui = `- You can execute multiple actions within a single tool call. For example:\n<seed:tool_call>\n<function=example_function_1>\n<parameter=example_parameter_1>value_1</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n\n<function=example_function_2>\n<parameter=example_parameter_3>value_4</parameter>\n</function>\n</seed:tool_call>`;
29+
const gui_additional_notes = `- You can execute multiple actions within a single tool call. For example:\n<seed:tool_call>\n<function=example_function_1>\n<parameter=example_parameter_1>value_1</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n\n<function=example_function_2>\n<parameter=example_parameter_3>value_4</parameter>\n</function>\n</seed:tool_call>`;
2130

22-
const mcp_functions = `
31+
export const mcp_functions = `
2332
{"type": "function", "name": "LinkReader", "description": "这是一个链接浏览工具,可以打开链接(可以是网页、pdf等)并根据需求描述汇总页面上的所有相关信息。建议对所有有价值的链接都调用该工具来获取信息,有价值的链接包括但不限于如下几种:1.任务中明确提供的网址,2.搜索结果提供的带有相关摘要的网址,3. 之前调用LinkReader返回的内容中包含的且判断可能含有有用信息的网址。请尽量避免自己凭空构造链接。", "parameters": {"properties": {"url": {"type": "string", "description": "目标链接,应该是一个完整的url(以 http 开头)"}, "description": {"type": "string", "description": "需求描述文本,详细描述在当前url内想要获取的内容"}}, "required": ["url", "description"]}}
2433
{"type": "function", "name": "Search", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "搜索问题"}}, "required": ["query"]}, "description": "这是一个联网搜索工具,输入搜索问题,返回网页列表与对应的摘要信息。搜索问题应该简洁清晰,复杂问题应该拆解成多步并一步一步搜索。如果没有搜索到有用的页面,可以调整问题描述(如减少限定词、更换搜索思路)后再次搜索。搜索结果质量和语种有关,对于中文资源可以尝试输入中文问题,非中资源可以尝试使用英文或对应语种。"}
2534
`;
26-
const code_functions = `
35+
export const code_functions = `
2736
{"type": "function", "name": "execute_bash", "description": "Execute a bash command in the terminal.\n* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = \`python3 app.py > server.log 2>&1 &\`.\n* One command at a time: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use \`&&\` or \`;\` to chain them together.\n", "parameters": {"type": "object", "properties": {"command": {"type": "string", "description": "The bash command to execute. Can be empty string to view additional logs when previous exit code is \`-1\`. Can be \`C-c\` (Ctrl+C) to interrupt the currently running process. Note: You can only execute one bash command at a time. If you need to run multiple commands sequentially, you can use \`&&\` or \`;\` to chain them together."}}, "required": ["command"]}}
2837
{"type": "function", "name": "str_replace_editor", "description": "Custom editing tool for viewing, creating and editing files in plain-text format\n* State is persistent across command calls and discussions with the user\n* If \`path\` is a file, \`view\` displays the result of applying \`cat -n\`. If \`path\` is a directory, \`view\` lists non-hidden files and directories up to 2 levels deep\n* The \`create\` command cannot be used if the specified \`path\` already exists as a file\n* If a \`command\` generates a long output, it will be truncated and marked with \`<response clipped>\`\n* The \`undo_edit\` command will revert the last edit made to the file at \`path\`\n\nNotes for using the \`str_replace\` command:\n* The \`old_str\` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!\n* If the \`old_str\` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in \`old_str\` to make it unique\n* The \`new_str\` parameter should contain the edited lines that should replace the \`old_str\`\n", "parameters": {"type": "object", "properties": {"command": {"description": "The commands to run. Allowed options are: \`view\`, \`create\`, \`str_replace\`, \`insert\`, \`undo_edit\`.", "enum": ["view", "create", "str_replace", "insert", "undo_edit"], "type": "string"}, "path": {"description": "Absolute path to file or directory, e.g. \`/workspace/file.py\` or \`/workspace\`.", "type": "string"}, "file_text": {"description": "Required parameter of \`create\` command, with the content of the file to be created.", "type": "string"}, "old_str": {"description": "Required parameter of \`str_replace\` command containing the string in \`path\` to replace.", "type": "string"}, "new_str": {"description": "Optional parameter of \`str_replace\` command containing the new string (if not given, no string will be added). 
Required parameter of \`insert\` command containing the string to insert.", "type": "string"}, "insert_line": {"description": "Required parameter of \`insert\` command. The \`new_str\` will be inserted AFTER the line \`insert_line\` of \`path\`.", "type": "integer"}, "view_range": {"description": "Optional parameter of \`view\` command when \`path\` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting \`[start_line, -1]\` shows all lines from \`start_line\` to the end of the file.", "items": {"type": "integer"}, "type": "array"}}, "required": ["command", "path"]}}
2938
{"type": "function", "name": "JupyterCI", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": "code"}, "timeout": {"type": "integer", "description": "timeout in seconds"}}, "required": ["code"]}, "description": " JupyterCI 一个保留状态的代码沙盒工具。你可以在此工具中运行python代码"}
3039
`;
31-
const gui_functions = `
40+
export const gui_functions = `
3241
{"type": "function", "name": "navigate", "parameters": {"type": "object", "properties": {"content": {"type": "string", "description": "The url to navigate to."}}, "required": ["content"]}, "description": "Navigate to a url."}
3342
{"type": "function", "name": "navigate_back", "parameters": {"type": "object", "properties": {}, "required": []}, "description": "Navigate back to the previous page."}
3443
{"type": "function", "name": "click", "parameters": {"type": "object", "properties": {"point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}}, "required": ["point"]}, "description": "Mouse left single click action."}
@@ -46,21 +55,27 @@ const gui_functions = `
4655
{"type": "function", "name": "wait", "parameters": {"type": "object", "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}}, "required": []}, "description": "Wait for a while."}
4756
`;
4857

58+
59+
60+
/**
 * Build the role/task prompt around a scenario-specific description.
 *
 * @param description - Task description inserted after the opening sentence.
 * @returns The role prompt, ending with the `<seed:cot_budget_reflect>` tag that wraps `think_budget`.
 */
const createPROMPT2 = (description: string) =>
  `You are an agent designed to accomplish tasks.
${description}
<seed:cot_budget_reflect>${think_budget}</seed:cot_budget_reflect>`;
65+
4966
/** 3.1 Think Prompt */
5067
const PROMPT1 = use_native_thinking
5168
? ``
5269
: `You should first think about the reasoning process in the mind and then provide the user with the answer. The reasoning process is enclosed within <${think_token}> </${think_token}> tags, i.e. <${think_token}> reasoning process here </${think_token}> answer here`;
5370

5471
/** 3.2 Role/Task Prompt */
55-
const PROMPT2 = `You are an agent designed to accomplish tasks.
56-
${task_description}
57-
<seed:cot_budget_reflect>${think_budget}</seed:cot_budget_reflect>`;
72+
5873

5974
/** 3.3 Action/Function Definition Prompt (this prompt is not needed when there are no functions) */
60-
const PROMPT3 = `## Function Definition
75+
const createPROMPT3 = (functions: string[], additionalNotes: string) => `## Function Definition
6176
6277
- You have access to the following functions:
63-
${[mcp_functions, code_functions, gui_functions].join('')}
78+
${functions.join('')}
6479
6580
- To call a function, use the following structure without any suffix:
6681
@@ -81,10 +96,32 @@ multiple lines
8196
- All required parameters must be explicitly provided.
8297
8398
## Additional Notes
84-
${additional_notes}
85-
86-
${HOME_INSTRUCTION}
87-
${PROXY_INSTRUCTION}
99+
${additionalNotes}
88100
`;
89101

90-
export const SYSTEM_PROMPT_GROUP = [PROMPT1, PROMPT2, PROMPT3].filter(Boolean);
102+
103+
/**
 * Default system prompt group, kept for backwards compatibility (omni mode).
 *
 * Combines the think prompt, the omni role prompt and the function-definition
 * prompt covering the MCP, code and GUI functions. `PROMPT1` is an empty string
 * when native thinking is enabled, so blank entries are filtered out and
 * consumers never receive an empty system prompt.
 */
export const SYSTEM_PROMPT_GROUP = [
  PROMPT1,
  createPROMPT2(task_description),
  createPROMPT3([mcp_functions, code_functions, gui_functions], omni_additional_notes),
].filter(Boolean);
109+
110+
/**
 * Create the system prompt group for the given agent mode.
 *
 * @param agentMode - The agent mode ('omni' or 'gui'); defaults to 'omni'.
 * @returns A fresh array of non-empty prompt strings; an empty array for an unrecognized mode.
 */
export const createSystemPromptGroup = (agentMode: AgentMode = 'omni'): string[] => {
  switch (agentMode) {
    case 'omni':
      // filter() returns a copy, so callers cannot mutate the shared default group,
      // and it drops a blank think prompt if one is present.
      return SYSTEM_PROMPT_GROUP.filter(Boolean);
    case 'gui':
      // PROMPT1 is '' when native thinking is enabled; do not emit it as a blank prompt.
      return [
        PROMPT1,
        createPROMPT2(gui_task_description),
        createPROMPT3([gui_functions], gui_additional_notes),
      ].filter(Boolean);
    default:
      // Only reachable when an untyped caller passes a value outside AgentMode.
      return [];
  }
};

multimodal/omni-tars/core/src/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ export { ComposableAgent } from './ComposableAgent';
77
export type { ComposableAgentOptions } from './ComposableAgent';
88
export { createComposableToolCallEngineFactory } from './ComposableToolCallEngineFactory';
99
export { ToolCallEngineProvider } from './types';
10-
export type { ToolCallEngineContext } from './types';
10+
export type { ToolCallEngineContext, AgentMode } from './types';
1111

1212
export { CODE_ENVIRONMENT } from './environments/code';
1313
export { MCP_ENVIRONMENT } from './environments/mcp';
@@ -18,5 +18,5 @@ export { AgentPlugin } from './AgentPlugin';
1818
export { parseCodeContent, parseComputerContent, parseMcpContent } from './utils/parser';
1919
export * from './utils/streamingParser';
2020
export * from './utils/streamingParserT5';
21-
export { SYSTEM_PROMPT_GROUP, think_token } from './environments/prompt_t5';
21+
export { SYSTEM_PROMPT_GROUP, createSystemPromptGroup, think_token } from './environments/prompt_t5';
2222
export { getAioUrl, extractAioPort } from './utils/hepler';

multimodal/omni-tars/core/src/types.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,6 @@ export interface ToolCallEngineCompositionConfig {
6161
/** Default engine to use when no specific engine matches */
6262
defaultEngine?: ToolCallEngineProvider;
6363
}
64+
65+
66+
/**
 * Agent operating mode.
 *
 * Selects which system prompt group and tool set an agent is built with:
 * 'omni' combines the MCP, code and GUI capabilities, 'gui' is GUI-only.
 */
export type AgentMode = 'omni' | 'gui';

multimodal/omni-tars/gui-agent/src/GUIAgentToolCallEngine.ts

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,15 @@ import {
88
MultimodalToolCallResult,
99
AgentEventStream,
1010
ChatCompletionMessageParam,
11-
ChatCompletionMessageToolCall,
1211
ParsedModelResponse,
13-
StreamProcessingState,
1412
StreamChunkResult,
1513
} from '@tarko/agent-interface';
16-
import { actionParser, actionStringParser } from '@gui-agent/action-parser';
17-
import { getScreenInfo } from './shared';
1814
import {
1915
processT5StreamingChunk as omniProcessStreamingChunk,
2016
T5StreamProcessingState as OmniStreamProcessingState,
2117
createT5InitState as createInitState,
22-
SYSTEM_PROMPT_GROUP,
18+
createSystemPromptGroup,
19+
AgentMode,
2320
} from '@omni-tars/core';
2421
import { getLogger } from '@tarko/agent';
2522
import { GUIAgentT5Adapter } from './GUIAgentT5Adapter';
@@ -37,12 +34,19 @@ import { GUIAgentT5Adapter } from './GUIAgentT5Adapter';
3734
export class GUIAgentToolCallEngine extends ToolCallEngine {
3835
private logger = getLogger('GUIAgentToolCallEngine');
3936
private t5Adapter = new GUIAgentT5Adapter(this.logger);
37+
private agentMode: AgentMode;
38+
39+
/**
 * Create the engine for a given agent mode.
 *
 * The signature accepts arbitrary arguments, so the first one is validated at
 * runtime instead of being cast: only 'omni' and 'gui' are accepted, and
 * anything else (including a missing argument) falls back to 'gui'.
 *
 * @param args - Optional first element: the agent mode used to pick the system prompt.
 */
constructor(...args: unknown[]) {
  super();
  const [requestedMode] = args;
  // An unrecognized mode would otherwise yield an empty system prompt group.
  this.agentMode = requestedMode === 'omni' || requestedMode === 'gui' ? requestedMode : 'gui';
}
4044

4145
/**
42-
* Prepare system prompt with tool information and instructions
46+
* Prepare system prompt with tool information and instructions based on agent mode
4347
*/
4448
preparePrompt(instructions: string, tools: Tool[]) {
45-
return SYSTEM_PROMPT_GROUP;
49+
return createSystemPromptGroup(this.agentMode);
4650
}
4751

4852
/**

multimodal/omni-tars/gui-agent/src/GuiToolCallEngineProvider.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,24 @@
33
* Provides optimized tool call engine for GUI automation and computer use tasks
44
*/
55

6-
import { ToolCallEngineProvider, ToolCallEngineContext } from '@omni-tars/core';
6+
import { ToolCallEngineProvider, ToolCallEngineContext, AgentMode } from '@omni-tars/core';
77
import { GUIAgentToolCallEngine } from './GUIAgentToolCallEngine';
88

99
export class GuiToolCallEngineProvider extends ToolCallEngineProvider<GUIAgentToolCallEngine> {
1010
readonly name = 'gui-tool-call-engine';
1111
readonly priority = 90; // High priority for GUI tasks
1212
readonly description =
1313
'Tool call engine optimized for GUI automation, computer use, and visual interface interactions';
14+
15+
private agentMode: AgentMode;
16+
17+
/**
 * @param agentMode - Agent mode stored on the provider and handed to the engines it creates; defaults to 'gui'.
 */
constructor(agentMode: AgentMode = 'gui') {
18+
super();
19+
this.agentMode = agentMode;
20+
}
1421

1522
protected createEngine(): GUIAgentToolCallEngine {
16-
return new GUIAgentToolCallEngine();
23+
return new GUIAgentToolCallEngine(this.agentMode);
1724
}
1825

1926
canHandle(context: ToolCallEngineContext): boolean {

multimodal/omni-tars/omni-agent/src/index.ts

Lines changed: 5 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -2,32 +2,12 @@
22
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
5-
import {
6-
codePluginBuilder,
7-
CodeToolCallEngineProvider,
8-
CodeAgentExtraOption,
9-
} from '@omni-tars/code-agent';
10-
import {
11-
mcpPluginBuilder,
12-
McpToolCallEngineProvider,
13-
MCPTarsExtraOption,
14-
} from '@omni-tars/mcp-agent';
15-
import { GuiAgentPlugin, GuiToolCallEngineProvider, OperatorManager } from '@omni-tars/gui-agent';
16-
import { ComposableAgent, createComposableToolCallEngineFactory } from '@omni-tars/core';
17-
import { AgentOptions } from '@tarko/agent';
18-
import { AgentWebUIImplementation } from '@tarko/interface';
195

20-
const mcpToolCallEngine = new McpToolCallEngineProvider();
21-
22-
const toolCallEngine = createComposableToolCallEngineFactory({
23-
engines: [new GuiToolCallEngineProvider(), mcpToolCallEngine, new CodeToolCallEngineProvider()],
24-
defaultEngine: mcpToolCallEngine,
25-
});
6+
import { ComposableAgent } from '@omni-tars/core';
7+
import { AgentWebUIImplementation } from '@tarko/interface';
8+
import { getComposableOption, OmniTarsOption } from './options';
269

2710
const sandboxBaseUrl = process.env.AIO_SANDBOX_URL ?? '.';
28-
29-
type OmniTarsOption = AgentOptions & MCPTarsExtraOption & CodeAgentExtraOption;
30-
3111
export default class OmniTARSAgent extends ComposableAgent {
3212
static label = 'Omni Agent';
3313

@@ -76,33 +56,7 @@ export default class OmniTARSAgent extends ComposableAgent {
7656
},
7757
};
7858

79-
constructor(options: OmniTarsOption) {
80-
const {
81-
tavilyApiKey,
82-
googleApiKey,
83-
googleMcpUrl,
84-
sandboxUrl,
85-
ignoreSandboxCheck,
86-
linkReaderAK,
87-
linkReaderMcpUrl,
88-
...restOptions
89-
} = options;
90-
super({
91-
...restOptions,
92-
plugins: [
93-
mcpPluginBuilder({
94-
tavilyApiKey,
95-
googleApiKey,
96-
googleMcpUrl,
97-
linkReaderAK,
98-
linkReaderMcpUrl,
99-
}),
100-
codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
101-
new GuiAgentPlugin({ operatorManager: OperatorManager.createHybird(options.sandboxUrl) }),
102-
],
103-
toolCallEngine,
104-
maxTokens: 32768,
105-
enableStreamingToolCallEvents: true,
106-
});
59+
constructor(option: OmniTarsOption) {
60+
super(getComposableOption(option));
10761
}
10862
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
import { CodeAgentExtraOption, codePluginBuilder, CodeToolCallEngineProvider } from "@omni-tars/code-agent";
7+
import { AgentMode, ComposableAgentOptions, createComposableToolCallEngineFactory } from "@omni-tars/core";
8+
import { GuiAgentPlugin, GuiToolCallEngineProvider, OperatorManager } from "@omni-tars/gui-agent";
9+
import { mcpPluginBuilder, MCPTarsExtraOption, McpToolCallEngineProvider } from "@omni-tars/mcp-agent";
10+
import { AgentAppConfig } from "@tarko/interface";
11+
12+
13+
// Single MCP provider instance: listed among the omni engines and also used as their default engine.
const mcpToolCallEngine = new McpToolCallEngineProvider();

// Engine factory for 'omni' mode: GUI provider (created in 'omni' mode), MCP and code providers.
const omniToolCallEngine = createComposableToolCallEngineFactory({
  engines: [
    new GuiToolCallEngineProvider('omni'),
    mcpToolCallEngine,
    new CodeToolCallEngineProvider(),
  ],
  defaultEngine: mcpToolCallEngine,
});

// Engine factory for 'gui' mode: the GUI provider only, with no default engine configured.
const guiToolCallEngine = createComposableToolCallEngineFactory({
  engines: [new GuiToolCallEngineProvider('gui')],
});
21+
22+
23+
/**
 * Options accepted by the Omni TARS agent: the application config plus the
 * MCP and code agent extras and the agent mode selector.
 */
export type OmniTarsOption = AgentAppConfig &
  MCPTarsExtraOption &
  CodeAgentExtraOption & {
    /** Agent mode to build; optional because `getComposableOption` falls back to 'omni' when it is omitted. */
    agentMode?: AgentMode;
  };
26+
27+
28+
/**
 * Translate Omni TARS options into the options object handed to `ComposableAgent`.
 *
 * The MCP- and code-specific fields are consumed here to build plugins; every
 * other field is forwarded as-is, with `maxTokens` and
 * `enableStreamingToolCallEvents` always overridden.
 *
 * @param options - Agent options; `agentMode` defaults to 'omni'.
 * @returns Composable agent options whose tool call engine and plugins match the mode.
 */
export function getComposableOption(options: OmniTarsOption) {
  const {
    tavilyApiKey,
    googleApiKey,
    googleMcpUrl,
    sandboxUrl,
    ignoreSandboxCheck,
    linkReaderAK,
    linkReaderMcpUrl,
    agentMode = 'omni',
    ...restOptions
  } = options;

  // Both modes need the GUI plugin; build it on demand so plugin construction order is kept.
  const createGuiPlugin = () =>
    new GuiAgentPlugin({ operatorManager: OperatorManager.createHybird(sandboxUrl) });

  // Any mode other than 'gui' is treated as the default 'omni', so the result
  // always carries a tool call engine and plugins instead of silently omitting them.
  const guiOnly = agentMode === 'gui';

  const composed: Partial<ComposableAgentOptions> = {
    ...restOptions,
    maxTokens: 32768,
    enableStreamingToolCallEvents: true,
    toolCallEngine: guiOnly ? guiToolCallEngine : omniToolCallEngine,
    plugins: guiOnly
      ? [createGuiPlugin()]
      : [
          mcpPluginBuilder({
            tavilyApiKey,
            googleApiKey,
            googleMcpUrl,
            linkReaderAK,
            linkReaderMcpUrl,
          }),
          codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
          createGuiPlugin(),
        ],
  };

  // NOTE(review): the required fields of ComposableAgentOptions are not visible here, so the
  // existing cast is kept — confirm whether a direct return type annotation would compile.
  return composed as ComposableAgentOptions;
}

multimodal/tarko/agent-server-next/examples/bootstrap.ts

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,19 @@ const server = new AgentServer({
3434
type: process.env.NATIVE_THINKING === 'true' ? 'enabled' : 'disabled',
3535
},
3636
server: {
37+
runtimeSettings: {
38+
schema: {
39+
type: 'object',
40+
properties: {
41+
agentMode: {
42+
type: 'string',
43+
title: 'Agent Mode',
44+
enum: ['omni', 'gui'],
45+
default: 'omni',
46+
},
47+
},
48+
},
49+
},
3750
storage: {
3851
type: 'mongodb',
3952
uri: process.env.MONGO_URI,
@@ -80,7 +93,7 @@ const server = new AgentServer({
8093
webui: {
8194
type: 'remote',
8295
remoteUrl: process.env.WEBUI_REMOTE_URL,
83-
}
96+
},
8497
},
8598
});
8699

multimodal/tarko/agent-server-next/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,4 @@
4545
"typescript": "^5.5.3",
4646
"vitest": "3.2.4"
4747
}
48-
}
48+
}

0 commit comments

Comments
 (0)