feat(omni-agent): agentMode structure update; add game mode support (#1649)

cjraft · web-flow · commit f3ec7d471d7e · 2025-09-25T22:24:42.000+08:00
diff --git a/multimodal/omni-tars/core/src/environments/prompt_t5.ts b/multimodal/omni-tars/core/src/environments/prompt_t5.ts
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+import { getLogger } from '@tarko/agent';
 import { AgentMode } from '../types';
 import { getTimeString } from '../utils/hepler';
 import { HOME_INSTRUCTION, PROXY_INSTRUCTION } from './code';
@@ -11,6 +12,7 @@ export const think_token = process.env.THINK_TOKEN || 'thinkt';
 export const use_native_thinking = process.env.NATIVE_THINKING === 'true';
 export const bypass_native_thinking = process.env.NATIVE_THINKING === 'bypass';
 
+const logger = getLogger('prompt_t5');
 const think_budget = '\n';
 
 const task_description = `\nCurrent time is: ${getTimeString()}\n
@@ -19,6 +21,17 @@ As a professional personal assistant (Doubao) capable of solving various user pr
 const gui_task_description = `\nCurrent time is: ${getTimeString()}\n
 You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task. \n`;
 
+const game_task_description = `You should begin by detailing the internal reasoning process, and then present the answer to the user. The reasoning process should be enclosed within <${think_token}> </${think_token}> tags, as follows:
+<${think_token}> reasoning process here </${think_token}> answer here. 
+You have different modes of thinking:
+Unrestricted think mode: Engage in an internal thinking process with thorough reasoning and reflections. You have an unlimited budget for thinking tokens and can continue thinking until you fully solve the problem.
+Efficient think mode: Provide a concise internal thinking process with efficient reasoning and reflections. You don't have a strict token budget but be less verbose and more direct in your thinking. 
+No think mode: Respond directly to the question without any internal reasoning process or extra thinking tokens. Still follow the template with the minimum required thinking tokens to justify the answer. 
+Budgeted think mode: Limit your internal reasoning and reflections to stay within the specified token budget
+Based on the complexity of the problem, select the appropriate mode for reasoning among the provided options listed below.
+Provided Mode(s):
+Unrestricted think.
+You are provided with a task description, a history of previous actions, and corresponding screenshots. Your goal is to perform the next action to complete the task. Please note that if performing the same action multiple times results in a static screen with no changes, you should attempt a modified or alternative action.`;
 
 //Mixed scenarios use this additional_notes
 const omni_additional_notes = `- Use english in your reasoning process. \n
@@ -55,13 +68,11 @@ export const gui_functions = `
 {"type": "function", "name": "wait", "parameters": {"type": "object", "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}}, "required": []}, "description": "Wait for a while."}
 `;
 
-
-
 const createPROMPT2 = (description: string) => {
-    return `You are an agent designed to accomplish tasks.
+  return `You are an agent designed to accomplish tasks.
 ${description}
 <seed:cot_budget_reflect>${think_budget}</seed:cot_budget_reflect>`;
-}
+};
 
 /** 3.1 Think Prompt */
 const PROMPT1 = use_native_thinking
@@ -70,7 +81,6 @@ const PROMPT1 = use_native_thinking
 
 /** 3.2 Role/Task Prompt */
 
-
 /** 3.3 Action/Function Definition Prompt (如果没有functions则不需要这段prompt) */
 const createPROMPT3 = (functions: string[], additionalNotes: string) => `## Function Definition
 
@@ -99,29 +109,37 @@ multiple lines
 ${additionalNotes}
 `;
 
-
 // Default SYSTEM_PROMPT_GROUP for backwards compatibility (omni mode)
 export const SYSTEM_PROMPT_GROUP = [
-  PROMPT1, 
+  PROMPT1,
   createPROMPT2(task_description),
-  createPROMPT3([mcp_functions, code_functions, gui_functions], omni_additional_notes)
+  createPROMPT3([mcp_functions, code_functions, gui_functions], omni_additional_notes),
 ];
 
 /**
  * Create system prompt group based on agent mode
  * @param agentMode - The agent mode ('omni' or 'gui')
  * @returns Array of prompt strings
  */
-export const createSystemPromptGroup = (agentMode: AgentMode = 'omni'): string[] => {
-  if (agentMode === 'omni') {
-    return SYSTEM_PROMPT_GROUP;
-  }else if(agentMode === 'gui') {
-    return [
-      PROMPT1, 
-      createPROMPT2(gui_task_description),
-      createPROMPT3([gui_functions], gui_additional_notes)
-    ]
+export const createSystemPromptGroup = (agentMode: AgentMode): string[] => {
+  logger.info('agentMode: ', agentMode);
+
+  switch (agentMode.id) {
+    case 'omni':
+      return SYSTEM_PROMPT_GROUP;
+    case 'gui':
+      return [
+        PROMPT1,
+        createPROMPT2(gui_task_description),
+        createPROMPT3([gui_functions], gui_additional_notes),
+      ];
+    case 'game':
+      return [
+        PROMPT1,
+        createPROMPT2(game_task_description),
+        createPROMPT3([gui_functions], gui_additional_notes),
+      ];
   }
 
-  return []
+  return [];
 };
diff --git a/multimodal/omni-tars/core/src/types.ts b/multimodal/omni-tars/core/src/types.ts
@@ -62,5 +62,8 @@ export interface ToolCallEngineCompositionConfig {
   defaultEngine?: ToolCallEngineProvider;
 }
 
-
-export type AgentMode = 'omni' | 'gui';
+export type AgentMode = {
+  id: 'omni' | 'gui' | 'game';
+  link?: string;
+  browserMode?: 'dom' | 'visual-grounding' | 'hybrid';
+};
diff --git a/multimodal/omni-tars/gui-agent/src/GUIAgentToolCallEngine.ts b/multimodal/omni-tars/gui-agent/src/GUIAgentToolCallEngine.ts
@@ -38,8 +38,8 @@ export class GUIAgentToolCallEngine extends ToolCallEngine {
 
   constructor(...args: unknown[]) {
     super();
-    const agentMode = args[0] as AgentMode | undefined;
-    this.agentMode = agentMode || 'gui';
+    const agentMode = args[0] as AgentMode;
+    this.agentMode = agentMode;
   }
 
   /**
diff --git a/multimodal/omni-tars/gui-agent/src/GuiToolCallEngineProvider.ts b/multimodal/omni-tars/gui-agent/src/GuiToolCallEngineProvider.ts
@@ -11,10 +11,9 @@ export class GuiToolCallEngineProvider extends ToolCallEngineProvider<GUIAgentTo
   readonly priority = 90; // High priority for GUI tasks
   readonly description =
     'Tool call engine optimized for GUI automation, computer use, and visual interface interactions';
-  
   private agentMode: AgentMode;
 
-  constructor(agentMode: AgentMode = 'gui') {
+  constructor(agentMode: AgentMode) {
     super();
     this.agentMode = agentMode;
   }
diff --git a/multimodal/omni-tars/omni-agent/src/options.ts b/multimodal/omni-tars/omni-agent/src/options.ts
@@ -21,26 +21,10 @@ import {
 } from '@omni-tars/mcp-agent';
 import { AgentAppConfig } from '@tarko/interface';
 
-const mcpToolCallEngine = new McpToolCallEngineProvider();
-
-const omniToolCallEngine = createComposableToolCallEngineFactory({
-  engines: [
-    new GuiToolCallEngineProvider('omni'),
-    mcpToolCallEngine,
-    new CodeToolCallEngineProvider(),
-  ],
-  defaultEngine: mcpToolCallEngine,
-});
-
-const guiToolCallEngine = createComposableToolCallEngineFactory({
-  engines: [new GuiToolCallEngineProvider('gui')],
-});
-
 export type OmniTarsOption = AgentAppConfig &
   MCPTarsExtraOption &
   CodeAgentExtraOption & {
     agentMode: AgentMode;
-    browserMode: 'dom' | 'visual-grounding' | 'hybrid';
   };
 
 export function getComposableOption(options: OmniTarsOption) {
@@ -52,8 +36,7 @@ export function getComposableOption(options: OmniTarsOption) {
     ignoreSandboxCheck,
     linkReaderAK,
     linkReaderMcpUrl,
-    agentMode = 'omni',
-    browserMode = 'hybrid',
+    agentMode = { id: 'omni' },
     ...restOptions
   } = options;
 
@@ -67,22 +50,35 @@ export function getComposableOption(options: OmniTarsOption) {
     operatorManager: OperatorManager.createHybird(options.sandboxUrl),
   });
 
-  if (agentMode === 'gui') {
-    baseOptions.toolCallEngine = guiToolCallEngine;
-    baseOptions.plugins = [guiPlugin];
-  } else if (agentMode === 'omni') {
-    baseOptions.toolCallEngine = omniToolCallEngine;
-    baseOptions.plugins = [
-      mcpPluginBuilder({
-        tavilyApiKey,
-        googleApiKey,
-        googleMcpUrl,
-        linkReaderAK,
-        linkReaderMcpUrl,
-      }),
-      codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
-      guiPlugin,
-    ];
+  switch (agentMode.id) {
+    case 'game':
+    case 'gui':
+      baseOptions.toolCallEngine = createComposableToolCallEngineFactory({
+        engines: [new GuiToolCallEngineProvider(agentMode)],
+      });
+      baseOptions.plugins = [guiPlugin];
+      break;
+    case 'omni':
+    default:
+      baseOptions.toolCallEngine = createComposableToolCallEngineFactory({
+        engines: [
+          new GuiToolCallEngineProvider(agentMode),
+          new McpToolCallEngineProvider(),
+          new CodeToolCallEngineProvider(),
+        ],
+      });
+      baseOptions.plugins = [
+        mcpPluginBuilder({
+          tavilyApiKey,
+          googleApiKey,
+          googleMcpUrl,
+          linkReaderAK,
+          linkReaderMcpUrl,
+        }),
+        codePluginBuilder({ sandboxUrl, ignoreSandboxCheck }),
+        guiPlugin,
+      ];
+      break;
   }
 
   return baseOptions as ComposableAgentOptions;
diff --git a/multimodal/tarko/agent-server-next/examples/bootstrap.ts b/multimodal/tarko/agent-server-next/examples/bootstrap.ts
@@ -41,12 +41,11 @@ const server = new AgentServer({
             agentMode: {
               type: 'string',
               title: 'Agent Mode',
-              enum: ['omni', 'gui'],
-              enumLabels: ['Omni', 'GUI'],
+              enum: ['omni', 'gui', 'game'],
+              enumLabels: ['Omni', 'GUI', 'Game'],
               default: 'omni',
               placement: 'chat-bottom',
             },
-
             browserMode: {
               type: 'string',
               title: 'Browser Control',
@@ -61,6 +60,15 @@ const server = new AgentServer({
             },
           },
         },
+        transform: (runtimeSettings: Record<string, unknown>) => {
+          return {
+            agentMode: {
+              id: runtimeSettings.agentMode,
+              browserMode: runtimeSettings.browserMode,
+              link: 'http://example.com',
+            },
+          };
+        },
       },
       storage: {
         type: 'mongodb',

Original file line number	Diff line number	Diff line change
`@@ -38,8 +38,8 @@ export class GUIAgentToolCallEngine extends ToolCallEngine {`
`38`	`38`
`39`	`39`	`constructor(...args: unknown[]) {`
`40`	`40`	`super();`
`41`		`- const agentMode = args[0] as AgentMode \| undefined;`
`42`		`- this.agentMode = agentMode \|\| 'gui';`
	`41`	`+ const agentMode = args[0] as AgentMode;`
	`42`	`+ this.agentMode = agentMode;`
`43`	`43`	`}`
`44`	`44`
`45`	`45`	`/**`