Skip to content

Commit eab67bf

Browse files
authored
feat(omni-agent): add functions prompt for game mode (#1668)
1 parent 3276766 commit eab67bf

File tree

3 files changed

+27
-8
lines changed

3 files changed

+27
-8
lines changed

multimodal/omni-tars/core/src/environments/prompt_t5.ts

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,22 @@ export const gui_functions = `
6868
{"type": "function", "name": "wait", "parameters": {"type": "object", "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}}, "required": []}, "description": "Wait for a while."}
6969
`;
7070

71+
/**
 * Function (tool) schemas advertised to the model in game mode.
 *
 * The value is a newline-delimited list: every non-empty line is one
 * self-contained JSON object describing a callable action (its name,
 * its parameter schema and a short description).
 *
 * Because this is a template literal, backslash escapes are processed by
 * JavaScript before the text reaches the model. The `type` entry therefore
 * writes `\\n` so that the prompt contains the two characters `\n` rather
 * than a raw line break, which would otherwise split that entry across two
 * lines and leave it as invalid JSON.
 */
export const game_functions = `
{"type": "function", "name": "click", "parameters": {"type": "object", "properties": {"point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}}, "required": ["point"]}, "description": "Mouse left single click action."}
{"type": "function", "name": "drag", "parameters": {"type": "object", "properties": {"start_point": {"type": "string", "description": "Drag start point. The format is: <point>x y</point>"}, "end_point": {"type": "string", "description": "Drag end point. The format is: <point>x y</point>"}}, "required": ["start_point", "end_point"]}, "description": "Mouse left button drag action."}
{"type": "function", "name": "hotkey", "parameters": {"type": "object", "properties": {"key": {"type": "string", "description": "Hotkeys you want to press. Split keys with a space and use lowercase."}}, "required": ["key"]}, "description": "Press hotkey."}
{"type": "function", "name": "left_double", "parameters": {"type": "object", "properties": {"point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}}, "required": ["point"]}, "description": "Mouse left double click action."}
{"type": "function", "name": "mouse_down", "parameters": {"type": "object", "properties": {"point": {"type": "string", "description": "Mouse down position. If not specified, default to execute on the current mouse position. The format is: <point>x y</point>"}, "button": {"type": "string", "description": "Down button. Default to left.", "enum": ["left", "right"]}}, "required": []}, "description": "Mouse down action."}
{"type": "function", "name": "mouse_up", "parameters": {"type": "object", "properties": {"point": {"type": "string", "description": "Mouse up position. If not specified, default to execute on the current mouse position. The format is: <point>x y</point>"}, "button": {"type": "string", "description": "Up button. Default to left.", "enum": ["left", "right"]}}, "required": []}, "description": "Mouse up action."}
{"type": "function", "name": "move_to", "parameters": {"type": "object", "properties": {"point": {"type": "string", "description": "Target coordinates. The format is: <point>x y</point>"}}, "required": ["point"]}, "description": "Mouse move action."}
{"type": "function", "name": "press", "parameters": {"type": "object", "properties": {"key": {"type": "string", "description": "Key you want to press. Only one key can be pressed at one time."}}, "required": ["key"]}, "description": "Press key."}
{"type": "function", "name": "release", "parameters": {"type": "object", "properties": {"key": {"type": "string", "description": "Key you want to release. Only one key can be released at one time."}}, "required": ["key"]}, "description": "Release key."}
{"type": "function", "name": "right_single", "parameters": {"type": "object", "properties": {"point": {"type": "string", "description": "Click coordinates. The format is: <point>x y</point>"}}, "required": ["point"]}, "description": "Mouse right single click action."}
{"type": "function", "name": "scroll", "parameters": {"type": "object", "properties": {"point": {"type": "string", "description": "Scroll start position. If not specified, default to execute on the current mouse position. The format is: <point>x y</point>"}, "direction": {"type": "string", "description": "Scroll direction.", "enum": ["up", "down", "left", "right"]}}, "required": ["direction"]}, "description": "Scroll action."}
{"type": "function", "name": "type", "parameters": {"type": "object", "properties": {"content": {"type": "string", "description": "Type content. If you want to submit your input, use \\n at the end of content."}}, "required": ["content"]}, "description": "Type content."}
{"type": "function", "name": "wait", "parameters": {"type": "object", "properties": {"time": {"type": "integer", "description": "Wait time in seconds."}}, "required": []}, "description": "Wait for a while."}
`;
86+
7187
const createPROMPT2 = (description: string) => {
7288
return `You are an agent designed to accomplish tasks.
7389
${description}
@@ -137,7 +153,7 @@ export const createSystemPromptGroup = (agentMode: AgentMode): string[] => {
137153
return [
138154
PROMPT1,
139155
createPROMPT2(game_task_description),
140-
createPROMPT3([gui_functions], gui_additional_notes),
156+
createPROMPT3([game_functions], gui_additional_notes),
141157
];
142158
}
143159

multimodal/omni-tars/gui-agent/src/GuiAgentPlugin.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ export class GuiAgentPlugin extends AgentPlugin {
169169
const eventStream = this.agent.getEventStream();
170170
const events = eventStream.getEvents();
171171
// Only emit if no user messages exist yet
172-
const hasUserMessage = events.some(event => event.type === 'user_message');
172+
const hasUserMessage = events.some((event) => event.type === 'user_message');
173173
if (!hasUserMessage) {
174174
const event = eventStream.createEvent('user_message', {
175175
content: `Goto: ${this.agentMode!.link}`,
@@ -182,10 +182,10 @@ export class GuiAgentPlugin extends AgentPlugin {
182182
const eventStream = this.agent.getEventStream();
183183
const events = eventStream.getEvents();
184184
// Only emit if no assistant messages exist yet
185-
const hasAssistantMessage = events.some(event => event.type === 'assistant_message');
185+
const hasAssistantMessage = events.some((event) => event.type === 'assistant_message');
186186
if (!hasAssistantMessage) {
187187
const event = eventStream.createEvent('assistant_message', {
188-
content: `Successfully navigated to ${this.agentMode!.link}, page loaded successfully`,
188+
content: `Successfully navigated to ${this.agentMode!.link}, page loaded completely.`,
189189
});
190190
eventStream.sendEvent(event);
191191
}

multimodal/omni-tars/gui-agent/src/GuiToolCallEngineProvider.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,7 @@ export class GuiToolCallEngineProvider extends ToolCallEngineProvider<GUIAgentTo
2525
canHandle(context: ToolCallEngineContext): boolean {
2626
// Check whether any of the tool calls are GUI / computer-use related
2727
if (context.toolCalls) {
28-
const guiToolNames = [
29-
'navigate',
30-
'navigate_back',
28+
const toolNames = [
3129
'call_user',
3230
'click',
3331
'drag',
@@ -45,8 +43,13 @@ export class GuiToolCallEngineProvider extends ToolCallEngineProvider<GUIAgentTo
4543
'wait',
4644
];
4745

46+
if (this.agentMode.id !== 'game') {
47+
toolNames.push('navigate');
48+
toolNames.push('navigate_back');
49+
}
50+
4851
const hasGuiTools = context?.toolCalls?.some((tool) =>
49-
guiToolNames.some((guiName) =>
52+
toolNames.some((guiName) =>
5053
tool.function.name.toLowerCase().includes(guiName.toLowerCase()),
5154
),
5255
);

0 commit comments

Comments
 (0)