Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions multimodal/gui-agent/agent-sdk/src/GUIAgent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,20 @@ import { GUIAgentToolCallEngine } from './ToolCallEngine';
import { SYSTEM_PROMPT } from './prompts';
import { Base64ImageParser } from '@agent-infra/media-utils';
import { Operator, BaseGUIAgent } from '@gui-agent/shared/base';
import { GUIAgentConfig, NormalizeCoordinates } from '@gui-agent/shared/types';
import {
GUIAgentConfig,
NormalizeCoordinates,
ImageDetailCalculator,
} from '@gui-agent/shared/types';
import {
assembleSystemPrompt,
isSystemPromptTemplate,
defaultNormalizeCoords,
normalizeActionCoords,
sleep,
} from '@gui-agent/shared/utils';
import { GUI_ADAPTED_TOOL_NAME } from './constants';
import { convertToAgentUIAction, createGUIErrorResponse } from './utils';
import { defaultNormalizeCoords, defaultDetailCalculator } from './defaultImpls';

const defaultLogger = new ConsoleLogger('[GUIAgent]', LogLevel.DEBUG);

Expand All @@ -31,6 +35,7 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {

private operator: Operator | undefined;
private normalizeCoordinates: NormalizeCoordinates;
private detailCalculator: ImageDetailCalculator;
private loopIntervalInMs: number;

constructor(config: GUIAgentConfig<T>) {
Expand All @@ -40,6 +45,7 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
systemPrompt,
customeActionParser,
normalizeCoordinates,
detailCalculator,
maxLoopCount,
loopIntervalInMs = 500,
} = config;
Expand Down Expand Up @@ -69,6 +75,8 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
});
this.operator = operator;
this.normalizeCoordinates = normalizeCoordinates ?? defaultNormalizeCoords;
// Default detail calculator implementation
this.detailCalculator = detailCalculator ?? defaultDetailCalculator;
this.loopIntervalInMs = loopIntervalInMs;
this.logger = this.logger.spawn('[GUIAgent]');
}
Expand Down Expand Up @@ -163,11 +171,17 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
return;
}

const { width: imageWidth, height: imageHeight } = base64Tool.getDimensions() || {
width: -1,
height: -1,
};

const content: ChatCompletionContentPart[] = [
{
type: 'image_url',
image_url: {
url: base64Uri,
detail: this.detailCalculator(imageWidth, imageHeight),
},
},
];
Expand Down
48 changes: 48 additions & 0 deletions multimodal/gui-agent/agent-sdk/src/defaultImpls.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import { Coordinates, ImageDetailCalculator, NormalizeCoordinates } from '@gui-agent/shared/types';

/**
 * Fallback coordinate normalizer used when no custom one is configured.
 * Scales the raw `x` / `y` values down by a factor of 1000.
 * @param rawCoords - Coordinates whose optional `raw` point should be scaled
 * @returns `{ normalized }`: the input itself when it has no `raw` point, otherwise a copy
 *          of the input whose `normalized` field holds the scaled point
 */
export const defaultNormalizeCoords: NormalizeCoordinates = (rawCoords: Coordinates) => {
  const { raw } = rawCoords;

  // Nothing to scale: hand the input back untouched.
  if (!raw) {
    return { normalized: rawCoords };
  }

  return {
    normalized: {
      ...rawCoords,
      normalized: { x: raw.x / 1000, y: raw.y / 1000 },
    },
  };
};

/**
 * Default implementation for choosing the image `detail` level from the pixel count.
 *
 * - up to 1,048,576 px (1024×1024): `'low'`
 * - up to 4,014,080 px (2048×1960): `'high'`
 * - anything larger: `'auto'`
 * - unknown or invalid dimensions (non-finite, zero or negative): `'auto'`
 *
 * @param width - Image width in pixels
 * @param height - Image height in pixels
 * @returns The detail level to request for an image of the given size
 */
export const defaultDetailCalculator: ImageDetailCalculator = (
  width: number,
  height: number,
): 'low' | 'high' | 'auto' => {
  const LOW_DETAIL_THRESHOLD = 1024 * 1024; // 1,048,576 px
  const HIGH_DETAIL_THRESHOLD = 2048 * 1960; // 4,014,080 px

  // Callers may pass a sentinel such as -1 when the image size could not be determined.
  // Multiplying two negatives would yield a small positive "pixel count" and wrongly
  // select 'low', so defer the decision instead of guessing.
  if (!Number.isFinite(width) || !Number.isFinite(height) || width <= 0 || height <= 0) {
    return 'auto';
  }

  const pixelCount = width * height;
  if (pixelCount <= LOW_DETAIL_THRESHOLD) {
    return 'low';
  }
  if (pixelCount <= HIGH_DETAIL_THRESHOLD) {
    return 'high';
  }
  // Larger than the high-detail threshold: defer the decision with 'auto'.
  return 'auto';
};
7 changes: 7 additions & 0 deletions multimodal/gui-agent/shared/src/types/agents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ export type ExecuteOutput = {
url?: string; // url of the page
} & Record<string, any>;

/**
 * Function type for calculating detail level based on image dimensions.
 *
 * Receives the image width and height (in that order) and returns one of
 * `'low'`, `'high'` or `'auto'`.
 * NOTE(review): dimensions are presumably in pixels — confirm against callers.
 */
export type ImageDetailCalculator = (width: number, height: number) => 'low' | 'high' | 'auto';

export interface ScreenshotOutput extends ExecuteOutput {
/** screenshot base64, `keep screenshot size as physical pixels` */
base64: string;
Expand Down Expand Up @@ -113,6 +118,8 @@ export interface GUIAgentConfig<TOperator> extends AgentOptions {
customeActionParser?: CustomActionParser;
/** The function to normalize raw coordinates */
normalizeCoordinates?: NormalizeCoordinates;
/** The function to calculate detail level based on image dimensions */
detailCalculator?: ImageDetailCalculator;
/** Maximum number of turns for Agent to execute, @default 1000 */
maxLoopCount?: number;
/** Time interval between two loop iterations (in milliseconds), @default 500 */
Expand Down
20 changes: 0 additions & 20 deletions multimodal/gui-agent/shared/src/utils/coordinateNormalizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,6 @@

import { BaseAction, Coordinates, NormalizeCoordinates } from '../types';

/**
 * Fallback coordinate normalizer used when no custom one is configured.
 * Scales the raw `x` / `y` values down by a factor of 1000.
 * @param rawCoords - Coordinates whose optional `raw` point should be scaled
 * @returns `{ normalized }`: the input itself when it has no `raw` point, otherwise a copy
 *          of the input whose `normalized` field holds the scaled point
 */
export const defaultNormalizeCoords: NormalizeCoordinates = (rawCoords: Coordinates) => {
  const { raw } = rawCoords;

  // Nothing to scale: hand the input back untouched.
  if (!raw) {
    return { normalized: rawCoords };
  }

  return {
    normalized: {
      ...rawCoords,
      normalized: { x: raw.x / 1000, y: raw.y / 1000 },
    },
  };
};

/**
* Normalizes coordinates in a BaseAction object
* Processes point, start, and end coordinate fields if they exist
Expand Down