Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
628 changes: 345 additions & 283 deletions multimodal/gui-agent/action-parser/src/ActionParserHelper.ts

Large diffs are not rendered by default.

52 changes: 20 additions & 32 deletions multimodal/gui-agent/action-parser/src/DefaultActionParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,29 +31,15 @@ export class DefaultActionParser extends BaseActionParser {
input = input.trim();

let reasoningContent = null;
let rawActionStrings = null;
let actions = null;
let rawActionStrings: string[] | undefined = undefined;
let actions: BaseAction[] | undefined = undefined;
try {
({ reasoningContent, rawActionStrings, actions } = this.extractActionStrings(input));
} catch (error) {
return {
errorMessage: (error as Error).message,
rawContent: originInput,
rawActionStrings: [],
actions: [],
};
}

if (!rawActionStrings || rawActionStrings.length <= 0) {
return {
errorMessage: 'There is no GUI action detected',
rawContent: originInput,
reasoningContent,
rawActionStrings: [],
actions: [],
};
return this.createErrorResponse((error as Error).message, originInput);
}

// If the actions have already been parsed, just return them
if (actions && actions.length > 0) {
return {
rawContent: originInput,
Expand All @@ -63,28 +49,22 @@ export class DefaultActionParser extends BaseActionParser {
};
}

if (!rawActionStrings || rawActionStrings.length <= 0) {
return this.createErrorResponse('There is no GUI action detected', originInput);
}

actions = [];
try {
for (const actionString of rawActionStrings) {
const action = this.helper.parseActionFromString(actionString);
const action = this.helper.parseActionCallString(actionString);
if (action) actions.push(action);
}
} catch (error) {
return {
errorMessage: (error as Error).message,
rawContent: originInput,
reasoningContent,
rawActionStrings,
actions: [],
};
return this.createErrorResponse((error as Error).message, originInput);
}

this.logger.debug(
'[parsePrediction] final result: reasoningContent:',
reasoningContent,
', actions lenth:',
actions.length,
);
this.logger.debug('[parsePrediction] reasoningContent:', reasoningContent);
this.logger.debug('[parsePrediction] actions lenth:', actions.length);

return {
rawContent: originInput,
Expand Down Expand Up @@ -118,4 +98,12 @@ export class DefaultActionParser extends BaseActionParser {
actions: actions || undefined,
};
}

/**
 * Builds a ParsedGUIResponse describing a failure.
 *
 * @param errorMessage - Message explaining what went wrong.
 * @param rawContent - Text reported back as the response's raw content.
 * @returns A response carrying the message, the raw content and an empty
 *   `actions` list.
 */
private createErrorResponse(errorMessage: string, rawContent: string): ParsedGUIResponse {
  // An error response never carries any actions.
  const failure: ParsedGUIResponse = { errorMessage, rawContent, actions: [] };
  return failure;
}
}
57 changes: 30 additions & 27 deletions multimodal/gui-agent/action-parser/src/FomatParsers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
* SPDX-License-Identifier: Apache-2.0
*/
import { ConsoleLogger } from '@agent-infra/logger';
import { ConsoleLogger, LogLevel } from '@agent-infra/logger';
import { BaseAction } from '@gui-agent/shared/types';
import { XMLBuilder, XMLParser } from 'fast-xml-parser';
// Remove circular dependency
import { ActionParserHelper } from './ActionParserHelper';
import { serializeAction } from '@gui-agent/shared/utils';

/**
 * Contract shared by the response-format parsers in this module.
 */
export interface FormatParser {
  /**
   * Tries to parse `text` in the format this parser understands.
   *
   * @param text - Text to parse.
   * @returns `null` when this parser declines the text; otherwise the
   *   extracted reasoning together with the raw action strings and/or the
   *   already-parsed actions. A field that was not produced is `undefined`
   *   (`reasoningContent` uses `null` instead).
   */
  parse(text: string): {
    reasoningContent: string | null;
    rawActionStrings: string[] | undefined;
    actions: BaseAction[] | undefined;
  } | null;
}

Expand All @@ -24,8 +25,8 @@ export class XMLFormatParser implements FormatParser {

parse(text: string): {
reasoningContent: string | null;
rawActionStrings: string[] | null;
actions: BaseAction[] | null;
rawActionStrings: string[] | undefined;
actions: BaseAction[] | undefined;
} | null {
if (text.includes('computer_env')) {
// The text is in omni format, not well-formed XML, so refuse to parse it
Expand Down Expand Up @@ -71,7 +72,7 @@ export class XMLFormatParser implements FormatParser {
}
if (key === 'seed:tool_call') {
canParse = true;
actions.push(...this.helper.standardizeGUIActions(value));
actions.push(...this.helper.standardizeGUIActionsFromXMLObject(value));
continue;
}
}
Expand All @@ -85,9 +86,7 @@ export class XMLFormatParser implements FormatParser {

const rawActionStrings: string[] = [];
for (const action of actions) {
rawActionStrings.push(
this.helper.convertRoughActionInputsToLegacyActionString(action.type, action.inputs),
);
rawActionStrings.push(serializeAction(action));
}

return {
Expand Down Expand Up @@ -123,8 +122,8 @@ export class OmniFormatParser implements FormatParser {

parse(text: string): {
reasoningContent: string | null;
rawActionStrings: string[] | null;
actions: BaseAction[] | null;
rawActionStrings: string[] | undefined;
actions: BaseAction[] | undefined;
} | null {
if (!this.canParse(text)) {
return null;
Expand All @@ -148,7 +147,7 @@ export class OmniFormatParser implements FormatParser {
const result = {
reasoningContent,
rawActionStrings: actionStr.split('\n\n').filter((action) => action.trim() !== ''),
actions: null,
actions: undefined,
};
return result;
}
Expand Down Expand Up @@ -187,8 +186,8 @@ export class UnifiedBCFormatParser implements FormatParser {

parse(text: string): {
reasoningContent: string | null;
rawActionStrings: string[] | null;
actions: BaseAction[] | null;
rawActionStrings: string[] | undefined;
actions: BaseAction[] | undefined;
} | null {
if (!this.canParse(text)) {
return null;
Expand Down Expand Up @@ -220,7 +219,7 @@ export class UnifiedBCFormatParser implements FormatParser {
return {
reasoningContent,
rawActionStrings: actionStr.split('\n\n').filter((action) => action.trim() !== ''),
actions: null,
actions: undefined,
};
}
}
Expand All @@ -245,8 +244,8 @@ class BCComplexFormatParser implements FormatParser {

parse(text: string): {
reasoningContent: string | null;
rawActionStrings: string[] | null;
actions: BaseAction[] | null;
rawActionStrings: string[] | undefined;
actions: BaseAction[] | undefined;
} | null {
if (!this.canParse(text)) {
return null;
Expand Down Expand Up @@ -282,7 +281,7 @@ class BCComplexFormatParser implements FormatParser {
reasoningContent:
reflection && thought ? `${reflection}, ${thought}` : (thought ?? reflection),
rawActionStrings: actionStr.split('\n\n').filter((action) => action.trim() !== ''),
actions: null,
actions: undefined,
};
}
}
Expand All @@ -306,8 +305,8 @@ class O1FormatParser implements FormatParser {

parse(text: string): {
reasoningContent: string | null;
rawActionStrings: string[] | null;
actions: BaseAction[] | null;
rawActionStrings: string[] | undefined;
actions: BaseAction[] | undefined;
} | null {
// this.logger.debug('[O1FormatParser] start...');
if (!this.canParse(text)) {
Expand All @@ -333,7 +332,7 @@ class O1FormatParser implements FormatParser {
return {
reasoningContent: thought,
rawActionStrings: actionContent.split('\n\n').filter((action) => action.trim() !== ''),
actions: null,
actions: undefined,
};
}
}
Expand All @@ -350,8 +349,8 @@ class FallbackFormatParser implements FormatParser {

parse(text: string): {
reasoningContent: string | null;
rawActionStrings: string[] | null;
actions: BaseAction[] | null;
rawActionStrings: string[] | undefined;
actions: BaseAction[] | undefined;
} | null {
this.logger.debug('[FallbackFormatParser] canParse: always true');

Expand Down Expand Up @@ -386,15 +385,19 @@ class FallbackFormatParser implements FormatParser {
return {
reasoningContent: thoughtStr,
rawActionStrings: actionStr.split('\n\n').filter((action) => action.trim() !== ''),
actions: null,
actions: undefined,
};
}
}

// Module-level fallback logger, constructed with LogLevel.DEBUG; used as the
// default value of the FormatParserChain constructor's `logger` parameter.
const defaultLogger = new ConsoleLogger(undefined, LogLevel.DEBUG);

export class FormatParserChain {
private logger: ConsoleLogger;
private parsers: FormatParser[];

constructor(private logger: ConsoleLogger) {
constructor(logger: ConsoleLogger = defaultLogger) {
this.logger = logger.spawn('[FormatParserChain]');
this.parsers = [
new XMLFormatParser(this.logger),
new OmniFormatParser(this.logger),
Expand All @@ -407,8 +410,8 @@ export class FormatParserChain {

parse(text: string): {
reasoningContent: string | null;
rawActionStrings: string[] | null;
actions: BaseAction[] | null;
rawActionStrings: string[] | undefined;
actions: BaseAction[] | undefined;
} {
this.logger.debug('[FormatParserChain] start...');

Expand Down
Loading
Loading