diff --git a/rdagent/components/coder/data_science/ensemble/eval.py b/rdagent/components/coder/data_science/ensemble/eval.py
index ad207c454..d424e0d13 100644
--- a/rdagent/components/coder/data_science/ensemble/eval.py
+++ b/rdagent/components/coder/data_science/ensemble/eval.py
@@ -67,7 +67,7 @@ def evaluate(
         implementation.inject_files(**{fname: test_code})
         result = implementation.run(env=env, entry=f"python {fname}")
-        stdout = result.stdout
+        stdout = result.get_truncated_stdout()
         ret_code = result.exit_code
         stdout += f"\nNOTE: the above scripts run with return code {ret_code}"
diff --git a/rdagent/components/coder/data_science/feature/eval.py b/rdagent/components/coder/data_science/feature/eval.py
index af1844fd4..8f51c62e4 100644
--- a/rdagent/components/coder/data_science/feature/eval.py
+++ b/rdagent/components/coder/data_science/feature/eval.py
@@ -69,7 +69,7 @@ def evaluate(
             workflow_code=implementation.all_codes,
         )
         user_prompt = T(".prompts:feature_eval.user").r(
-            stdout=shrink_text(result.stdout),
+            stdout=result.get_truncated_stdout(),
             workflow_stdout=workflow_stdout,
         )
diff --git a/rdagent/components/coder/data_science/model/eval.py b/rdagent/components/coder/data_science/model/eval.py
index fe485d286..56ec86c2b 100644
--- a/rdagent/components/coder/data_science/model/eval.py
+++ b/rdagent/components/coder/data_science/model/eval.py
@@ -71,7 +71,7 @@ def evaluate(
         )  # only check the model changed this time
         implementation.inject_files(**{fname: test_code})
         result = implementation.run(env=env, entry=f"python {fname}")
-        stdout = result.stdout
+        stdout = result.get_truncated_stdout()
         ret_code = result.exit_code
         if stdout is None:
@@ -118,6 +118,6 @@ def evaluate(
             user_prompt=user_prompt,
             init_kwargs_update_func=ModelSingleFeedback.val_and_update_init_dict,
         )
-        fb.final_decision = fb.final_decision and result.exit_code == 0
+        fb.final_decision = fb.final_decision and ret_code == 0
         return fb
diff --git a/rdagent/components/coder/data_science/pipeline/eval.py b/rdagent/components/coder/data_science/pipeline/eval.py
index fd3ca2a91..148bcbd31 100644
--- a/rdagent/components/coder/data_science/pipeline/eval.py
+++ b/rdagent/components/coder/data_science/pipeline/eval.py
@@ -70,6 +70,7 @@ def evaluate(
         result = implementation.run(
             env=env, entry=f"strace -e trace=file -f -o trace.log python -m coverage run main.py"
         )
+        result_stdout = result.get_truncated_stdout()
         nb_conversion_ret_code = 0
         nb_conversion_check_text = ""
@@ -84,7 +85,7 @@ def evaluate(
             notebook_converter.convert(
                 task=target_task,
                 code=code,
-                stdout=result.stdout,
+                stdout=result_stdout,
                 outfile=implementation.workspace_path / "main.ipynb",
                 use_debug_flag=DS_RD_SETTING.sample_data_by_LLM,
             )
@@ -103,16 +104,16 @@ def evaluate(
                 stdout += f"Code opened the sample submission file '{sample_submission_file_name}' during execution.\n Reject the implementation!\n"
                 sample_submission_check = False
-        result.stdout = remove_eda_part(result.stdout)
+        result_stdout = remove_eda_part(result_stdout)
         if result.exit_code != 0:
-            stdout += f"Code failed to run. Please check the stdout:\n Following the stdout of the debug mode run:\n{result.stdout.strip()}\n"
+            stdout += f"Code failed to run. Please check the stdout:\n Following the stdout of the debug mode run:\n{result_stdout.strip()}\n"
         else:
-            stdout += f"Code ran successfully.\n Following the stdout of the debug mode run:\n{result.stdout.strip()}\n"
+            stdout += f"Code ran successfully.\n Following the stdout of the debug mode run:\n{result_stdout.strip()}\n"
         if DS_RD_SETTING.sample_data_by_LLM:
             debug_time, full_estimated_time = None, None
-            if match := re.search(r"debug_time:\s*(\d+(?:.\d+)?)", result.stdout, re.DOTALL):
+            if match := re.search(r"debug_time:\s*(\d+(?:.\d+)?)", result_stdout, re.DOTALL):
                 debug_time = float(match.group(1))
-            if match := re.search(r"estimated_time:\s*(\d+(?:.\d+)?)", result.stdout, re.DOTALL):
+            if match := re.search(r"estimated_time:\s*(\d+(?:.\d+)?)", result_stdout, re.DOTALL):
                 full_estimated_time = float(match.group(1))
             if debug_time is not None and full_estimated_time is not None:
                 stdout += f"Debug mode ran in {debug_time:.2f} seconds, estimated full run time is {full_estimated_time:.2f} seconds. The estimated time is {full_estimated_time / env.conf.running_timeout_period * 100:.2f}% the debug time."
@@ -167,7 +168,7 @@ def evaluate(
         implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
         # stdout += "----Submission Check 1-----\n"
         submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
-        submission_check_out = submission_result.stdout
+        submission_check_out = submission_result.get_truncated_stdout()
         submission_ret_code = submission_result.exit_code
         stdout += "\n" + submission_check_out
diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval.py b/rdagent/components/coder/data_science/raw_data_loader/eval.py
index e21e2fae0..2289f56f7 100644
--- a/rdagent/components/coder/data_science/raw_data_loader/eval.py
+++ b/rdagent/components/coder/data_science/raw_data_loader/eval.py
@@ -56,7 +56,7 @@ def evaluate(
         test_code = (DIRNAME / "eval_tests" / "data_loader_test.txt").read_text()
         implementation.inject_files(**{fname: test_code})
         result = implementation.run(env=env, entry=f"python {fname}")
-        stdout = result.stdout
+        stdout = result.get_truncated_stdout()
         ret_code = result.exit_code
         match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===(.*)", stdout, re.DOTALL)
         stdout_part_1, eda_output, stdout_part_2 = match.groups() if match else (stdout, None, "")
diff --git a/rdagent/components/coder/data_science/workflow/eval.py b/rdagent/components/coder/data_science/workflow/eval.py
index 49fbc97c7..d8d489fea 100644
--- a/rdagent/components/coder/data_science/workflow/eval.py
+++ b/rdagent/components/coder/data_science/workflow/eval.py
@@ -125,7 +125,7 @@ def evaluate(
         implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
         # stdout += "----Submission Check 1-----\n"
         submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
-        submission_check_out = submission_result.stdout
+        submission_check_out = submission_result.get_truncated_stdout()
         submission_ret_code = submission_result.exit_code
         stdout += "\n" + submission_check_out
diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
index 12b2abf93..e2f48aa98 100644
--- a/rdagent/core/experiment.py
+++ b/rdagent/core/experiment.py
@@ -17,12 +17,10 @@
 from rdagent.core.conf import RD_AGENT_SETTINGS
 from rdagent.core.evaluation import Feedback
-from rdagent.utils import filter_redundant_text
 if TYPE_CHECKING:
     from rdagent.utils.env import EnvResult
-from rdagent.utils.fmt import shrink_text
 if typing.TYPE_CHECKING:
     from rdagent.core.proposal import Hypothesis
@@ -280,7 +278,7 @@ def execute(self, env: Env, entry: str) -> str:
         Before each execution, make sure to prepare and inject code.
         """
         result = self.run(env, entry)
-        return result.stdout
+        return result.get_truncated_stdout()  # NOTE: truncating just for aligning with the old code.
     def run(self, env: Env, entry: str) -> EnvResult:
         """
@@ -290,14 +288,7 @@ def run(self, env: Env, entry: str) -> EnvResult:
         """
         self.prepare()
         self.inject_files(**self.file_dict)
-        result = env.run(entry, str(self.workspace_path), env={"PYTHONPATH": "./"})
-        # result is EnvResult
-        result.stdout = shrink_text(
-            filter_redundant_text(result.stdout),
-            context_lines=RD_AGENT_SETTINGS.stdout_context_len,
-            line_len=RD_AGENT_SETTINGS.stdout_line_len,
-        )
-        return result
+        return env.run(entry, str(self.workspace_path), env={"PYTHONPATH": "./"})
     def create_ws_ckp(self) -> None:
         """
diff --git a/rdagent/scenarios/data_science/dev/runner/__init__.py b/rdagent/scenarios/data_science/dev/runner/__init__.py
index 073572a2d..55db014aa 100644
--- a/rdagent/scenarios/data_science/dev/runner/__init__.py
+++ b/rdagent/scenarios/data_science/dev/runner/__init__.py
@@ -1,3 +1,5 @@
+from typing import Literal
+
 import pandas as pd
 from rdagent.app.data_science.conf import DS_RD_SETTING
@@ -19,7 +21,6 @@
 from rdagent.core.scenario import Scenario
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import APIBackend, md5_hash
-from rdagent.scenarios.data_science.dev.runner.eval import DSRunnerEvaluator
 from rdagent.utils.agent.ret import PythonBatchEditOut, PythonBatchPatchOut
 from rdagent.utils.agent.tpl import T
 from rdagent.utils.workflow import wait_retry
@@ -34,6 +35,7 @@ class Config:
     max_seconds_multiplier: int = 1
     env_type: str = "docker"
     diff_mode: bool = False
+    dump_stdout_type: Literal["full", "truncated"] = "truncated"
     # TODO: extract a function for env and conf.
@@ -143,6 +145,10 @@ def __init__(
         **kwargs,
     ) -> None:
+        from rdagent.scenarios.data_science.dev.runner.eval import (
+            DSRunnerEvaluator,  # avoid circular import
+        )
+
         eval_l = [DSRunnerEvaluator(scen=scen)]
         if DS_RD_SETTING.enable_model_dump:
             eval_l.append(ModelDumpEvaluator(scen=scen, data_type="full"))
diff --git a/rdagent/scenarios/data_science/dev/runner/eval.py b/rdagent/scenarios/data_science/dev/runner/eval.py
index b98c4d7b5..c46797414 100644
--- a/rdagent/scenarios/data_science/dev/runner/eval.py
+++ b/rdagent/scenarios/data_science/dev/runner/eval.py
@@ -17,6 +17,7 @@
 from rdagent.core.experiment import FBWorkspace, Task
 from rdagent.log import rdagent_logger as logger
 from rdagent.log.timer import RD_Agent_TIMER_wrapper
+from rdagent.scenarios.data_science.dev.runner import DSRunnerCoSTEERSettings
 from rdagent.scenarios.data_science.test_eval import (
     MLETestEval,
     NoTestEvalError,
@@ -99,7 +100,7 @@ def evaluate(
         # execute workflow
         result = implementation.run(env=env, entry="python -m coverage run main.py")
-        stdout = result.stdout
+        stdout = result.get_truncated_stdout()
         execute_ret_code = result.exit_code
         implementation.running_info.running_time = result.running_time
@@ -107,7 +108,12 @@ def evaluate(
         eda_output = match.groups()[1] if match else None
         if eda_output is None:
             eda_output = "No EDA output."
-        implementation.inject_files(**{"EDA.md": eda_output})
+        implementation.inject_files(
+            **{
+                "EDA.md": eda_output,
+                "stdout.txt": result.stdout if DSRunnerCoSTEERSettings().dump_stdout_type == "full" else stdout,
+            }
+        )  # stdout.txt is used for debugging. not used in any other place.
         stdout = remove_eda_part(stdout)
         stdout += f"The code executed {'successfully' if execute_ret_code == 0 else 'failed'}. {'The EDA output is removed from the stdout. ' if eda_output else ''}"
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/select/submit.py b/rdagent/scenarios/data_science/proposal/exp_gen/select/submit.py
index 8187d6fde..aead29089 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/select/submit.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/select/submit.py
@@ -448,7 +448,7 @@ def _generate_and_run_script(
     result = ws.run(
         env=env, entry=f"python {script_type}.py --cache-buster={time.time()}"
     )  # Do not cache the result
-    stdout = re.sub(r"^chmod:.*\n?", "", result.stdout, flags=re.MULTILINE)
+    stdout = re.sub(r"^chmod:.*\n?", "", result.get_truncated_stdout(), flags=re.MULTILINE)
     if result.exit_code == 0:
         logger.info(f"Successfully generated and ran {script_type}.py.")
@@ -458,7 +458,7 @@ def _generate_and_run_script(
         running_timeout_period=DS_RD_SETTING.full_timeout,
     )
     result = ws.run(env=env, entry=f"python main.py --cache-buster={time.time()}")
-    stdout = re.sub(r"^chmod:.*\n?", "", result.stdout, flags=re.MULTILINE)
+    stdout = re.sub(r"^chmod:.*\n?", "", result.get_truncated_stdout(), flags=re.MULTILINE)
     if result.exit_code == 0:
         # move submission.csv to mock_folder
         if Path(ws.workspace_path / "submission.csv").exists():
@@ -530,7 +530,7 @@ def process_experiment(
     env.conf.running_timeout_period = DS_RD_SETTING.debug_timeout
     result = ws.run(env=env, entry="python grade.py")
    if result.exit_code == 0:
-        grade_stdout = re.sub(r"^chmod:.*\n?", "", result.stdout, flags=re.MULTILINE)
+        grade_stdout = re.sub(r"^chmod:.*\n?", "", result.get_truncated_stdout(), flags=re.MULTILINE)
         logger.info(f"Ran grade.py for {competition}/{loop_id}; exit_code: {result.exit_code}")
     else:
         logger.warning(f"Skipping grading for {competition}/{loop_id} due to main.py execution failure.")
diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py
index bc1512d17..b66f1cc0c 100644
--- a/rdagent/utils/env.py
+++ b/rdagent/utils/env.py
@@ -41,7 +41,9 @@
 from rdagent.core.experiment import RD_AGENT_SETTINGS
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import md5_hash
+from rdagent.utils import filter_redundant_text
 from rdagent.utils.agent.tpl import T
+from rdagent.utils.fmt import shrink_text
 from rdagent.utils.workflow import wait_retry
@@ -145,6 +147,13 @@ class EnvResult:
     exit_code: int
     running_time: float
+    def get_truncated_stdout(self) -> str:
+        return shrink_text(
+            filter_redundant_text(self.stdout),
+            context_lines=RD_AGENT_SETTINGS.stdout_context_len,
+            line_len=RD_AGENT_SETTINGS.stdout_line_len,
+        )
+
 class Env(Generic[ASpecificEnvConf]):
     """
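Note (not part of the patch): a minimal usage sketch of the EnvResult.get_truncated_stdout() helper added in the last hunk, assuming the dataclass fields shown there (stdout, exit_code, running_time). The summarize_run function below is a hypothetical caller for illustration, not code from this change.

from rdagent.utils.env import EnvResult


def summarize_run(result: EnvResult) -> str:
    # Truncated/filtered view of stdout -- what callers now embed in prompts and feedback text.
    shown = result.get_truncated_stdout()
    # The raw log stays available as result.stdout, e.g. when dump_stdout_type == "full"
    # and the runner writes the complete stdout.txt for debugging.
    return f"exit_code={result.exit_code}, running_time={result.running_time:.1f}s\n{shown}"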