2 changes: 1 addition & 1 deletion rdagent/components/coder/data_science/ensemble/eval.py
@@ -67,7 +67,7 @@ def evaluate(

implementation.inject_files(**{fname: test_code})
result = implementation.run(env=env, entry=f"python {fname}")
- stdout = result.stdout
+ stdout = result.get_truncated_stdout()
ret_code = result.exit_code

stdout += f"\nNOTE: the above scripts run with return code {ret_code}"
2 changes: 1 addition & 1 deletion rdagent/components/coder/data_science/feature/eval.py
@@ -69,7 +69,7 @@ def evaluate(
workflow_code=implementation.all_codes,
)
user_prompt = T(".prompts:feature_eval.user").r(
- stdout=shrink_text(result.stdout),
+ stdout=result.get_truncated_stdout(),
workflow_stdout=workflow_stdout,
)

4 changes: 2 additions & 2 deletions rdagent/components/coder/data_science/model/eval.py
@@ -71,7 +71,7 @@ def evaluate(
) # only check the model changed this time
implementation.inject_files(**{fname: test_code})
result = implementation.run(env=env, entry=f"python {fname}")
- stdout = result.stdout
+ stdout = result.get_truncated_stdout()
ret_code = result.exit_code

if stdout is None:
@@ -118,6 +118,6 @@ def evaluate(
user_prompt=user_prompt,
init_kwargs_update_func=ModelSingleFeedback.val_and_update_init_dict,
)
- fb.final_decision = fb.final_decision and result.exit_code == 0
+ fb.final_decision = fb.final_decision and ret_code == 0

return fb
15 changes: 8 additions & 7 deletions rdagent/components/coder/data_science/pipeline/eval.py
@@ -70,6 +70,7 @@ def evaluate(
result = implementation.run(
env=env, entry=f"strace -e trace=file -f -o trace.log python -m coverage run main.py"
)
+ result_stdout = result.get_truncated_stdout()

nb_conversion_ret_code = 0
nb_conversion_check_text = ""
@@ -84,7 +85,7 @@
notebook_converter.convert(
task=target_task,
code=code,
- stdout=result.stdout,
+ stdout=result_stdout,
outfile=implementation.workspace_path / "main.ipynb",
use_debug_flag=DS_RD_SETTING.sample_data_by_LLM,
)
@@ -103,16 +104,16 @@
stdout += f"Code opened the sample submission file '{sample_submission_file_name}' during execution.\n Reject the implementation!\n"
sample_submission_check = False

- result.stdout = remove_eda_part(result.stdout)
+ result_stdout = remove_eda_part(result_stdout)
if result.exit_code != 0:
stdout += f"Code failed to run. Please check the stdout:\n Following the stdout of the debug mode run:\n{result.stdout.strip()}\n"
stdout += f"Code failed to run. Please check the stdout:\n Following the stdout of the debug mode run:\n{result_stdout.strip()}\n"
else:
stdout += f"Code ran successfully.\n Following the stdout of the debug mode run:\n{result.stdout.strip()}\n"
stdout += f"Code ran successfully.\n Following the stdout of the debug mode run:\n{result_stdout.strip()}\n"
if DS_RD_SETTING.sample_data_by_LLM:
debug_time, full_estimated_time = None, None
if match := re.search(r"debug_time:\s*(\d+(?:.\d+)?)", result.stdout, re.DOTALL):
if match := re.search(r"debug_time:\s*(\d+(?:.\d+)?)", result_stdout, re.DOTALL):
debug_time = float(match.group(1))
if match := re.search(r"estimated_time:\s*(\d+(?:.\d+)?)", result.stdout, re.DOTALL):
if match := re.search(r"estimated_time:\s*(\d+(?:.\d+)?)", result_stdout, re.DOTALL):
full_estimated_time = float(match.group(1))
if debug_time is not None and full_estimated_time is not None:
stdout += f"Debug mode ran in {debug_time:.2f} seconds, estimated full run time is {full_estimated_time:.2f} seconds. The estimated time is {full_estimated_time / env.conf.running_timeout_period * 100:.2f}% the debug time."
@@ -167,7 +168,7 @@ def evaluate(
implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
# stdout += "----Submission Check 1-----\n"
submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
- submission_check_out = submission_result.stdout
+ submission_check_out = submission_result.get_truncated_stdout()
submission_ret_code = submission_result.exit_code
stdout += "\n" + submission_check_out

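To make the timing hunk above concrete, here is a minimal, self-contained sketch of the extraction it performs on the truncated stdout. The sample text is invented for illustration; the regexes are copied verbatim from the hunk (note the unescaped '.' in the fractional part, which matches any character, not just a decimal point):

    import re

    # Invented sample of a debug-mode run's output; in the diff this text
    # comes from result.get_truncated_stdout().
    sample = "training done\ndebug_time: 12.5\nestimated_time: 640.0\n"

    debug_time, full_estimated_time = None, None
    if match := re.search(r"debug_time:\s*(\d+(?:.\d+)?)", sample, re.DOTALL):
        debug_time = float(match.group(1))
    if match := re.search(r"estimated_time:\s*(\d+(?:.\d+)?)", sample, re.DOTALL):
        full_estimated_time = float(match.group(1))

    if debug_time is not None and full_estimated_time is not None:
        print(f"debug run: {debug_time:.2f}s, estimated full run: {full_estimated_time:.2f}s")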
@@ -56,7 +56,7 @@ def evaluate(
test_code = (DIRNAME / "eval_tests" / "data_loader_test.txt").read_text()
implementation.inject_files(**{fname: test_code})
result = implementation.run(env=env, entry=f"python {fname}")
- stdout = result.stdout
+ stdout = result.get_truncated_stdout()
ret_code = result.exit_code
match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===(.*)", stdout, re.DOTALL)
stdout_part_1, eda_output, stdout_part_2 = match.groups() if match else (stdout, None, "")
2 changes: 1 addition & 1 deletion rdagent/components/coder/data_science/workflow/eval.py
@@ -125,7 +125,7 @@ def evaluate(
implementation.inject_files(**{"test/submission_format_test.py": base_check_code})
# stdout += "----Submission Check 1-----\n"
submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
- submission_check_out = submission_result.stdout
+ submission_check_out = submission_result.get_truncated_stdout()
submission_ret_code = submission_result.exit_code
stdout += "\n" + submission_check_out

13 changes: 2 additions & 11 deletions rdagent/core/experiment.py
@@ -17,12 +17,10 @@

from rdagent.core.conf import RD_AGENT_SETTINGS
from rdagent.core.evaluation import Feedback
- from rdagent.utils import filter_redundant_text

if TYPE_CHECKING:
from rdagent.utils.env import EnvResult

- from rdagent.utils.fmt import shrink_text

if typing.TYPE_CHECKING:
from rdagent.core.proposal import Hypothesis
@@ -280,7 +278,7 @@ def execute(self, env: Env, entry: str) -> str:
Before each execution, make sure to prepare and inject code.
"""
result = self.run(env, entry)
- return result.stdout
+ return result.get_truncated_stdout()  # NOTE: truncating just for aligning with the old code.

def run(self, env: Env, entry: str) -> EnvResult:
"""
@@ -290,14 +288,7 @@ def run(self, env: Env, entry: str) -> EnvResult:
"""
self.prepare()
self.inject_files(**self.file_dict)
- result = env.run(entry, str(self.workspace_path), env={"PYTHONPATH": "./"})
- # result is EnvResult
- result.stdout = shrink_text(
-     filter_redundant_text(result.stdout),
-     context_lines=RD_AGENT_SETTINGS.stdout_context_len,
-     line_len=RD_AGENT_SETTINGS.stdout_line_len,
- )
- return result
+ return env.run(entry, str(self.workspace_path), env={"PYTHONPATH": "./"})

def create_ws_ckp(self) -> None:
"""
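The net effect of this file's changes, sketched below with simplified stand-ins (the real run() delegates to Env.run, and the real truncation applies shrink_text and filter_redundant_text with limits from RD_AGENT_SETTINGS): run() now returns the raw EnvResult untouched, and only callers that want the shortened text ask for it.

    from dataclasses import dataclass

    @dataclass
    class EnvResult:  # simplified stand-in for rdagent.utils.env.EnvResult
        stdout: str
        exit_code: int

        def get_truncated_stdout(self) -> str:
            return self.stdout[:2000]  # crude stand-in for the real filtering/shrinking

    def run(entry: str) -> EnvResult:
        # Post-PR contract: hand back the raw result; no in-place stdout mutation.
        return EnvResult(stdout=f"stdout of {entry!r}", exit_code=0)

    def execute(entry: str) -> str:
        # Pre-PR callers of execute() saw truncated text; that behavior is kept
        # by truncating at this boundary instead of inside run().
        return run(entry).get_truncated_stdout()

    print(execute("python main.py"))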
8 changes: 7 additions & 1 deletion rdagent/scenarios/data_science/dev/runner/__init__.py
@@ -1,3 +1,5 @@
+ from typing import Literal
+
import pandas as pd

from rdagent.app.data_science.conf import DS_RD_SETTING
@@ -19,7 +21,6 @@
from rdagent.core.scenario import Scenario
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend, md5_hash
- from rdagent.scenarios.data_science.dev.runner.eval import DSRunnerEvaluator
from rdagent.utils.agent.ret import PythonBatchEditOut, PythonBatchPatchOut
from rdagent.utils.agent.tpl import T
from rdagent.utils.workflow import wait_retry
@@ -34,6 +35,7 @@ class Config:
max_seconds_multiplier: int = 1
env_type: str = "docker"
diff_mode: bool = False
+ dump_stdout_type: Literal["full", "truncated"] = "truncated"
# TODO: extract a function for env and conf.


@@ -143,6 +145,10 @@ def __init__(
**kwargs,
) -> None:

+ from rdagent.scenarios.data_science.dev.runner.eval import (
+     DSRunnerEvaluator,  # avoid circular import
+ )
+
eval_l = [DSRunnerEvaluator(scen=scen)]
if DS_RD_SETTING.enable_model_dump:
eval_l.append(ModelDumpEvaluator(scen=scen, data_type="full"))
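Two things happen in this file: a new dump_stdout_type setting (default "truncated", so behavior is unchanged unless a user opts in), and the DSRunnerEvaluator import moves into __init__ because eval.py now imports DSRunnerCoSTEERSettings from this module, which would otherwise create an import cycle. A rough sketch of the setting, using a plain class as a stand-in for the real pydantic-style settings model:

    from typing import Literal

    class DSRunnerCoSTEERSettings:  # stand-in; the real class carries more fields (see the hunk above)
        dump_stdout_type: Literal["full", "truncated"] = "truncated"

    def choose_dump(full_text: str, truncated_text: str) -> str:
        # Mirrors the check added in runner/eval.py: dump the full stdout only
        # when explicitly configured to.
        if DSRunnerCoSTEERSettings().dump_stdout_type == "full":
            return full_text
        return truncated_text

    print(choose_dump("full stdout...", "truncated stdout..."))  # -> truncated stdout...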
10 changes: 8 additions & 2 deletions rdagent/scenarios/data_science/dev/runner/eval.py
@@ -17,6 +17,7 @@
from rdagent.core.experiment import FBWorkspace, Task
from rdagent.log import rdagent_logger as logger
from rdagent.log.timer import RD_Agent_TIMER_wrapper
+ from rdagent.scenarios.data_science.dev.runner import DSRunnerCoSTEERSettings
from rdagent.scenarios.data_science.test_eval import (
MLETestEval,
NoTestEvalError,
@@ -99,15 +100,20 @@ def evaluate(

# execute workflow
result = implementation.run(env=env, entry="python -m coverage run main.py")
- stdout = result.stdout
+ stdout = result.get_truncated_stdout()
execute_ret_code = result.exit_code
implementation.running_info.running_time = result.running_time

match = re.search(r"(.*?)=== Start of EDA part ===(.*)=== End of EDA part ===", stdout, re.DOTALL)
eda_output = match.groups()[1] if match else None
if eda_output is None:
eda_output = "No EDA output."
implementation.inject_files(**{"EDA.md": eda_output})
implementation.inject_files(
**{
"EDA.md": eda_output,
"stdout.txt": result.stdout if DSRunnerCoSTEERSettings().dump_stdout_type == "full" else stdout,
}
) # stdout.txt is used for debugging. not used in any other place.
stdout = remove_eda_part(stdout)
stdout += f"The code executed {'successfully' if execute_ret_code == 0 else 'failed'}. {'The EDA output is removed from the stdout. ' if eda_output else ''}"

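A self-contained sketch of the artifact dump added above, with hypothetical helper names (inject_files here just writes each name/content pair into a directory; the real FBWorkspace method is richer):

    from pathlib import Path

    def inject_files(workspace: Path, **files: str) -> None:
        # Hypothetical minimal version: write each name -> content pair to disk.
        workspace.mkdir(parents=True, exist_ok=True)
        for name, content in files.items():
            (workspace / name).write_text(content)

    def dump_artifacts(workspace: Path, raw_stdout: str, truncated_stdout: str,
                       eda_output: str | None, dump_stdout_type: str) -> None:
        inject_files(
            workspace,
            **{
                "EDA.md": eda_output if eda_output is not None else "No EDA output.",
                # stdout.txt is only a debugging aid; nothing downstream reads it.
                "stdout.txt": raw_stdout if dump_stdout_type == "full" else truncated_stdout,
            },
        )

    dump_artifacts(Path("./ws_demo"), "full text", "short text", None, "truncated")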
@@ -448,7 +448,7 @@ def _generate_and_run_script(
result = ws.run(
env=env, entry=f"python {script_type}.py --cache-buster={time.time()}"
) # Do not cache the result
- stdout = re.sub(r"^chmod:.*\n?", "", result.stdout, flags=re.MULTILINE)
+ stdout = re.sub(r"^chmod:.*\n?", "", result.get_truncated_stdout(), flags=re.MULTILINE)

if result.exit_code == 0:
logger.info(f"Successfully generated and ran {script_type}.py.")
@@ -458,7 +458,7 @@
running_timeout_period=DS_RD_SETTING.full_timeout,
)
result = ws.run(env=env, entry=f"python main.py --cache-buster={time.time()}")
- stdout = re.sub(r"^chmod:.*\n?", "", result.stdout, flags=re.MULTILINE)
+ stdout = re.sub(r"^chmod:.*\n?", "", result.get_truncated_stdout(), flags=re.MULTILINE)
if result.exit_code == 0:
# move submission.csv to mock_folder
if Path(ws.workspace_path / "submission.csv").exists():
@@ -530,7 +530,7 @@ def process_experiment(
env.conf.running_timeout_period = DS_RD_SETTING.debug_timeout
result = ws.run(env=env, entry="python grade.py")
if result.exit_code == 0:
- grade_stdout = re.sub(r"^chmod:.*\n?", "", result.stdout, flags=re.MULTILINE)
+ grade_stdout = re.sub(r"^chmod:.*\n?", "", result.get_truncated_stdout(), flags=re.MULTILINE)
logger.info(f"Ran grade.py for {competition}/{loop_id}; exit_code: {result.exit_code}")
else:
logger.warning(f"Skipping grading for {competition}/{loop_id} due to main.py execution failure.")
9 changes: 9 additions & 0 deletions rdagent/utils/env.py
@@ -41,7 +41,9 @@
from rdagent.core.experiment import RD_AGENT_SETTINGS
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import md5_hash
+ from rdagent.utils import filter_redundant_text
from rdagent.utils.agent.tpl import T
+ from rdagent.utils.fmt import shrink_text
from rdagent.utils.workflow import wait_retry


@@ -145,6 +147,13 @@ class EnvResult:
exit_code: int
running_time: float

+ def get_truncated_stdout(self) -> str:
+     return shrink_text(
+         filter_redundant_text(self.stdout),
+         context_lines=RD_AGENT_SETTINGS.stdout_context_len,
+         line_len=RD_AGENT_SETTINGS.stdout_line_len,
+     )


class Env(Generic[ASpecificEnvConf]):
"""
Expand Down