
Commit 10246fd

fix: add spec for hyperparameters in task design and coder (#995)

Authored by RolandMinruiXu, TPLin22, Hoder-zyf, and peteryang1.

Squashed commit history:

* init commit
* remove the 5-fold spec from prompts
* refine the hyperparameter specification
* do not sample data
* a small spelling issue
* refine prompt to avoid submission cheating
* do not sample data
* simplify code
* refine the coder evaluator prompt
* refine wording
* remove runtime from proposal
* refine wording
* refine prompt
* add gpu info in runtime_info.py
* modify the spec
* add router and add refinement exp gen
* fix prompt bug
* use rule-based logic for router
* complete the prompt
* fix circular import bug
* fix bug
* make refine_decision optional
* update pipeline prompts: (1) add scenario: in an iterative coding loop, use sample datasets; (2) add some generation tips in coding; (3) add evaluation guidelines in evaluation; (4) polish the JSON schema and description
* fix a small bug
* fix a small bug
* rdagent/scenarios/data_science/loop.py back to the original version
* refactor: replace _get_exp_gen with default_exp_gen for exp generation
* import
* refactor: make the __init__ back to main
* fix small bugs
* fix bugs for proposal_version
* move refine into runner
* check early stop
* EDA improvement & coder classes number
* fix CI
* slightly refine the prompt
* remove rule_base_eval and remove useless prompt

Co-authored-by: Xu <[email protected]>
Co-authored-by: TPLin22 <[email protected]>
Co-authored-by: amstrongzyf <[email protected]>
Co-authored-by: Xu Yang <[email protected]>
Co-authored-by: Xu Yang <[email protected]>
Co-authored-by: Young <[email protected]>

1 parent 4ccd7fc, commit 10246fd

File tree: 26 files changed, +510 −463 lines

.devcontainer/env

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ ENABLE_CACHE=False
 PROMPT_CACHE_PATH=./log/prompt_cache.db

 DS_CODER_COSTEER_ENV_TYPE=conda
-DS_PROPOSAL_VERSION=v2
+# DS_PROPOSAL_VERSION=v2 deprecated

 DS_CODER_ON_WHOLE_PIPELINE=True
 COSTEER_V2_QUERY_FORMER_TRACE_LIMIT=3

rdagent/app/data_science/conf.py

Lines changed: 3 additions & 3 deletions

@@ -38,14 +38,14 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     spec_enabled: bool = True

     #### proposal related
-    proposal_version: str = "v1"
-    coder_on_whole_pipeline: bool = False
+    # proposal_version: str = "v2" deprecated
+
+    coder_on_whole_pipeline: bool = True
     max_trace_hist: int = 3

     coder_max_loop: int = 10
     runner_max_loop: int = 1

-    rule_base_eval: bool = False
     sample_data_by_LLM: bool = False
     use_raw_description: bool = False
     show_nan_columns: bool = False
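For context, a minimal sketch of how a settings class like this resolves its defaults against the .devcontainer/env change above. Only the field names and defaults come from the diff; the pydantic-settings base class and the DS_ environment prefix are assumptions standing in for KaggleBasePropSetting.

from pydantic_settings import BaseSettings, SettingsConfigDict


class DataScienceBasePropSettingSketch(BaseSettings):
    # Assumed env prefix: DS_CODER_ON_WHOLE_PIPELINE=True in .devcontainer/env
    # would override the class default at load time.
    model_config = SettingsConfigDict(env_prefix="DS_")

    # proposal_version was removed in this commit; the whole-pipeline coder
    # is now the default rather than an opt-in flag.
    coder_on_whole_pipeline: bool = True
    max_trace_hist: int = 3
    coder_max_loop: int = 10
    runner_max_loop: int = 1
    sample_data_by_LLM: bool = False


settings = DataScienceBasePropSettingSketch()
print(settings.coder_on_whole_pipeline)  # True unless overridden by env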

rdagent/components/coder/data_science/pipeline/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -95,7 +95,6 @@ def implement_one_task(
             queried_former_failed_knowledge=queried_former_failed_knowledge[0],
             out_spec=PythonAgentOut.get_spec(),
             runtime_environment=runtime_environment,
-            spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
             enable_model_dump=DS_RD_SETTING.enable_model_dump,
             enable_debug_mode=DS_RD_SETTING.sample_data_by_LLM,
         )

rdagent/components/coder/data_science/pipeline/eval.py

Lines changed: 1 addition & 20 deletions

@@ -75,11 +75,7 @@ def evaluate(
         if match := re.search(r"estimated_time:\s*(\d+(?:.\d+)?)", result.stdout, re.DOTALL):
             full_estimated_time = float(match.group(1))
         if debug_time is not None and full_estimated_time is not None:
-            stdout += f"Debug mode ran in {debug_time:.2f} seconds, estimated full run time is {full_estimated_time:.2f} seconds.\n"
-            if full_estimated_time < env.conf.running_timeout_period * 3:
-                stdout += "The estimated full run time is less than three times the timeout period.\n"
-            else:
-                stdout += f"The estimated full run time is more than three times the timeout period.\n"
+            stdout += f"Debug mode ran in {debug_time:.2f} seconds, estimated full run time is {full_estimated_time:.2f} seconds. The estimated time is {full_estimated_time / env.conf.running_timeout_period * 100:.2f}% the debug time."
         else:
             stdout += "Debug mode did not provide debug_time or estimated_time, it's a buggy implementation.\n"

@@ -130,21 +126,6 @@ def evaluate(
         submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
         submission_check_out = submission_result.stdout
         submission_ret_code = submission_result.exit_code
-        if DS_RD_SETTING.rule_base_eval:
-            if execute_ret_code == 0 and score_ret_code == 0 and submission_ret_code == 0:
-                return PipelineSingleFeedback(
-                    execution=stdout,
-                    return_checking=score_check_text + "\n" + submission_check_out,
-                    code="Code evaluation is not available.",
-                    final_decision=True,
-                )
-            else:
-                return PipelineSingleFeedback(
-                    execution=stdout,
-                    return_checking=score_check_text + "\n" + submission_check_out,
-                    code="Code evaluation is not available.",
-                    final_decision=False,
-                )
         stdout += "\n" + submission_check_out

         if not isinstance(implementation, FBWorkspace):
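To make the new one-line message concrete, here is a minimal sketch of the computation it performs. The helper function is hypothetical; note that the committed f-string computes the percentage against env.conf.running_timeout_period even though its wording says "the debug time".

def timing_message(debug_time: float, full_estimated_time: float, running_timeout_period: float) -> str:
    # Sketch of the message built in eval.py; times are assumed to be seconds.
    # The ratio is taken against the timeout period, so 100% means the
    # estimated full run would exactly consume the allotted time.
    percent_of_timeout = full_estimated_time / running_timeout_period * 100
    return (
        f"Debug mode ran in {debug_time:.2f} seconds, "
        f"estimated full run time is {full_estimated_time:.2f} seconds. "
        f"The estimated time is {percent_of_timeout:.2f}% of the timeout period."
    )


print(timing_message(debug_time=12.5, full_estimated_time=1800.0, running_timeout_period=3600.0))
# -> "... The estimated time is 50.00% of the timeout period."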

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 68 additions & 20 deletions (large diff not rendered by default)

rdagent/core/proposal.py

Lines changed: 13 additions & 1 deletion

@@ -57,9 +57,13 @@ def __init__(
         *,
         code_change_summary: str | None = None,
         decision: bool,
+        refine_decision: bool = False,
+        eda_improvement: str | None = None,
         exception: Exception | None = None,
     ) -> None:
         self.decision = decision
+        self.refine_decision = refine_decision
+        self.eda_improvement = eda_improvement
         self.reason = reason
         # Exception is not None means failing to generate runnable experiments due to exception.
         # Runable reuslts are not always good.
@@ -96,8 +100,16 @@ def __init__(
         *,
         code_change_summary: str | None = None,
         decision: bool,
+        refine_decision: bool = False,
+        eda_improvement: str | None = None,
     ) -> None:
-        super().__init__(reason, decision=decision, code_change_summary=code_change_summary)
+        super().__init__(
+            reason,
+            decision=decision,
+            refine_decision=refine_decision,
+            code_change_summary=code_change_summary,
+            eda_improvement=eda_improvement,
+        )
         self.observations = observations
         self.hypothesis_evaluation = hypothesis_evaluation
         self.new_hypothesis = new_hypothesis
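A hypothetical construction showing how the widened signature is used. Both new keyword arguments default to inert values (False / None), so existing call sites that omit them keep working; the argument names follow the diff, while the field values here are illustrative.

from rdagent.core.proposal import HypothesisFeedback

# Illustrative sketch; refine_decision and eda_improvement are the two
# fields added by this commit.
fb = HypothesisFeedback(
    observations="Validation score improved from 0.871 to 0.884.",
    hypothesis_evaluation="The score gain supports the hypothesis.",
    new_hypothesis="Tune the learning-rate schedule next.",
    reason="[Experiment Analysis] The current run beats SOTA on the target metric.",
    decision=True,
    refine_decision=False,  # new: request refinement instead of a fresh experiment
    eda_improvement=None,   # new: optional suggestion for improving the EDA code
)
assert fb.refine_decision is False and fb.eda_improvement is None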

rdagent/scenarios/data_science/dev/feedback.py

Lines changed: 2 additions & 26 deletions

@@ -61,32 +61,6 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
             f"The current score is {cur_score}, while the SOTA score is {sota_score}. "
             f"{'In this competition, higher is better.' if self.scen.metric_direction else 'In this competition, lower is better.'}"
         )
-        if DS_RD_SETTING.rule_base_eval:
-            if sota_exp:
-                if cur_score > sota_score:
-                    return HypothesisFeedback(
-                        observations="The current score bigger than the SOTA score.",
-                        hypothesis_evaluation="The current score is bigger than the SOTA score.",
-                        new_hypothesis="No new hypothesis provided",
-                        reason="The current score is bigger than the SOTA score.",
-                        decision=True if self.scen.metric_direction else False,
-                    )
-                elif cur_score < sota_score:
-                    return HypothesisFeedback(
-                        observations="The current score smaller than the SOTA score.",
-                        hypothesis_evaluation="The current score is smaller than the SOTA score.",
-                        new_hypothesis="No new hypothesis provided",
-                        reason="The current score is smaller than the SOTA score.",
-                        decision=False if self.scen.metric_direction else True,
-                    )
-                else:
-                    return HypothesisFeedback(
-                        observations="The current score equals to the SOTA score.",
-                        hypothesis_evaluation="The current score equals to the SOTA score.",
-                        new_hypothesis="No new hypothesis provided",
-                        reason="The current score equals to the SOTA score.",
-                        decision=False,
-                    )

         eda_output = exp.experiment_workspace.file_dict.get("EDA.md", None)
         system_prompt = T(".prompts:exp_feedback.system").r(
@@ -128,6 +102,8 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
                 if evaluation_not_aligned
                 else convert2bool(dict_get_with_warning(resp_dict, "Replace Best Result", "no"))
             ),
+            refine_decision=convert2bool(dict_get_with_warning(resp_dict, "Refine Decision", "no")),
+            eda_improvement=dict_get_with_warning(resp_dict, "EDA Improvement", "no"),  # EDA improvement suggestion
        )

         if hypothesis_feedback and DS_RD_SETTING.enable_knowledge_base:

rdagent/scenarios/data_science/dev/prompts.yaml

Lines changed: 13 additions & 6 deletions

@@ -5,9 +5,9 @@ exp_feedback:
     Below is a detailed description of the current Kaggle competition scenario:
     {{ scenario }}

-    Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with previous experiments and the best previous result (SOTA).
+    Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with previous best SOTA result step by step.

-    Step-by-step Analysis Process:
+    # Step-by-step Analysis Process:

     Step 1: Verify Submission Format
     - If the submission format check fails:
@@ -57,9 +57,14 @@ exp_feedback:
     - Please examine the code carefully based on the above criteria and provide a detailed analysis of the code.
     - Begin your `reasoning` with `[Code Analysis]`, clearly stating why the current code is better or worse than SOTA, based on the analysis of code implementation.
     - If the current code is not better than SOTA, set `"Replace Best Result": "no"`. Otherwise, set `"Replace Best Result": "yes"`.
-
-    Provide detailed and constructive feedback structured as follows:
-    Example JSON Structure for Result Analysis:
+
+    Step 5: EDA improvement analysis (if needed)
+    - The user might provide Data Overview in EDA format which is the output of the EDA code. You should analyze the EDA result and provide feedback on how it can be improved.
+    - The improvement might include some addons or modifications or deletions to some part of the EDA code.
+    - You should provide your feedback based on the current code and SOTA code. Especially focus on the feature engineering part.
+    - For example, if the code truncate the line with N words, you can suggest to print the mean, median or quantile of the length of the line for better understanding of the data in the next rounds of experiments.
+
+    Provide detailed and constructive feedback structured as follows without anything else:
     {
         "Submission Format Check": "yes or no",
         "First Valid Submission": "yes or no",
@@ -68,7 +73,9 @@ exp_feedback:
         "Feedback for Hypothesis": Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
         "Evaluation Aligned With Task": "yes or no",
         "Replace Best Result": "yes or no",
-        "Reasoning": "Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences."
+        "Refine Decision": "yes or no",
+        "Reasoning": "Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences.",
+        "EDA Improvement": "improvement suggestion for EDA code, if needed, otherwise set to 'no'. If there is no EDA code, set to 'no'."
     }

   user: |-
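For illustration, a response satisfying the updated schema might look like the following sketch, trimmed to the keys visible in this diff with hypothetical values. The trailing dict access is a simplified stand-in for the dict_get_with_warning/convert2bool handling in feedback.py above.

import json

resp = json.loads("""
{
  "Submission Format Check": "yes",
  "First Valid Submission": "no",
  "Feedback for Hypothesis": "The CV gain from 0.712 to 0.724 confirms the hypothesis.",
  "Evaluation Aligned With Task": "yes",
  "Replace Best Result": "yes",
  "Refine Decision": "no",
  "Reasoning": "[Experiment Analysis] The current score beats SOTA on the target metric.",
  "EDA Improvement": "no"
}
""")

# Mirrors the feedback.py change: "Refine Decision" is coerced to a bool,
# while "EDA Improvement" is passed through as a string ('no' means none).
refine_decision = resp.get("Refine Decision", "no").lower() == "yes"
eda_improvement = resp.get("EDA Improvement", "no")
print(refine_decision, eda_improvement)  # False no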

rdagent/scenarios/data_science/dev/runner/__init__.py

Lines changed: 15 additions & 8 deletions

@@ -40,16 +40,23 @@ def implement_one_task(
         if prev_task_feedback is None:
             # if no prev_tak_feedback, it is the first loop; we do not make any changes and goto evaluators directly.
             return {}
-
-        task_information_str = target_task.get_task_information()
-        # 1. code
-        system_prompt = T(".prompts:DSCoSTEER_debugger.system").r(
-            task_desc=task_information_str,
-            out_spec=PythonBatchEditOut.get_spec(with_del=False),
-        )
-        user_prompt = T(".prompts:DSCoSTEER_debugger.user").r(
+        if prev_task_feedback.hyperparameter_tuning_decision:
+            task_information_str = target_task.get_task_information()
+            # 1. code
+            system_prompt = T(".prompts:DSCoSTEER.system_refine").r(
+                out_spec=PythonBatchEditOut.get_spec(with_del=False),
+            )
+        else:
+            task_information_str = target_task.get_task_information()
+            # 1. code
+            system_prompt = T(".prompts:DSCoSTEER.system_refine").r(
+                task_desc=task_information_str,
+                out_spec=PythonBatchEditOut.get_spec(with_del=False),
+            )
+        user_prompt = T(".prompts:DSCoSTEER.user").r(
             code=workspace.all_codes,
             feedback=prev_task_feedback,
+            hyperparameter_tuning_suggestion=prev_task_feedback.hyperparameter_tuning_suggestion,
         )

         batch_edit = PythonBatchEditOut.extract_output(
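A hypothetical, self-contained illustration of the routing this change introduces: the evaluator's hyperparameter_tuning_decision flag selects the refinement prompt, otherwise the path that includes the task description is taken. DummyFeedback stands in for DSCoSTEEREvalFeedback here.

from dataclasses import dataclass


@dataclass
class DummyFeedback:
    # Stand-in for DSCoSTEEREvalFeedback, carrying only the two new fields.
    hyperparameter_tuning_decision: bool
    hyperparameter_tuning_suggestion: str | None


def choose_path(fb: DummyFeedback) -> str:
    # Mirrors the branch in implement_one_task: a positive tuning decision
    # routes to refinement; otherwise the task description is rendered
    # into the system prompt for debugging.
    return "refine" if fb.hyperparameter_tuning_decision else "debug"


print(choose_path(DummyFeedback(True, "Increase n_estimators; the run used only 40% of the timeout.")))
# -> refine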

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 16 additions & 22 deletions

@@ -25,7 +25,19 @@

 DIRNAME = Path(__file__).absolute().resolve().parent

-DSCoSTEEREvalFeedback = CoSTEERSingleFeedback
+
+class DSCoSTEEREvalFeedback(CoSTEERSingleFeedback):
+    """
+    Feedback for Data Science CoSTEER evaluation.
+    This feedback is used to evaluate the code and execution of the Data Science CoSTEER task.
+    """
+
+    def __init__(
+        self, *args, hyperparameter_tuning_decision: bool = None, hyperparameter_tuning_suggestion: str = None, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.hyperparameter_tuning_decision = hyperparameter_tuning_decision
+        self.hyperparameter_tuning_suggestion = hyperparameter_tuning_suggestion


 class DSCoSTEERCoSTEEREvaluator(CoSTEEREvaluator):
@@ -116,27 +128,6 @@ def evaluate(
         if test_eval.enabled(self.scen.competition):
             submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
             stdout += f"\nSubmission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
-            if DS_RD_SETTING.rule_base_eval:
-                if DS_RD_SETTING.if_using_mle_data:
-                    score_check_text = score_check_text + "\n" + submission_check_out
-                if (
-                    execute_ret_code == 0
-                    and score_ret_code == 0
-                    and (not DS_RD_SETTING.if_using_mle_data or submission_ret_code == 0)
-                ):
-                    return DSCoSTEEREvalFeedback(
-                        execution=stdout,
-                        return_checking=score_check_text,
-                        code="Code evaluation is not available.",
-                        final_decision=True,
-                    )
-                else:
-                    return DSCoSTEEREvalFeedback(
-                        execution=stdout,
-                        return_checking=score_check_text,
-                        code="Code evaluation is not available.",
-                        final_decision=False,
-                    )

         system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
             scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
@@ -146,6 +137,9 @@ def evaluate(
         user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
             code=implementation.all_codes,
             stdout=shrink_text(stdout),
+            time_spent=f"{implementation.running_info.running_time:.2f} seconds",
+            timeout=f"{env.conf.running_timeout_period} seconds",
+            percent_of_timeout_used=f"{(implementation.running_info.running_time / env.conf.running_timeout_period) * 100:.2f}%",
         )

         feedback = build_cls_from_json_with_retry(
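A minimal sketch of the three new runtime figures rendered into the evaluator's user prompt, assuming running_time and running_timeout_period are both in seconds as in the diff; the helper name is hypothetical.

def runtime_prompt_fields(running_time: float, running_timeout_period: float) -> dict[str, str]:
    # Mirrors the three new keyword arguments passed to the user prompt.
    return {
        "time_spent": f"{running_time:.2f} seconds",
        "timeout": f"{running_timeout_period} seconds",
        # Percentage of the allotted time actually consumed; a low value
        # suggests headroom for heavier hyperparameter settings.
        "percent_of_timeout_used": f"{(running_time / running_timeout_period) * 100:.2f}%",
    }


print(runtime_prompt_fields(1440.0, 3600))
# -> {'time_spent': '1440.00 seconds', 'timeout': '3600 seconds', 'percent_of_timeout_used': '40.00%'}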
