
Commit 10246fd

fix: add spec for hyperparameters in task design and coder (#995)

Authored by RolandMinruiXu, TPLin22, Hoder-zyf, and peteryang1.

Squashed commit history:

* init commit
* remove the 5-fold spec from prompts
* refine the hyperparameter specification
* do not sample data
* a small spelling issue
* refine prompt to avoid submission cheating
* do not sample data
* simplify code
* refine the coder evaluator prompt
* refine wording
* remove runtime from proposal
* refine wording
* refine prompt
* add gpu info in runtime_info.py
* modify the spec
* add router and add refinement exp gen
* fix prompt bug
* use rule-based logic for router
* complete the prompt
* fix circular import bug
* fix bug
* make refine_decision optional
* update pipeline prompts: (1) add scenario: in an iterative coding loop, use sample datasets; (2) add some generation tips in coding; (3) add evaluation guidelines in evaluation; (4) polish the JSON schema and description
* fix a small bug
* fix a small bug
* rdagent/scenarios/data_science/loop.py back to the original version
* refactor: replace _get_exp_gen with default_exp_gen for exp generation
* import
* refactor: make the __init__ back to main
* fix small bugs
* fix bugs for proposal_version
* move refine into runner
* check early stop
* EDA improvement & coder classes number
* fix CI
* slightly refine the prompt
* remove rule_base_eval and remove useless prompt

Co-authored-by: Xu <[email protected]>
Co-authored-by: TPLin22 <[email protected]>
Co-authored-by: amstrongzyf <[email protected]>
Co-authored-by: Xu Yang <[email protected]>
Co-authored-by: Xu Yang <[email protected]>
Co-authored-by: Young <[email protected]>

1 parent 4ccd7fc, commit 10246fd

File tree: 26 files changed, +510 −463 lines

.devcontainer/env

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ ENABLE_CACHE=False
 PROMPT_CACHE_PATH=./log/prompt_cache.db

 DS_CODER_COSTEER_ENV_TYPE=conda
-DS_PROPOSAL_VERSION=v2
+# DS_PROPOSAL_VERSION=v2 deprecated

 DS_CODER_ON_WHOLE_PIPELINE=True
 COSTEER_V2_QUERY_FORMER_TRACE_LIMIT=3

rdagent/app/data_science/conf.py

Lines changed: 3 additions & 3 deletions

@@ -38,14 +38,14 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     spec_enabled: bool = True

     #### proposal related
-    proposal_version: str = "v1"
-    coder_on_whole_pipeline: bool = False
+    # proposal_version: str = "v2" deprecated
+
+    coder_on_whole_pipeline: bool = True
     max_trace_hist: int = 3

     coder_max_loop: int = 10
     runner_max_loop: int = 1

-    rule_base_eval: bool = False
     sample_data_by_LLM: bool = False
     use_raw_description: bool = False
     show_nan_columns: bool = False
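For context, a minimal sketch of how a settings class like this resolves its defaults against the .devcontainer/env change above. Only the field names and defaults come from the diff; the pydantic-settings base class and the DS_ environment prefix are assumptions standing in for KaggleBasePropSetting.

from pydantic_settings import BaseSettings, SettingsConfigDict


class DataScienceBasePropSettingSketch(BaseSettings):
    # Assumed env prefix: DS_CODER_ON_WHOLE_PIPELINE=True in .devcontainer/env
    # would override the class default at load time.
    model_config = SettingsConfigDict(env_prefix="DS_")

    # proposal_version was removed in this commit; the whole-pipeline coder
    # is now the default rather than an opt-in flag.
    coder_on_whole_pipeline: bool = True
    max_trace_hist: int = 3
    coder_max_loop: int = 10
    runner_max_loop: int = 1
    sample_data_by_LLM: bool = False


settings = DataScienceBasePropSettingSketch()
print(settings.coder_on_whole_pipeline)  # True unless overridden by env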

rdagent/components/coder/data_science/pipeline/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -95,7 +95,6 @@ def implement_one_task(
             queried_former_failed_knowledge=queried_former_failed_knowledge[0],
             out_spec=PythonAgentOut.get_spec(),
             runtime_environment=runtime_environment,
-            spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
             enable_model_dump=DS_RD_SETTING.enable_model_dump,
             enable_debug_mode=DS_RD_SETTING.sample_data_by_LLM,
         )

rdagent/components/coder/data_science/pipeline/eval.py

Lines changed: 1 addition & 20 deletions

@@ -75,11 +75,7 @@ def evaluate(
         if match := re.search(r"estimated_time:\s*(\d+(?:.\d+)?)", result.stdout, re.DOTALL):
             full_estimated_time = float(match.group(1))
         if debug_time is not None and full_estimated_time is not None:
-            stdout += f"Debug mode ran in {debug_time:.2f} seconds, estimated full run time is {full_estimated_time:.2f} seconds.\n"
-            if full_estimated_time < env.conf.running_timeout_period * 3:
-                stdout += "The estimated full run time is less than three times the timeout period.\n"
-            else:
-                stdout += f"The estimated full run time is more than three times the timeout period.\n"
+            stdout += f"Debug mode ran in {debug_time:.2f} seconds, estimated full run time is {full_estimated_time:.2f} seconds. The estimated time is {full_estimated_time / env.conf.running_timeout_period * 100:.2f}% the debug time."
         else:
             stdout += "Debug mode did not provide debug_time or estimated_time, it's a buggy implementation.\n"

@@ -130,21 +126,6 @@ def evaluate(
         submission_result = implementation.run(env=env, entry="python test/submission_format_test.py")
         submission_check_out = submission_result.stdout
         submission_ret_code = submission_result.exit_code
-        if DS_RD_SETTING.rule_base_eval:
-            if execute_ret_code == 0 and score_ret_code == 0 and submission_ret_code == 0:
-                return PipelineSingleFeedback(
-                    execution=stdout,
-                    return_checking=score_check_text + "\n" + submission_check_out,
-                    code="Code evaluation is not available.",
-                    final_decision=True,
-                )
-            else:
-                return PipelineSingleFeedback(
-                    execution=stdout,
-                    return_checking=score_check_text + "\n" + submission_check_out,
-                    code="Code evaluation is not available.",
-                    final_decision=False,
-                )
         stdout += "\n" + submission_check_out

         if not isinstance(implementation, FBWorkspace):
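To make the new one-line message concrete, here is a minimal sketch of the computation it performs. The helper function is hypothetical; note that the committed f-string computes the percentage against env.conf.running_timeout_period even though its wording says "the debug time".

def timing_message(debug_time: float, full_estimated_time: float, running_timeout_period: float) -> str:
    # Sketch of the message built in eval.py; times are assumed to be seconds.
    # The ratio is taken against the timeout period, so 100% means the
    # estimated full run would exactly consume the allotted time.
    percent_of_timeout = full_estimated_time / running_timeout_period * 100
    return (
        f"Debug mode ran in {debug_time:.2f} seconds, "
        f"estimated full run time is {full_estimated_time:.2f} seconds. "
        f"The estimated time is {percent_of_timeout:.2f}% of the timeout period."
    )


print(timing_message(debug_time=12.5, full_estimated_time=1800.0, running_timeout_period=3600.0))
# -> "... The estimated time is 50.00% of the timeout period."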

rdagent/components/coder/data_science/pipeline/prompts.yaml

Lines changed: 68 additions & 20 deletions (large diff not rendered by default)

rdagent/core/proposal.py

Lines changed: 13 additions & 1 deletion

@@ -57,9 +57,13 @@ def __init__(
         *,
         code_change_summary: str | None = None,
         decision: bool,
+        refine_decision: bool = False,
+        eda_improvement: str | None = None,
         exception: Exception | None = None,
     ) -> None:
         self.decision = decision
+        self.refine_decision = refine_decision
+        self.eda_improvement = eda_improvement
         self.reason = reason
         # Exception is not None means failing to generate runnable experiments due to exception.
         # Runable reuslts are not always good.
@@ -96,8 +100,16 @@ def __init__(
         *,
         code_change_summary: str | None = None,
         decision: bool,
+        refine_decision: bool = False,
+        eda_improvement: str | None = None,
     ) -> None:
-        super().__init__(reason, decision=decision, code_change_summary=code_change_summary)
+        super().__init__(
+            reason,
+            decision=decision,
+            refine_decision=refine_decision,
+            code_change_summary=code_change_summary,
+            eda_improvement=eda_improvement,
+        )
         self.observations = observations
         self.hypothesis_evaluation = hypothesis_evaluation
         self.new_hypothesis = new_hypothesis
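A hypothetical construction showing how the widened signature is used. Both new keyword arguments default to inert values (False / None), so existing call sites that omit them keep working; the argument names follow the diff, while the field values here are illustrative.

from rdagent.core.proposal import HypothesisFeedback

# Illustrative sketch; refine_decision and eda_improvement are the two
# fields added by this commit.
fb = HypothesisFeedback(
    observations="Validation score improved from 0.871 to 0.884.",
    hypothesis_evaluation="The score gain supports the hypothesis.",
    new_hypothesis="Tune the learning-rate schedule next.",
    reason="[Experiment Analysis] The current run beats SOTA on the target metric.",
    decision=True,
    refine_decision=False,  # new: request refinement instead of a fresh experiment
    eda_improvement=None,   # new: optional suggestion for improving the EDA code
)
assert fb.refine_decision is False and fb.eda_improvement is None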

rdagent/scenarios/data_science/dev/feedback.py

Lines changed: 2 additions & 26 deletions

@@ -61,32 +61,6 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
             f"The current score is {cur_score}, while the SOTA score is {sota_score}. "
             f"{'In this competition, higher is better.' if self.scen.metric_direction else 'In this competition, lower is better.'}"
         )
-        if DS_RD_SETTING.rule_base_eval:
-            if sota_exp:
-                if cur_score > sota_score:
-                    return HypothesisFeedback(
-                        observations="The current score bigger than the SOTA score.",
-                        hypothesis_evaluation="The current score is bigger than the SOTA score.",
-                        new_hypothesis="No new hypothesis provided",
-                        reason="The current score is bigger than the SOTA score.",
-                        decision=True if self.scen.metric_direction else False,
-                    )
-                elif cur_score < sota_score:
-                    return HypothesisFeedback(
-                        observations="The current score smaller than the SOTA score.",
-                        hypothesis_evaluation="The current score is smaller than the SOTA score.",
-                        new_hypothesis="No new hypothesis provided",
-                        reason="The current score is smaller than the SOTA score.",
-                        decision=False if self.scen.metric_direction else True,
-                    )
-                else:
-                    return HypothesisFeedback(
-                        observations="The current score equals to the SOTA score.",
-                        hypothesis_evaluation="The current score equals to the SOTA score.",
-                        new_hypothesis="No new hypothesis provided",
-                        reason="The current score equals to the SOTA score.",
-                        decision=False,
-                    )

         eda_output = exp.experiment_workspace.file_dict.get("EDA.md", None)
         system_prompt = T(".prompts:exp_feedback.system").r(
@@ -128,6 +102,8 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
                 if evaluation_not_aligned
                 else convert2bool(dict_get_with_warning(resp_dict, "Replace Best Result", "no"))
             ),
+            refine_decision=convert2bool(dict_get_with_warning(resp_dict, "Refine Decision", "no")),
+            eda_improvement=dict_get_with_warning(resp_dict, "EDA Improvement", "no"),  # EDA improvement suggestion
        )

         if hypothesis_feedback and DS_RD_SETTING.enable_knowledge_base:

rdagent/scenarios/data_science/dev/prompts.yaml

Lines changed: 13 additions & 6 deletions

@@ -5,9 +5,9 @@ exp_feedback:
     Below is a detailed description of the current Kaggle competition scenario:
     {{ scenario }}

-    Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with previous experiments and the best previous result (SOTA).
+    Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with previous best SOTA result step by step.

-    Step-by-step Analysis Process:
+    # Step-by-step Analysis Process:

     Step 1: Verify Submission Format
     - If the submission format check fails:
@@ -57,9 +57,14 @@ exp_feedback:
     - Please examine the code carefully based on the above criteria and provide a detailed analysis of the code.
     - Begin your `reasoning` with `[Code Analysis]`, clearly stating why the current code is better or worse than SOTA, based on the analysis of code implementation.
     - If the current code is not better than SOTA, set `"Replace Best Result": "no"`. Otherwise, set `"Replace Best Result": "yes"`.
-
-    Provide detailed and constructive feedback structured as follows:
-    Example JSON Structure for Result Analysis:
+
+    Step 5: EDA improvement analysis (if needed)
+    - The user might provide Data Overview in EDA format which is the output of the EDA code. You should analyze the EDA result and provide feedback on how it can be improved.
+    - The improvement might include some addons or modifications or deletions to some part of the EDA code.
+    - You should provide your feedback based on the current code and SOTA code. Especially focus on the feature engineering part.
+    - For example, if the code truncate the line with N words, you can suggest to print the mean, median or quantile of the length of the line for better understanding of the data in the next rounds of experiments.
+
+    Provide detailed and constructive feedback structured as follows without anything else:
     {
         "Submission Format Check": "yes or no",
         "First Valid Submission": "yes or no",
@@ -68,7 +73,9 @@ exp_feedback:
         "Feedback for Hypothesis": Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
         "Evaluation Aligned With Task": "yes or no",
         "Replace Best Result": "yes or no",
-        "Reasoning": "Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences."
+        "Refine Decision": "yes or no",
+        "Reasoning": "Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences.",
+        "EDA Improvement": "improvement suggestion for EDA code, if needed, otherwise set to 'no'. If there is no EDA code, set to 'no'."
     }

   user: |-
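For illustration, a response satisfying the updated schema might look like the following sketch, trimmed to the keys visible in this diff with hypothetical values. The trailing dict access is a simplified stand-in for the dict_get_with_warning/convert2bool handling in feedback.py above.

import json

resp = json.loads("""
{
  "Submission Format Check": "yes",
  "First Valid Submission": "no",
  "Feedback for Hypothesis": "The CV gain from 0.712 to 0.724 confirms the hypothesis.",
  "Evaluation Aligned With Task": "yes",
  "Replace Best Result": "yes",
  "Refine Decision": "no",
  "Reasoning": "[Experiment Analysis] The current score beats SOTA on the target metric.",
  "EDA Improvement": "no"
}
""")

# Mirrors the feedback.py change: "Refine Decision" is coerced to a bool,
# while "EDA Improvement" is passed through as a string ('no' means none).
refine_decision = resp.get("Refine Decision", "no").lower() == "yes"
eda_improvement = resp.get("EDA Improvement", "no")
print(refine_decision, eda_improvement)  # False no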

rdagent/scenarios/data_science/dev/runner/__init__.py

Lines changed: 15 additions & 8 deletions

@@ -40,16 +40,23 @@ def implement_one_task(
         if prev_task_feedback is None:
             # if no prev_tak_feedback, it is the first loop; we do not make any changes and goto evaluators directly.
             return {}
-
-        task_information_str = target_task.get_task_information()
-        # 1. code
-        system_prompt = T(".prompts:DSCoSTEER_debugger.system").r(
-            task_desc=task_information_str,
-            out_spec=PythonBatchEditOut.get_spec(with_del=False),
-        )
-        user_prompt = T(".prompts:DSCoSTEER_debugger.user").r(
+        if prev_task_feedback.hyperparameter_tuning_decision:
+            task_information_str = target_task.get_task_information()
+            # 1. code
+            system_prompt = T(".prompts:DSCoSTEER.system_refine").r(
+                out_spec=PythonBatchEditOut.get_spec(with_del=False),
+            )
+        else:
+            task_information_str = target_task.get_task_information()
+            # 1. code
+            system_prompt = T(".prompts:DSCoSTEER.system_refine").r(
+                task_desc=task_information_str,
+                out_spec=PythonBatchEditOut.get_spec(with_del=False),
+            )
+        user_prompt = T(".prompts:DSCoSTEER.user").r(
             code=workspace.all_codes,
             feedback=prev_task_feedback,
+            hyperparameter_tuning_suggestion=prev_task_feedback.hyperparameter_tuning_suggestion,
         )

         batch_edit = PythonBatchEditOut.extract_output(
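A hypothetical, self-contained illustration of the routing this change introduces: the evaluator's hyperparameter_tuning_decision flag selects the refinement prompt, otherwise the path that includes the task description is taken. DummyFeedback stands in for DSCoSTEEREvalFeedback here.

from dataclasses import dataclass


@dataclass
class DummyFeedback:
    # Stand-in for DSCoSTEEREvalFeedback, carrying only the two new fields.
    hyperparameter_tuning_decision: bool
    hyperparameter_tuning_suggestion: str | None


def choose_path(fb: DummyFeedback) -> str:
    # Mirrors the branch in implement_one_task: a positive tuning decision
    # routes to refinement; otherwise the task description is rendered
    # into the system prompt for debugging.
    return "refine" if fb.hyperparameter_tuning_decision else "debug"


print(choose_path(DummyFeedback(True, "Increase n_estimators; the run used only 40% of the timeout.")))
# -> refine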

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 16 additions & 22 deletions

@@ -25,7 +25,19 @@

 DIRNAME = Path(__file__).absolute().resolve().parent

-DSCoSTEEREvalFeedback = CoSTEERSingleFeedback
+
+class DSCoSTEEREvalFeedback(CoSTEERSingleFeedback):
+    """
+    Feedback for Data Science CoSTEER evaluation.
+    This feedback is used to evaluate the code and execution of the Data Science CoSTEER task.
+    """
+
+    def __init__(
+        self, *args, hyperparameter_tuning_decision: bool = None, hyperparameter_tuning_suggestion: str = None, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.hyperparameter_tuning_decision = hyperparameter_tuning_decision
+        self.hyperparameter_tuning_suggestion = hyperparameter_tuning_suggestion


 class DSCoSTEERCoSTEEREvaluator(CoSTEEREvaluator):
@@ -116,27 +128,6 @@ def evaluate(
         if test_eval.enabled(self.scen.competition):
             submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
             stdout += f"\nSubmission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
-            if DS_RD_SETTING.rule_base_eval:
-                if DS_RD_SETTING.if_using_mle_data:
-                    score_check_text = score_check_text + "\n" + submission_check_out
-                if (
-                    execute_ret_code == 0
-                    and score_ret_code == 0
-                    and (not DS_RD_SETTING.if_using_mle_data or submission_ret_code == 0)
-                ):
-                    return DSCoSTEEREvalFeedback(
-                        execution=stdout,
-                        return_checking=score_check_text,
-                        code="Code evaluation is not available.",
-                        final_decision=True,
-                    )
-                else:
-                    return DSCoSTEEREvalFeedback(
-                        execution=stdout,
-                        return_checking=score_check_text,
-                        code="Code evaluation is not available.",
-                        final_decision=False,
-                    )

         system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
             scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
@@ -146,6 +137,9 @@ def evaluate(
         user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
             code=implementation.all_codes,
             stdout=shrink_text(stdout),
+            time_spent=f"{implementation.running_info.running_time:.2f} seconds",
+            timeout=f"{env.conf.running_timeout_period} seconds",
+            percent_of_timeout_used=f"{(implementation.running_info.running_time / env.conf.running_timeout_period) * 100:.2f}%",
         )

         feedback = build_cls_from_json_with_retry(
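A minimal sketch of the three new runtime figures rendered into the evaluator's user prompt, assuming running_time and running_timeout_period are both in seconds as in the diff; the helper name is hypothetical.

def runtime_prompt_fields(running_time: float, running_timeout_period: float) -> dict[str, str]:
    # Mirrors the three new keyword arguments passed to the user prompt.
    return {
        "time_spent": f"{running_time:.2f} seconds",
        "timeout": f"{running_timeout_period} seconds",
        # Percentage of the allotted time actually consumed; a low value
        # suggests headroom for heavier hyperparameter settings.
        "percent_of_timeout_used": f"{(running_time / running_timeout_period) * 100:.2f}%",
    }


print(runtime_prompt_fields(1440.0, 3600))
# -> {'time_spent': '1440.00 seconds', 'timeout': '3600 seconds', 'percent_of_timeout_used': '40.00%'}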
