Merged
Changes from 25 commits
Commits (43)
da36f43
init commit
Jun 27, 2025
979640a
remove the 5-fold spec from prompts
Jun 27, 2025
2c87022
refine the hyperparameter specification
Jun 27, 2025
ccdb471
do not sample data
Jun 27, 2025
84bf563
a small spelling issue
TPLin22 Jun 27, 2025
13be390
refine prompt to avoid submission cheating
TPLin22 Jun 27, 2025
4ca0411
do not sample data
Jun 27, 2025
c122816
simplify code
Jun 27, 2025
ffec796
refine the coder evaluator prompt
Jun 27, 2025
ffe70ca
refine wording
RolandMinrui Jun 27, 2025
b1f03f2
remove runtime from proposal
Jun 27, 2025
771e7e8
refine wording
Jun 27, 2025
55d8d03
refine prompt
Jun 27, 2025
3619c95
add gpu info in runtime_info.py
Jun 27, 2025
3f487fe
Merge branch 'main' of https://github.com/microsoft/RD-Agent into min…
Jun 30, 2025
6ec2080
modify the spec
Jun 30, 2025
7d27e09
add router and add refinement exp gen
Jul 1, 2025
b669365
fix prompt bug
Jul 2, 2025
bbb8bcf
Merge branch 'main' of https://github.com/microsoft/RD-Agent into min…
Jul 2, 2025
49d9686
use rule-based logic for router
Jul 2, 2025
43255d6
complete the prompt
Jul 2, 2025
1995f6a
Merge branch 'main' of https://github.com/microsoft/RD-Agent into min…
Jul 3, 2025
8944273
fix circular import bug
Jul 3, 2025
81d284a
fix bug
Jul 3, 2025
a18e454
make refine_decision optional
Jul 3, 2025
408e7ab
update pipeline prompts: (1) add scenary: in an iterative cooding loo…
Hoder-zyf Jul 3, 2025
beb3bf8
fix a small bug
peteryang1 Jul 3, 2025
93a3acd
fix a small bug
peteryang1 Jul 4, 2025
3a15f5c
Merge branch 'main' into minrui/fix_hyperparameter_problems
peteryangms Jul 4, 2025
6d9607a
rdagent/scenarios/data_science/loop.py back to the original version
Hoder-zyf Jul 4, 2025
8312380
refactor: replace _get_exp_gen with default_exp_gen for exp generation
you-n-g Jul 4, 2025
ed984eb
import
you-n-g Jul 4, 2025
ceb6335
refactor: make the __init__ back to main
Hoder-zyf Jul 4, 2025
833be8f
fix small bugs
Hoder-zyf Jul 4, 2025
2e6d190
fix bugs for proposal_version
Hoder-zyf Jul 4, 2025
71e68c6
move refine into runner
peteryangms Jul 4, 2025
2b8a2ed
Merge branch 'xuyang1/help_minrui_hyppp' into minrui/fix_hyperparamet…
peteryangms Jul 4, 2025
e56ebfd
check early stop
peteryangms Jul 4, 2025
7caad02
Merge branch 'main' into minrui/fix_hyperparameter_problems
peteryangms Jul 5, 2025
eb9ec5d
EDA improvement & coder classes number
peteryangms Jul 7, 2025
2ebcc35
fix CI
peteryangms Jul 8, 2025
65deb7d
slightly refine the prompt
Jul 8, 2025
1edf3a9
remove rule_base_eval and remove useless prompt
peteryangms Jul 8, 2025
4 changes: 2 additions & 2 deletions rdagent/app/data_science/conf.py
@@ -38,8 +38,8 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
spec_enabled: bool = True

#### proposal related
- proposal_version: str = "v1"
- coder_on_whole_pipeline: bool = False
+ proposal_version: str = "v2"
+ coder_on_whole_pipeline: bool = True
max_trace_hist: int = 3

coder_max_loop: int = 10
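The two flipped defaults above steer the rest of this PR: `coder_on_whole_pipeline` gates the new refine branch, and `proposal_version` selects the proposal generator. A minimal illustrative sketch (the branch bodies are placeholders, not code from this PR):

```python
# Illustrative only: how the flipped defaults are consumed elsewhere in this PR
# (see DSExpGen.gen() in exp_gen/__init__.py below). The branch bodies are placeholders.
from rdagent.app.data_science.conf import DS_RD_SETTING

if DS_RD_SETTING.coder_on_whole_pipeline:   # now True by default
    ...  # whole-pipeline coding, which also enables the rule-based refine branch
if DS_RD_SETTING.proposal_version == "v2":  # now "v2" by default
    ...  # route experiment generation to DSProposalV2ExpGen instead of V1
```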
1 change: 0 additions & 1 deletion rdagent/components/coder/data_science/pipeline/__init__.py
@@ -95,7 +95,6 @@ def implement_one_task(
queried_former_failed_knowledge=queried_former_failed_knowledge[0],
out_spec=PythonAgentOut.get_spec(),
runtime_environment=runtime_environment,
- spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
enable_model_dump=DS_RD_SETTING.enable_model_dump,
)
user_prompt = T(".prompts:pipeline_coder.user").r(
12 changes: 10 additions & 2 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -9,8 +9,13 @@ pipeline_coder:
## The runtime environment your code will run on
{{ runtime_environment }}

## Hyperparameters Specification
Follow the hyperparameter choices if they are specified in the task description, unless they are unreasonable or incorrect.
If they are unreasonable or incorrect, refer to the guidelines below for appropriate adjustments:
{% include "scenarios.data_science.share:spec.hyperparameter" %}

## Specification your code should follow
- {{ spec }}
+ {% include "scenarios.data_science.share:component_spec.Pipeline" %}

{% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %}
## Relevant Information for This Task
@@ -118,7 +123,9 @@ pipeline_eval:
Step 1: Executes successfully without any errors. Please distinguish between errors and warnings.

Step 2: Correctly generates a final submission in the correct format, ensuring it aligns with the submission structure, the index and column names match the sample, and the entries are not empty or obviously incorrect.

- Carefully verify that the submission file and any reported scores are the result of actual model inference. Random predictions, fabricated results, or hard-coded outputs constitute cheating and must result in evaluation failure.
- Sampling a subset of the training data for efficiency (e.g., randomly selecting a portion of the data) is discouraged unless it demonstrably improves performance (e.g., removing irrelevant or outlier samples).

Step 3: Aligns with the competition requirements. This includes:
- CAREFULLY ANALYZE WHETHER THE EXPERIMENTAL SETUP AND CODE MAY CAUSE MISALIGNMENT BETWEEN VALIDATION AND TEST PERFORMANCE.
- Confirm strict adherence to the competition's evaluation rules listed in `scenario`:
@@ -137,6 +144,7 @@ pipeline_eval:
[Note]
1. Model performance is NOT a concern in this evaluation—only correct execution and formatting matter.
2. You only check the format of the submission since we only feed you part of the data, so the submission might have a different index from the sample submission data.
3. Submissions and scores must be the result of actual model inference. Any form of cheating or fabrication (e.g., random or hard-coded outputs) is strictly prohibited and should lead to rejection.

Please respond with your feedback in the following JSON format and order
```json
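To make Step 2 of the `pipeline_eval` prompt above concrete, here is a minimal, hypothetical sketch of the kind of structural check it asks for; the file names and the helper are illustrative and not part of this PR.

```python
# Hypothetical sketch of the structural check described in Step 2 above.
# File names and the helper name are assumptions for illustration only.
import pandas as pd

def check_submission_format(submission_path: str, sample_path: str) -> list[str]:
    """Return a list of format problems; an empty list means the format looks valid."""
    issues: list[str] = []
    sub = pd.read_csv(submission_path)
    sample = pd.read_csv(sample_path)

    if list(sub.columns) != list(sample.columns):
        issues.append(f"column mismatch: {list(sub.columns)} vs {list(sample.columns)}")
    if sub.empty:
        issues.append("submission is empty")
    if sub.iloc[:, 1:].isna().any().any():
        issues.append("submission contains empty prediction values")
    # Per note 2 above, the index may legitimately differ from the sample because the
    # evaluator only sees part of the data, so only the structure is checked here.
    return issues
```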
7 changes: 6 additions & 1 deletion rdagent/core/proposal.py
@@ -57,9 +57,11 @@ def __init__(
*,
code_change_summary: str | None = None,
decision: bool,
refine_decision: bool = False,
exception: Exception | None = None,
) -> None:
self.decision = decision
self.refine_decision = refine_decision
self.reason = reason
# A non-None exception means we failed to generate a runnable experiment due to that exception.
# Runnable results are not always good.
@@ -95,8 +97,11 @@ def __init__(
*,
code_change_summary: str | None = None,
decision: bool,
refine_decision: bool = False,
) -> None:
- super().__init__(reason, decision=decision, code_change_summary=code_change_summary)
+ super().__init__(
+     reason, decision=decision, refine_decision=refine_decision, code_change_summary=code_change_summary
+ )
self.observations = observations
self.hypothesis_evaluation = hypothesis_evaluation
self.new_hypothesis = new_hypothesis
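A minimal usage sketch of the new flag (not part of the diff; the parameter names are taken from the constructor bodies above, and keyword arguments are used because the full positional order is not shown in the hunk):

```python
# Sketch only: how the new refine_decision flag travels with a feedback object.
from rdagent.core.proposal import HypothesisFeedback

feedback = HypothesisFeedback(
    observations="Validation AUC improved from 0.81 to 0.83.",
    hypothesis_evaluation="Hypothesis confirmed on the validation split.",
    new_hypothesis="Tune the learning rate and the number of boosting rounds.",
    reason="[Experiment Analysis] The new pipeline beats SOTA on the official metric.",
    decision=True,          # replace the best result
    refine_decision=True,   # but efficiency/hyperparameters still look improvable
)
assert feedback.decision and feedback.refine_decision
```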
1 change: 1 addition & 0 deletions rdagent/scenarios/data_science/dev/feedback.py
@@ -123,6 +123,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
resp_dict, "Code Change Summary", "No code change summary provided"
),
decision=convert2bool(dict_get_with_warning(resp_dict, "Replace Best Result", "no")),
refine_decision=convert2bool(dict_get_with_warning(resp_dict, "Refine Decision", "no")),
)

if hypothesis_feedback and DS_RD_SETTING.enable_knowledge_base:
20 changes: 15 additions & 5 deletions rdagent/scenarios/data_science/dev/prompts.yaml
@@ -5,9 +5,9 @@ exp_feedback:
Below is a detailed description of the current Kaggle competition scenario:
{{ scenario }}

- Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with previous experiments and the best previous result (SOTA).
+ Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them step by step with the previous best (SOTA) result.

- Step-by-step Analysis Process:
+ # Step-by-step Analysis Process:

Step 1: Verify Submission Format
- If the submission format check fails:
@@ -57,9 +57,18 @@ exp_feedback:
- Please examine the code carefully based on the above criteria and provide a detailed analysis of the code.
- Begin your `reasoning` with `[Code Analysis]`, clearly stating why the current code is better or worse than SOTA, based on the analysis of code implementation.
- If the current code is not better than SOTA, set `"Replace Best Result": "no"`. Otherwise, set `"Replace Best Result": "yes"`.

- Provide detailed and constructive feedback structured as follows:
- Example JSON Structure for Result Analysis:

Step 5: Analyze Code Effectiveness
- Goal: After confirming that the current code replaces the SOTA, determine whether further refinement is needed in terms of efficiency and hyperparameter optimization. Assess whether the current implementation can be improved for better resource usage, faster runtime, or better hyperparameter settings.
- If the current code is effective in terms of both efficiency and hyperparameters:
- Set `"Replace Best Result": "yes"`.
- Set `"Refine Decision": "no"`.
- If the current code is NOT effective in terms of efficiency or hyperparameters, and further refinement is necessary:
- Set `"Replace Best Result": "yes"`.
- Set `"Refine Decision": "yes"`.
- Begin your `reasoning` with `[Effectiveness Analysis]`, clearly stating whether the current code is effective enough or needs further refinement, especially regarding hyperparameter choices and efficiency.

Provide detailed and constructive feedback structured as follows, and output nothing else:
{
"Submission Format Check": "yes or no",
"First Valid Submission": "yes or no",
@@ -68,6 +77,7 @@
"Feedback for Hypothesis": Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
"Evaluation Aligned With Task": "yes or no",
"Replace Best Result": "yes or no",
"Refine Decision": "yes or no",
"Reasoning": "Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences."
}

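For illustration, a response that follows the Step 5 "replace but refine" branch of the schema above might look like this (all values are invented):

```python
# Invented example of an LLM response matching the feedback schema above,
# taking the Step 5 branch that accepts the result but requests refinement.
example_feedback = {
    "Submission Format Check": "yes",
    "First Valid Submission": "no",
    "Observations": "Validation F1 rose from 0.71 (SOTA) to 0.74 with the new feature set.",
    "Feedback for Hypothesis": "The hypothesis is confirmed: the added features improved validation F1.",
    "Evaluation Aligned With Task": "yes",
    "Replace Best Result": "yes",
    "Refine Decision": "yes",
    "Reasoning": "[Effectiveness Analysis] The new pipeline beats SOTA but trains with default hyperparameters near the time limit, so efficiency and tuning can still be refined.",
}
```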
5 changes: 2 additions & 3 deletions rdagent/scenarios/data_science/loop.py
@@ -30,7 +30,7 @@
from rdagent.scenarios.data_science.dev.feedback import DSExperiment2Feedback
from rdagent.scenarios.data_science.dev.runner import DSCoSTEERRunner
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
- from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace
+ from rdagent.scenarios.data_science.proposal.exp_gen import DSExpGen, DSTrace
from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSKnowledgeBase
from rdagent.utils.workflow.misc import wait_retry

@@ -113,8 +113,7 @@ def __init__(self, PROP_SETTING: BasePropSetting):

self.ckp_selector = import_class(PROP_SETTING.selector_name)()
self.sota_exp_selector = import_class(PROP_SETTING.sota_exp_selector_name)()

- self.exp_gen: ExpGen = self._get_exp_gen(PROP_SETTING.hypothesis_gen, scen)
+ self.exp_gen = DSExpGen(scen)

# coders
self.data_loader_coder = DataLoaderCoSTEER(scen)
51 changes: 49 additions & 2 deletions rdagent/scenarios/data_science/proposal/exp_gen/__init__.py
@@ -1,3 +1,50 @@
- from rdagent.scenarios.data_science.proposal.exp_gen.base import DSTrace
from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.core.proposal import ExpGen
from rdagent.core.scenario import Scenario
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend, md5_hash
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.proposal.exp_gen.base import DSHypothesis, DSTrace
from rdagent.scenarios.data_science.proposal.exp_gen.draft import DSDraftExpGen
from rdagent.scenarios.data_science.proposal.exp_gen.proposal import (
DSProposalV1ExpGen,
DSProposalV2ExpGen,
)
from rdagent.scenarios.data_science.proposal.exp_gen.refine import DSRefineExpGen
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.utils.agent.tpl import T

__all__ = ["DSTrace"]

class DSExpGen(ExpGen):
"""
Data Science Task Generator.
This is an experiment router generator.
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def gen(self, trace: DSTrace) -> DSExperiment:
pipeline = DS_RD_SETTING.coder_on_whole_pipeline
sota_exp = trace.sota_experiment()

# Draft
# TODO: draft here
if sota_exp is None:
pass

# Refine
# TODO: introduce LLMs to decide whether to refine. Current: rule-based.
if sota_exp is not None:
# TODO: I think the logic of the sota_experiment_fb() should be refined
sota_exp_fb = trace.sota_experiment_fb()[-1]
sota_exp_idx = trace.sota_experiment_idx()
last_exp_idx = -1
if pipeline and sota_exp_fb.refine_decision and (last_exp_idx - sota_exp_idx) <= 2:
return DSRefineExpGen(scen=self.scen).gen(trace=trace)

# Propose
if DS_RD_SETTING.proposal_version == "v1":
Contributor (review comment): Hope to remove proposal_version in this version.
return DSProposalV1ExpGen(scen=self.scen).gen(trace=trace)
if DS_RD_SETTING.proposal_version == "v2":
return DSProposalV2ExpGen(scen=self.scen).gen(trace=trace)
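A brief usage sketch of the router (hypothetical driver code; in this PR it is wired up in loop.py above as `self.exp_gen = DSExpGen(scen)`):

```python
# Hypothetical driver code for the router above. Routing summary:
#   - no SOTA yet                                -> falls through (the draft branch is still a TODO)
#   - whole-pipeline mode, SOTA feedback has
#     refine_decision and the SOTA is at most
#     two steps old                              -> DSRefineExpGen
#   - otherwise                                  -> DSProposalV1ExpGen / DSProposalV2ExpGen per setting
exp_gen = DSExpGen(scen)           # scen: a DataScienceScen built for the competition
experiment = exp_gen.gen(trace)    # trace: the DSTrace accumulated so far
```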
22 changes: 21 additions & 1 deletion rdagent/scenarios/data_science/proposal/exp_gen/base.py
@@ -208,6 +208,26 @@ def experiment_and_feedback_list_after_init(
else:
raise ValueError("Invalid return_type. Must be 'sota', 'failed', or 'all'.")

def sota_experiment_idx(
self,
search_type: Literal["all", "ancestors"] = "ancestors",
selection: tuple[int, ...] | None = None,
) -> int | None:
"""
Returns
-------
int or None
The index (in reversed order: -1, -2, ...) of the SOTA experiment in the trace,
or None if not found.
"""
search_list = self.retrieve_search_list(search_type, selection=selection)

if DS_RD_SETTING.coder_on_whole_pipeline or self.next_incomplete_component() is None:
for rev_idx, (exp, ef) in enumerate(search_list[::-1]):
if ef.decision:
return -(rev_idx + 1)
return None

def sota_experiment_fb(
self,
search_type: Literal["all", "ancestors"] = "ancestors",
@@ -226,7 +246,7 @@ def sota_experiment_fb(
# the SOTA exp should have an accepted decision and all required components completed.
if ef.decision:
return exp, ef
- return None
+ return None, None

def sota_experiment(
self,
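Illustration of the reversed-index convention returned by `sota_experiment_idx` and how `DSExpGen.gen()` above uses it for its recency check (the numbers are invented):

```python
# Invented example: a trace holding 5 experiments where the 4th (two from the end)
# is the current SOTA. sota_experiment_idx() counts from the end: -1, -2, ...
sota_exp_idx = -2   # what trace.sota_experiment_idx() would return in this situation
last_exp_idx = -1   # the most recent experiment is always index -1

# DSExpGen.gen() only routes to refinement when the SOTA is at most two steps old:
assert (last_exp_idx - sota_exp_idx) <= 2   # -1 - (-2) == 1, so refinement is allowed
```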
42 changes: 42 additions & 0 deletions rdagent/scenarios/data_science/proposal/exp_gen/prompt_refine.yaml
@@ -0,0 +1,42 @@
task_gen:
system: |-
{% include "scenarios.data_science.share:scen.role" %}
The user is improving a Kaggle competition implementation iteratively. Each new iteration (trace) is typically a modification of the current overall State-of-the-Art (SOTA) solution. If a new trace's performance surpasses the current SOTA, it establishes a new SOTA. Otherwise, it is considered a failed experiment.
You will be provided with:
1. A detailed competition scenario description.
2. A history of previous successful experiments and their associated feedbacks, ordered from oldest to newest; the latest SOTA experiment accumulates all the improvements from the previous successful experiments.
3. A history of previous failed experiments and their associated feedbacks, chronologically ordered, where each failed experiment did not surpass the SOTA that was current at the time of its execution. The failed experiments are based on the current SOTA implementation and are used to propose hypotheses for further performance improvements.
4. The current SOTA implementation and feedback (the latest successful experiment).
5. A proposed refinement hypothesis. Previous analysis demonstrated that the current SOTA implementation is not effective enough in terms of efficiency and hyperparameters.
Your goal is to generate a detailed, step-by-step **refinement plan** for the current SOTA implementation that effectively implements the `Proposed Hypothesis`.
# Task Design Specification
{% include "scenarios.data_science.share:component_spec.Pipeline" %}
# Task Design Guidelines
{% include "scenarios.data_science.share:guidelines.refine" %}
# Refinement Specification
## Hypothesis: {{ hypothesis.hypothesis }}
Contributor (review comment): I think we can discuss this.
Contributor (review comment): At last we think we should implement this in runner.
### Reason: {{ hypothesis.reason }}
## Hyperparameter Tuning:
{% include "scenarios.data_science.share:spec.hyperparameter" %}
# Output Format
Your final output should strictly adhere to the following JSON format, and include nothing else:
{{ task_output_format }}
user: |-
# Competition Scenario Description
{{ scenario_desc }}
# Data Folder Structure (All files are under {% include "scenarios.data_science.share:scen.input_path" %})
{{ data_folder_info }}
# Current SOTA Implementation & Feedback
{{ sota_exp_desc }}
# Previous Experiments & Feedback
{{ exp_and_feedback_list_desc }}
12 changes: 7 additions & 5 deletions rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -206,7 +206,7 @@ hypothesis_gen:
- *Good Example (Efficiency)*: "To resolve the 'timeout during training' challenge, reduce `NUM_EPOCHS` from 5 to 2 and `N_SPLITS` for cross-validation from 5 to 3 in the main training loop, aiming to complete execution within the 1-hour limit while minimizing impact on the F1-score."
- *Poor Example*: "Tune the model for better results."
- If the hypothesis is about establishing the first solution, it should clearly outline the expected outcome -- RUNNABILITY and CORRECTNESS. Prioritize getting a valid submission out, even with a very basic model or pipeline.
-  - *Good Example*: "Implement a simple RandomForest classifier with default parameters, using 5-fold cross-validation for model evaluation. This will lead to a decent baseline model that can run to completion and generate a valid submission file."
+  - *Good Example*: "Implement a simple RandomForest classifier with default parameters, using 3-fold cross-validation for model evaluation. This will lead to a decent baseline model that can run to completion and generate a valid submission file."
3. **Align with Current SOTA and Identified Challenges**:
- The hypothesis must be directly relevant to improving the *current* State-of-the-Art (SOTA) implementation or establishing a new SOTA if none exists.
- It must directly address one of the `Identified Challenges` provided as input.
@@ -280,7 +280,7 @@ task_gen:

Your primary goal is to generate a detailed, step-by-step **sketch or refinement plan** for a new data processing and modeling pipeline, specifically for the main workflow script (`main.py`), that effectively implements the `Proposed Hypothesis`. This sketch will guide a developer to write the code correctly.

- ### BACKGROUND CONTEXT: Pipeline Implementation Standards & Constraints ###
+ # BACKGROUND CONTEXT: Pipeline Implementation Standards & Constraints

The `main.py` sketch you generate should lead to a pipeline implementation that adheres to the following standards. These are guiding principles for the final *outcome* of your sketch:

@@ -309,15 +309,13 @@ task_gen:
- Prevent data leakage from test/validation sets into any training stage.
7. **Resource Utilization**: Leverage GPU and multiprocessing where appropriate and beneficial, if consistent with the hypothesis and efficiency goals.
8. **Metric Calculation and Storage (`scores.csv`)**:
-  - Calculate the official competition metric on a proper validation set (e.g., K-fold CV, typically 3-5 folds unless efficiency dictates fewer). Save results to `scores.csv`.
+  - Calculate the official competition metric on a proper validation set. Save results to `scores.csv`.
- The sketch must ensure this step is included. A successful run should always produce scores.
- `scores.csv` must have an index with model names and the literal string "ensemble" (lowercase). Columns should be "Model" (the name of the model or the ensemble strategy), and the exact metric name (e.g., "AUC").
- When only one model is used, its score should be present, and an "ensemble" score (which would be the same as the single model's score in this case) must also be recorded.
- Ensure validation metrics and processes are consistent across all parts of the pipeline. Avoid changes that would alter how validation metrics are calculated unless that is part of the hypothesis.
9. **Submission File (`submission.csv`)**: Generate `submission.csv` in the **exact format** required (column names, order, data types), as detailed by `sample_submission.csv` in the `Competition Scenario Description`. This is a critical step.

- ### END OF BACKGROUND CONTEXT ###

# Guidelines for Sketching the `main.py` Workflow

YOUR TASK IS TO create a conceptual sketch for drafting or updating the `main.py` workflow. This is a plan, not code.
@@ -354,6 +352,10 @@ task_gen:
- Confirm no `tqdm` or other progress bars are in the final script.
- Double-check that validation scores are saved correctly to `scores.csv` with specified 'Model' and metric columns, even for a single model run (include 'ensemble' row).

# Hyperparameters Specification
Choose hyperparameters to ensure strong performance while meeting resource and time constraints. Specify values only when clearly justified by evidence or strong rationale.
{% include "scenarios.data_science.share:spec.hyperparameter" %}

{% if task_output_format is not none %}
## [Partial Response Format 1] Task Output Format:
{{ task_output_format }}
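To make requirement 8 (`scores.csv`) in the hunk above concrete, one plausible reading of the spec is sketched below; the metric name and scores are invented:

```python
# Sketch of requirement 8 above: scores.csv with model names plus a literal "ensemble"
# entry in the index, a "Model" column, and the exact metric name as a column.
# Metric name and values are invented; this is one plausible reading of the spec.
import pandas as pd

scores = pd.DataFrame(
    {
        "Model": ["lightgbm", "ensemble"],
        "AUC": [0.842, 0.842],  # single-model run: the ensemble row repeats the model's score
    },
    index=["lightgbm", "ensemble"],
)
scores.to_csv("scores.csv")
```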
@@ -20,6 +20,7 @@
from rdagent.scenarios.data_science.proposal.exp_gen.base import DSHypothesis, DSTrace
from rdagent.scenarios.data_science.proposal.exp_gen.draft import DSDraftExpGen
from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSIdea
from rdagent.scenarios.data_science.proposal.exp_gen.refine import DSRefineExpGen
from rdagent.utils.agent.tpl import T
from rdagent.utils.repo.diff import generate_diff_from_dict
from rdagent.utils.workflow import wait_retry
@@ -806,7 +807,6 @@ def gen(
self,
trace: DSTrace,
) -> DSExperiment:

pipeline = DS_RD_SETTING.coder_on_whole_pipeline
if not pipeline and (draft_exp := draft_exp_in_decomposition(self.scen, trace)):
return draft_exp