Merged
Changes from 17 commits
Commits
43 commits
da36f43
init commit
Jun 27, 2025
979640a
remove the 5-fold spec from prompts
Jun 27, 2025
2c87022
refine the hyperparameter specification
Jun 27, 2025
ccdb471
do not sample data
Jun 27, 2025
84bf563
a small spelling issue
TPLin22 Jun 27, 2025
13be390
refine prompt to avoid submission cheating
TPLin22 Jun 27, 2025
4ca0411
do not sample data
Jun 27, 2025
c122816
simplify code
Jun 27, 2025
ffec796
refine the coder evaluator prompt
Jun 27, 2025
ffe70ca
refine wording
RolandMinrui Jun 27, 2025
b1f03f2
remove runtime from proposal
Jun 27, 2025
771e7e8
refine wording
Jun 27, 2025
55d8d03
refine prompt
Jun 27, 2025
3619c95
add gpu info in runtime_info.py
Jun 27, 2025
3f487fe
Merge branch 'main' of https://github.com/microsoft/RD-Agent into min…
Jun 30, 2025
6ec2080
modify the spec
Jun 30, 2025
7d27e09
add router and add refinement exp gen
Jul 1, 2025
b669365
fix prompt bug
Jul 2, 2025
bbb8bcf
Merge branch 'main' of https://github.com/microsoft/RD-Agent into min…
Jul 2, 2025
49d9686
use rule-based logic for router
Jul 2, 2025
43255d6
complete the prompt
Jul 2, 2025
1995f6a
Merge branch 'main' of https://github.com/microsoft/RD-Agent into min…
Jul 3, 2025
8944273
fix circular import bug
Jul 3, 2025
81d284a
fix bug
Jul 3, 2025
a18e454
make refine_decision optional
Jul 3, 2025
408e7ab
update pipeline prompts: (1) add scenario: in an iterative coding loo…
Hoder-zyf Jul 3, 2025
beb3bf8
fix a small bug
peteryang1 Jul 3, 2025
93a3acd
fix a small bug
peteryang1 Jul 4, 2025
3a15f5c
Merge branch 'main' into minrui/fix_hyperparameter_problems
peteryangms Jul 4, 2025
6d9607a
rdagent/scenarios/data_science/loop.py back to the original version
Hoder-zyf Jul 4, 2025
8312380
refactor: replace _get_exp_gen with default_exp_gen for exp generation
you-n-g Jul 4, 2025
ed984eb
import
you-n-g Jul 4, 2025
ceb6335
refactor: make the __init__ back to main
Hoder-zyf Jul 4, 2025
833be8f
fix small bugs
Hoder-zyf Jul 4, 2025
2e6d190
fix bugs for proposal_version
Hoder-zyf Jul 4, 2025
71e68c6
move refine into runner
peteryangms Jul 4, 2025
2b8a2ed
Merge branch 'xuyang1/help_minrui_hyppp' into minrui/fix_hyperparamet…
peteryangms Jul 4, 2025
e56ebfd
check early stop
peteryangms Jul 4, 2025
7caad02
Merge branch 'main' into minrui/fix_hyperparameter_problems
peteryangms Jul 5, 2025
eb9ec5d
EDA improvement & coder classes number
peteryangms Jul 7, 2025
2ebcc35
fix CI
peteryangms Jul 8, 2025
65deb7d
slightly refine the prompt
Jul 8, 2025
1edf3a9
remove rule_base_eval and remove useless prompt
peteryangms Jul 8, 2025
1 change: 0 additions & 1 deletion rdagent/components/coder/data_science/pipeline/__init__.py
@@ -95,7 +95,6 @@ def implement_one_task(
queried_former_failed_knowledge=queried_former_failed_knowledge[0],
out_spec=PythonAgentOut.get_spec(),
runtime_environment=runtime_environment,
spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
enable_model_dump=DS_RD_SETTING.enable_model_dump,
)
user_prompt = T(".prompts:pipeline_coder.user").r(
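For context on this deletion and the matching prompts.yaml change below: the Pipeline spec is no longer rendered in Python and passed to the template as a `spec` variable; the system-prompt template now pulls it in itself via `{% include %}`. A minimal, generic Jinja2 sketch of that pattern (the template names and `DictLoader` setup are illustrative stand-ins, not RD-Agent's actual `T()` helper or template paths):

```python
from jinja2 import DictLoader, Environment

templates = {
    # stand-in for "scenarios.data_science.share:component_spec.Pipeline"
    "component_spec_pipeline.md": "Write a single main.py that implements the whole pipeline.",
    # stand-in for the pipeline_coder system prompt: the spec is pulled in via include,
    # so the caller no longer has to render and pass a `spec` variable
    "pipeline_coder_system.md": (
        "## Specification your code should follow\n"
        "{% include 'component_spec_pipeline.md' %}"
    ),
}

env = Environment(loader=DictLoader(templates))
print(env.get_template("pipeline_coder_system.md").render())
```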
12 changes: 10 additions & 2 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -9,8 +9,13 @@ pipeline_coder:
## The runtime environment your code will run on
{{ runtime_environment }}

## Hyperparameters Specification
Follow the hyperparameter choices if they are specified in the task description, unless they are unreasonable or incorrect.
In that case, refer to the guidelines below for appropriate adjustments:
{% include "scenarios.data_science.share:spec.hyperparameter" %}

## Specification your code should follow
{{ spec }}
{% include "scenarios.data_science.share:component_spec.Pipeline" %}

{% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %}
## Relevant Information for This Task
@@ -118,7 +123,9 @@ pipeline_eval:
Step 1: Executes successfully without any errors. Please distinguish between errors and warnings.

Step 2: Correctly generates a final submission in the correct format, ensuring that it aligns with the required submission structure, that the index and column names match the sample, and that the entries are not empty or obviously incorrect.

- Carefully verify that the submission file and any reported scores are the result of actual model inference. Random predictions, fabricated results, or hard-coded outputs constitute cheating and must result in evaluation failure.
- Sampling a subset of the training data for efficiency (e.g., randomly selecting a portion of the data) is discouraged unless it demonstrably improves performance (e.g., removing irrelevant or outlier samples).

Step 3: Aligns with the competition requirements. This includes:
- CAREFULLY ANALYZE WHETHER THE EXPERIMENTAL SETUP AND CODE MAY CAUSE MISALIGNMENT BETWEEN VALIDATION AND TEST PERFORMANCE.
- Confirm strict adherence to the competition's evaluation rules listed in `scenario`:
@@ -137,6 +144,7 @@ pipeline_eval:
[Note]
1. Model performance is NOT a concern in this evaluation—only correct execution and formatting matter.
2. You only check the format of the submission since we only feed you part of the data, so the submission might have a different index from the sample submission data.
3. Submissions and scores must be the result of actual model inference. Any form of cheating or fabrication (e.g., random or hard-coded outputs) is strictly prohibited and should lead to rejection.

Please respond with your feedback in the following JSON format and order
```json
7 changes: 6 additions & 1 deletion rdagent/core/proposal.py
@@ -57,9 +57,11 @@ def __init__(
*,
code_change_summary: str | None = None,
decision: bool,
refine_decision: bool,
exception: Exception | None = None,
) -> None:
self.decision = decision
self.refine_decision = refine_decision
self.reason = reason
# Exception is not None means failing to generate runnable experiments due to exception.
Runnable results are not always good.
@@ -95,8 +97,11 @@ def __init__(
*,
code_change_summary: str | None = None,
decision: bool,
refine_decision: bool,
) -> None:
super().__init__(reason, decision=decision, code_change_summary=code_change_summary)
super().__init__(
reason, decision=decision, refine_decision=refine_decision, code_change_summary=code_change_summary
)
self.observations = observations
self.hypothesis_evaluation = hypothesis_evaluation
self.new_hypothesis = new_hypothesis
1 change: 1 addition & 0 deletions rdagent/scenarios/data_science/dev/feedback.py
@@ -123,6 +123,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
resp_dict, "Code Change Summary", "No code change summary provided"
),
decision=convert2bool(dict_get_with_warning(resp_dict, "Replace Best Result", "no")),
refine_decision=convert2bool(dict_get_with_warning(resp_dict, "Effectiveness Check", "no")),
)

if hypothesis_feedback and DS_RD_SETTING.enable_knowledge_base:
24 changes: 17 additions & 7 deletions rdagent/scenarios/data_science/dev/prompts.yaml
@@ -5,9 +5,9 @@ exp_feedback:
Below is a detailed description of the current Kaggle competition scenario:
{{ scenario }}

Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with previous experiments and the best previous result (SOTA).
Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with the previous best (SOTA) result, step by step.

Step-by-step Analysis Process:
# Step-by-step Analysis Process:

Step 1: Verify Submission Format
- If the submission format check fails:
@@ -19,7 +19,7 @@ exp_feedback:
- If this is the first valid submission ever, set `"Replace Best Result": "yes"`.
- Otherwise, proceed to Step 2.

Step 2: Evaluate Alignment with Competition Requirements (if format correct)
Step 2: Evaluate Alignment with Competition Requirements
- GOAL: CAREFULLY ANALYZE WHETHER THE EXPERIMENTAL SETUP AND CODE MAY CAUSE MISALIGNMENT BETWEEN VALIDATION AND TEST PERFORMANCE.
- Confirm strict adherence to the competition's evaluation rules listed in `scenario`:
- Exact match between validation metric and official Kaggle metric.
@@ -35,7 +35,7 @@ exp_feedback:
- Begin your `reasoning` with `[Evaluation error]`, explicitly stating the evaluation alignment issues causing experiment failure.
- If evaluation alignment passes, set `"Evaluation Aligned With Task": "yes"`, and then proceed to Step 3.

Step 3: Analyze Experimental Results (if format and evaluation alignment correct)
Step 3: Analyze Experimental Results
- Explicitly confirm or refute the hypothesis with precise data points or performance trends.
- Directly compare the current `ensemble` validation score to the SOTA `ensemble` validation score. Do not focus on individual models unless anomalies are significant.
- Based on the metric used in the competition, the comparison should fit into the following categories:
@@ -57,9 +57,18 @@ exp_feedback:
- Please examine the code carefully based on the above criteria and provide a detailed analysis of the code.
- Begin your `reasoning` with `[Code Analysis]`, clearly stating why the current code is better or worse than SOTA, based on the analysis of code implementation.
- If the current code is not better than SOTA, set `"Replace Best Result": "no"`. Otherwise, set `"Replace Best Result": "yes"`.

Provide detailed and constructive feedback structured as follows:
Example JSON Structure for Result Analysis:

Step 5: Analyze Code Effectiveness
- Goal: Determine whether the current code requires further refinement, primarily focusing on hyperparameter tuning. Assess if the current settings are appropriate by referring to running time, resource usage, and other relevant statistics.
- If the current code is effective in terms of both efficiency and hyperparameters:
- Set `"Replace Best Result": "yes"`.
- Set `"Effectiveness Check": "yes"`.
- If the current code is NOT effective in terms of both efficiency and hyperparameters:
- Set `"Replace Best Result": "yes"`.
- Set `"Effectiveness Check": "no"`.
- Begin your `reasoning` with `[Effectiveness Analysis]`, clearly stating whether the current code is effective enough or needs further refinement, especially regarding hyperparameter choices and efficiency.

Provide detailed and constructive feedback structured as follows, with nothing else:
{
"Submission Format Check": "yes or no",
"First Valid Submission": "yes or no",
@@ -68,6 +77,7 @@
"Feedback for Hypothesis": Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
"Evaluation Aligned With Task": "yes or no",
"Replace Best Result": "yes or no",
"Effectiveness Check": "yes or no",
"Reasoning": "Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences."
}

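The new `Effectiveness Check` field in the JSON schema above is what feeds the `refine_decision` flag parsed in `feedback.py` below. A minimal sketch of that mapping, assuming a simple yes/no-to-bool helper (the real `convert2bool` and `dict_get_with_warning` helpers live in RD-Agent and may behave differently):

```python
def convert2bool(value) -> bool:
    """Assumed behaviour: treat 'yes'-style strings as True, everything else as False."""
    if isinstance(value, str):
        return value.strip().lower() in {"yes", "true", "1"}
    return bool(value)

# Example LLM response following the feedback schema above (values are illustrative).
resp_dict = {
    "Replace Best Result": "yes",
    "Effectiveness Check": "no",
}

decision = convert2bool(resp_dict.get("Replace Best Result", "no"))
refine_decision = convert2bool(resp_dict.get("Effectiveness Check", "no"))
print(decision, refine_decision)  # True False
```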
5 changes: 2 additions & 3 deletions rdagent/scenarios/data_science/loop.py
@@ -30,7 +30,7 @@
from rdagent.scenarios.data_science.dev.feedback import DSExperiment2Feedback
from rdagent.scenarios.data_science.dev.runner import DSCoSTEERRunner
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace
from rdagent.scenarios.data_science.proposal.exp_gen import DSExpGen, DSTrace
from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSKnowledgeBase
from rdagent.utils.workflow.misc import wait_retry

@@ -113,8 +113,7 @@ def __init__(self, PROP_SETTING: BasePropSetting):

self.ckp_selector = import_class(PROP_SETTING.selector_name)()
self.sota_exp_selector = import_class(PROP_SETTING.sota_exp_selector_name)()

self.exp_gen: ExpGen = self._get_exp_gen(PROP_SETTING.hypothesis_gen, scen)
self.exp_gen = DSExpGen(scen)

# coders
self.data_loader_coder = DataLoaderCoSTEER(scen)
130 changes: 128 additions & 2 deletions rdagent/scenarios/data_science/proposal/exp_gen/__init__.py
@@ -1,3 +1,129 @@
from rdagent.scenarios.data_science.proposal.exp_gen.base import DSTrace
from typing import Any, Dict, List, Optional, Tuple

__all__ = ["DSTrace"]
from pydantic import BaseModel, Field

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
from rdagent.components.coder.data_science.feature.exp import FeatureTask
from rdagent.components.coder.data_science.model.exp import ModelTask
from rdagent.components.coder.data_science.pipeline.exp import PipelineTask
from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
from rdagent.components.coder.data_science.workflow.exp import WorkflowTask
from rdagent.core.proposal import ExpGen
from rdagent.core.scenario import Scenario
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend, md5_hash
from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
from rdagent.scenarios.data_science.proposal.exp_gen.base import DSHypothesis, DSTrace
from rdagent.scenarios.data_science.proposal.exp_gen.draft import DSDraftExpGen
from rdagent.scenarios.data_science.proposal.exp_gen.proposal import (
DSProposalV1ExpGen,
DSProposalV2ExpGen,
)
from rdagent.scenarios.data_science.proposal.exp_gen.refine import DSRefineExpGen
from rdagent.scenarios.data_science.scen import DataScienceScen
from rdagent.utils.agent.tpl import T

_COMPONENT_META: Dict[str, Dict[str, Any]] = {
"DataLoadSpec": {
"target_name": "Data loader and specification generation",
"spec_file": "spec/data_loader.md",
"output_format_key": ".prompts:output_format.data_loader",
"task_class": DataLoaderTask,
},
"FeatureEng": {
"target_name": "Feature engineering",
"spec_file": "spec/feature.md",
"output_format_key": ".prompts:output_format.feature",
"task_class": FeatureTask,
},
"Model": {
"target_name": "Model",
"spec_file": "spec/model.md",
"output_format_key": ".prompts:output_format.model",
"task_class": ModelTask,
},
"Ensemble": {
"target_name": "Ensemble",
"spec_file": "spec/ensemble.md",
"output_format_key": ".prompts:output_format.ensemble",
"task_class": EnsembleTask,
},
"Workflow": {
"target_name": "Workflow",
"spec_file": "spec/workflow.md",
"output_format_key": ".prompts:output_format.workflow",
"task_class": WorkflowTask,
},
"Pipeline": {
"target_name": "Pipeline",
"spec_file": None,
"output_format_key": ".prompts:output_format.pipeline",
"task_class": PipelineTask,
},
}


def get_component(name: str) -> Dict[str, Any]:
meta = _COMPONENT_META.get(name)
if meta is None:
raise KeyError(f"Unknown component: {name!r}")

return {
"target_name": meta["target_name"],
"spec_file": meta["spec_file"],
"task_output_format": T(meta["output_format_key"]).r(),
"task_class": meta["task_class"],
}


class CodingSketch(BaseModel):
current_state: str = Field(
description="A summary of the current `main.py` script that serves as the baseline for the planned changes. Focusing on parts that are related to the hypothesis. If `main.py` does not yet exist (i.e., it will be created from scratch based on this sketch), use the string 'N/A'."
)
modifications: List[str] = Field(
description="A list of specific, targeted changes to be applied to the existing code identified in `current_state`. Each string in the list should concisely describe (in 3-4 sentences): "
"(a) the specific part of the code to be altered (e.g., a function name, a class, or a logical block); "
"(b) the nature of the modification (e.g., bug fix, feature addition, refactoring of a small section, performance optimization, deletion); and "
"(c) a brief explanation or high-level sketch of the new logic or change. "
"If no direct modifications to existing code are planned (e.g., if creating an entirely new `main.py` as detailed in `structure`), this list should be empty."
)
structure: List[str] = Field(
description="An outline of the new high-level architectural components (primarily functions and classes) if a new `main.py` script is being created from scratch, or if the existing `main.py` is undergoing a major refactor that fundamentally alters or replaces its core structure. "
"Each string in the list should define a planned function or class, detailing its name, primary responsibility, key parameters (if applicable), return values (if applicable), and core functionality in 2-3 sentences. "
"This field is typically used when `current_state` is 'N/A' or when the scope of change requires a new architectural blueprint rather than just targeted `modifications`. "
"Leave empty if the plan only involves direct `modifications` to the existing structure in `current_state`."
)
sketch: str = Field(
description="A detailed, step-by-step narrative that elaborates on how to implement the planned code. "
"This section should synthesize the information from `modifications` (if any) and/or `structure` (if any) into a comprehensive and actionable coding plan for `main.py`. "
"The content **must** be formatted using Markdown, with logical sections, key decision points, or implementation steps clearly organized by level-3 headings (i.e., `###`). "
"This field should provide sufficient detail for a developer to understand the implementation flow, algorithms, data handling, and key logic points without ambiguity."
)
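# Illustrative sketch (not part of this diff): a filled-in CodingSketch for the common
# case where an existing main.py is modified rather than rebuilt from scratch.
#
# CodingSketch(
#     current_state="main.py trains a single LightGBM model with hard-coded hyperparameters.",
#     modifications=[
#         "In train_model(), read learning_rate and num_leaves from a module-level CONFIG "
#         "dict instead of hard-coding them, so they can be tuned without touching the loop.",
#     ],
#     structure=[],
#     sketch="### Hyperparameter handling\nIntroduce a CONFIG dict and thread it through train_model().",
# )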


class DSExpGen(ExpGen):
"""
Data Science Task Generator.
This is an experiment router generator.
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def gen(self, trace: DSTrace) -> DSExperiment:
pipeline = DS_RD_SETTING.coder_on_whole_pipeline

# Draft
# TODO: draft here

# Refine
last_exp_fb = trace.last_exp_fb()
if last_exp_fb.decision and last_exp_fb.refine_decision and pipeline:
return DSRefineExpGen(scen=self.scen).gen(trace=trace)

# Propose
if DS_RD_SETTING.proposal_version == "v1":
Contributor (review comment): Hope to remove proposal_version in this version.

return DSProposalV1ExpGen(scen=self.scen).gen(trace=trace)
if DS_RD_SETTING.proposal_version == "v2":
return DSProposalV2ExpGen(scen=self.scen).gen(trace=trace)
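A standalone sketch of the routing rule that `DSExpGen.gen` implements above, using simplified stand-ins for the feedback and settings objects (the real `DSTrace.last_exp_fb()` return type and `DS_RD_SETTING` flags differ):

```python
from dataclasses import dataclass

@dataclass
class FakeFeedback:
    decision: bool         # experiment was accepted as the new best result
    refine_decision: bool  # parsed from the "Effectiveness Check" feedback field

def route(last_fb: FakeFeedback, pipeline: bool, proposal_version: str) -> str:
    # Mirrors DSExpGen.gen: try refinement first, otherwise fall back to proposal generation.
    if last_fb.decision and last_fb.refine_decision and pipeline:
        return "DSRefineExpGen"
    if proposal_version == "v1":
        return "DSProposalV1ExpGen"
    return "DSProposalV2ExpGen"

print(route(FakeFeedback(decision=True, refine_decision=True), pipeline=True, proposal_version="v2"))
# -> DSRefineExpGen
```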
4 changes: 1 addition & 3 deletions rdagent/scenarios/data_science/proposal/exp_gen/idea_pool.py
@@ -7,9 +7,7 @@
from rdagent.components.knowledge_management.graph import (
UndirectedNode, # TODO: add appendix attribute to node
)
from rdagent.components.knowledge_management.graph import (
UndirectedGraph,
)
from rdagent.components.knowledge_management.graph import UndirectedGraph
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.utils.agent.tpl import T