Merged

43 commits
da36f43  init commit (Jun 27, 2025)
979640a  remove the 5-fold spec from prompts (Jun 27, 2025)
2c87022  refine the hyperparameter specification (Jun 27, 2025)
ccdb471  do not sample data (Jun 27, 2025)
84bf563  a small spelling issue (TPLin22, Jun 27, 2025)
13be390  refine prompt to avoid submission cheating (TPLin22, Jun 27, 2025)
4ca0411  do not sample data (Jun 27, 2025)
c122816  simplify code (Jun 27, 2025)
ffec796  refine the coder evaluator prompt (Jun 27, 2025)
ffe70ca  refine wording (RolandMinrui, Jun 27, 2025)
b1f03f2  remove runtime from proposal (Jun 27, 2025)
771e7e8  refine wording (Jun 27, 2025)
55d8d03  refine prompt (Jun 27, 2025)
3619c95  add gpu info in runtime_info.py (Jun 27, 2025)
3f487fe  Merge branch 'main' of https://github.com/microsoft/RD-Agent into min… (Jun 30, 2025)
6ec2080  modify the spec (Jun 30, 2025)
7d27e09  add router and add refinement exp gen (Jul 1, 2025)
b669365  fix prompt bug (Jul 2, 2025)
bbb8bcf  Merge branch 'main' of https://github.com/microsoft/RD-Agent into min… (Jul 2, 2025)
49d9686  use rule-based logic for router (Jul 2, 2025)
43255d6  complete the prompt (Jul 2, 2025)
1995f6a  Merge branch 'main' of https://github.com/microsoft/RD-Agent into min… (Jul 3, 2025)
8944273  fix circular import bug (Jul 3, 2025)
81d284a  fix bug (Jul 3, 2025)
a18e454  make refine_decision optional (Jul 3, 2025)
408e7ab  update pipeline prompts: (1) add scenary: in an iterative cooding loo… (Hoder-zyf, Jul 3, 2025)
beb3bf8  fix a small bug (peteryang1, Jul 3, 2025)
93a3acd  fix a small bug (peteryang1, Jul 4, 2025)
3a15f5c  Merge branch 'main' into minrui/fix_hyperparameter_problems (peteryangms, Jul 4, 2025)
6d9607a  rdagent/scenarios/data_science/loop.py back to the original version (Hoder-zyf, Jul 4, 2025)
8312380  refactor: replace _get_exp_gen with default_exp_gen for exp generation (you-n-g, Jul 4, 2025)
ed984eb  import (you-n-g, Jul 4, 2025)
ceb6335  refactor: make the __init__ back to main (Hoder-zyf, Jul 4, 2025)
833be8f  fix small bugs (Hoder-zyf, Jul 4, 2025)
2e6d190  fix bugs for proposal_version (Hoder-zyf, Jul 4, 2025)
71e68c6  move refine into runner (peteryangms, Jul 4, 2025)
2b8a2ed  Merge branch 'xuyang1/help_minrui_hyppp' into minrui/fix_hyperparamet… (peteryangms, Jul 4, 2025)
e56ebfd  check early stop (peteryangms, Jul 4, 2025)
7caad02  Merge branch 'main' into minrui/fix_hyperparameter_problems (peteryangms, Jul 5, 2025)
eb9ec5d  EDA improvement & coder classes number (peteryangms, Jul 7, 2025)
2ebcc35  fix CI (peteryangms, Jul 8, 2025)
65deb7d  slightly refine the prompt (Jul 8, 2025)
1edf3a9  remove rule_base_eval and remove useless prompt (peteryangms, Jul 8, 2025)
4 changes: 2 additions & 2 deletions rdagent/app/data_science/conf.py
@@ -38,8 +38,8 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     spec_enabled: bool = True
 
     #### proposal related
-    proposal_version: str = "v1"
-    coder_on_whole_pipeline: bool = False
+    proposal_version: str = "v2"
+    coder_on_whole_pipeline: bool = True
     max_trace_hist: int = 3
 
     coder_max_loop: int = 10
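Reviewer note: the two defaults above switch the data-science loop onto the v2 proposal flow with the whole-pipeline coder. Below is a minimal sketch of how downstream code reads these values; the environment-variable override names are assumptions based on the usual pydantic-settings pattern, not verified against this PR.

```python
# Sketch only: assumes DS_RD_SETTING is the module-level instance of
# DataScienceBasePropSetting, as the other diffs in this PR suggest.
from rdagent.app.data_science.conf import DS_RD_SETTING

# After this PR, the v2 proposal flow and the whole-pipeline coder are the defaults.
print(DS_RD_SETTING.proposal_version)         # "v2"
print(DS_RD_SETTING.coder_on_whole_pipeline)  # True

# If the settings class follows the usual pydantic-settings pattern, the old
# behaviour should be recoverable via environment variables (names assumed):
#   DS_PROPOSAL_VERSION=v1 DS_CODER_ON_WHOLE_PIPELINE=false
```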
1 change: 0 additions & 1 deletion rdagent/components/coder/data_science/pipeline/__init__.py
@@ -95,7 +95,6 @@ def implement_one_task(
             queried_former_failed_knowledge=queried_former_failed_knowledge[0],
             out_spec=PythonAgentOut.get_spec(),
             runtime_environment=runtime_environment,
-            spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
             enable_model_dump=DS_RD_SETTING.enable_model_dump,
         )
         user_prompt = T(".prompts:pipeline_coder.user").r(
74 changes: 63 additions & 11 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -2,15 +2,22 @@ pipeline_coder:
   system: |-
     You are a world-class data scientist and machine learning engineer with deep expertise in statistics, mathematics, and computer science.
     Your knowledge spans cutting-edge data analysis techniques, advanced machine learning algorithms, and their practical applications to solve complex real-world problems.
 
+    **Important Context**: You are working on sample datasets, and your code will go through automated iterations. Design your code to be iteration-friendly, with comprehensive print statements and clear debugging information to facilitate the automatic improvement process.
+
     ## Task Description
     {{ task_desc }}
 
     ## The runtime environment your code will run on
     {{ runtime_environment }}
 
+    ## Hyperparameters Specification
+    Follow the hyperparameter choices if they are specified in the task description, unless they are unreasonable or incorrect.
+    In that case, refer to the guidelines below for appropriate adjustments:
+    {% include "scenarios.data_science.share:spec.hyperparameter" %}
+
     ## Specification your code should follow
-    {{ spec }}
+    {% include "scenarios.data_science.share:component_spec.Pipeline" %}
 
     {% if queried_similar_successful_knowledge|length != 0 or queried_former_failed_knowledge|length != 0 %}
     ## Relevant Information for This Task
@@ -89,18 +96,44 @@ pipeline_coder:
     {% if latest_code_feedback is not none %}
     --------- Feedback to former code ---------
     {{ latest_code_feedback }}
 
+    **Improvement Planning**: Before modifying the code, first analyze the feedback and identify at most 3 key areas that need modification. Plan your changes strategically:
+    1. Prioritize the most critical issues that affect code execution or correctness
+    2. Focus on improvements that will have the highest impact
+    3. Ensure changes don't break existing working components
+
     The former code contains errors. You should correct the code based on the provided information, ensuring you do not repeat the same mistakes.
     Keep the parts that already seem correct intact; avoid modifying them, so as not to introduce new errors.
     {% else %}
+    **Improvement Planning**: Before enhancing the code, first analyze what can be improved and identify at most 3 key enhancement areas. Plan your improvements strategically:
+    1. Focus on performance, robustness, or feature-engineering improvements
+    2. Enhance code clarity and debugging capabilities
+    3. Optimize the model configuration or validation strategy
+
     The former code is correct. You should try to improve the code based on the provided task while not changing the irrelevant parts.
     {% endif %}
     {% endif %}
 
+    ## Code Generation Best Practices
+    1. **Avoid Hard-coding**: Avoid hard-coded values (e.g., a fixed dataset size). Use proportions for data splitting instead of absolute numbers.
+    2. **Data Loading Exception Handling**: Use try-except blocks ONLY for data loading operations when working with sample data. If you find zero data records, this indicates a data loading error, not data absence.
+    3. **Minimize Exception Handling**: Avoid using try-except blocks outside of data loading, as they may mask underlying code issues and hinder debugging.
+    4. **Limit Assertions**: Minimize the use of assert statements, as they may prevent discovery of code problems during evaluation.
+    5. **Strategic Print Statements**: Add appropriate print statements at key steps to facilitate automated code iteration and debugging. Use the print() function instead of the logging module for output.
+    6. **Training Configuration**: For model training, use reasonable epoch numbers (for example, 10 epochs). ALWAYS implement early stopping with proper conditions: sufficient epochs completed, loss reaching a sufficiently low value, and no improvement for a patience period. Save best-model checkpoints based on validation performance.
+    7. **Submission Generation**: ALWAYS use the best saved model (not necessarily the final epoch) for predictions. NEVER create dummy/placeholder submissions (e.g., all 1s, random values). If training fails, report the failure honestly rather than generating fake submission files.
+    8. **Resource Management**: Use all available data without sampling or subsetting due to resource limitations. If resources are insufficient, report the issue honestly rather than compromising data integrity.
+    9. **Robust Data Handling**: Code should gracefully handle varying data sizes and structures without breaking on edge cases.
+    10. **Clear Error Messages**: When errors occur, ensure they provide meaningful information for debugging rather than generic messages.
+    11. **Don't use tqdm**: Don't use tqdm to show the progress of the training process.
+
     You should strictly follow the code specification provided above when implementing the function.
 
 pipeline_eval:
   system: |-
     You are a data scientist responsible for evaluating code generation.
 
+    **Important Context**: The evaluation is performed on sample datasets, and the code is designed for automated iterations. Pay special attention to whether the code includes sufficient debugging information (print statements, clear error messages) to facilitate the automatic improvement process.
+
     ## Task Description
     The user is trying to build code in the following scenario:
@@ -115,14 +148,19 @@
     {% if is_sub_enabled %}
     ## Evaluation Scope
     Your focus is to check whether the workflow code:
-    Step 1: Executes successfully without any errors. Please distinguish between the errors and warnings.
 
-    Step 2: Correctly generates a final submission in the correct format, ensuring: they align with the submission structure, the index names and column names should match the sample, and the items should not be empty or apparently incorrect.
+    ### Step 1: Executes successfully without any errors. Please distinguish between errors and warnings.
+
+    ### Step 2: Correctly generates a final submission in the correct format, ensuring:
+    - It aligns with the submission structure
+    - The index names and column names match the sample
+    - The items are not empty or apparently incorrect
+    - **CRITICALLY: Deep-dive into the code and stdout to verify that the generated file is genuinely produced by successful execution, NOT created as a result of exception handling, fallback mechanisms, or error-recovery processes. Distinguish between authentic output and defensive/backup file generation.**
 
-    Step 3: Aligns with the competition requirements. This includes:
+    ### Step 3: Aligns with the competition requirements. This includes:
     - CAREFULLY ANALYZE WHETHER THE EXPERIMENTAL SETUP AND CODE MAY CAUSE MISALIGNMENT BETWEEN VALIDATION AND TEST PERFORMANCE.
     - Confirm strict adherence to the competition's evaluation rules listed in `scenario`:
-      - Exact match between the implementation code of metric and the requirements of the scenario. The metric number is not the focus.
+      - Exact match between the metric implementation code and the requirements of the scenario. **The metric number is not the focus.**
     - Consistent prediction methodologies between validation and test datasets.
     - No shortcuts or fold-specific strategies applied inconsistently.
     - Rigorous checks for corner-case consistency.
@@ -134,16 +172,30 @@
     ## Evaluation Criteria
     You will be given the execution output (`stdout`) to determine correctness.
 
-    [Note]
+    ### Notes
     1. Model performance is NOT a concern in this evaluation; only correct execution and formatting matter.
     2. You only check the format of the submission, since we only feed you part of the data, so the submission might have a different index than the sample submission data.
     3. Submissions and scores must be the result of actual model inference. Any form of cheating or fabrication (e.g., random or hard-coded outputs) is strictly prohibited and should lead to rejection.
 
+    ### Evaluation Guidelines
+    1. **EDA Requirement**: EDA is mandatory and must be included in the generated code. You do NOT need to evaluate the EDA content itself.
+    2. **Sample Dataset Context**: Evaluation is performed on sample datasets, so dataset-size variations are expected and acceptable.
+    3. **Hard-coding Check**: Verify the code avoids hard-coded values and uses **proportions** instead of absolute numbers for data splitting.
+    4. **Exception Handling Review**: Check that try-except blocks are used appropriately (only for data loading) and minimized elsewhere, to avoid masking code issues.
+    5. **Assertion Usage**: Ensure assert statements are used sparingly, so they do not prevent discovery of code problems.
+    6. **Debug-Friendly Code**: Verify there are appropriate print statements at key steps for debugging and iteration. Check that the print() function is used instead of the logging module.
+    7. **Training Configuration**: Verify epoch numbers are reasonable (at least 10), early stopping is properly implemented with appropriate conditions, and model checkpointing is used.
+    8. **Submission Quality**: Ensure submissions are generated from the best saved model, not dummy/placeholder values. Verify honest failure reporting if training issues occur.
+    9. **Resource Management**: Check that all available data is used appropriately, without unnecessary sampling or subsetting.
+    10. **Robustness**: Ensure the code handles varying data sizes and structures gracefully without breaking on edge cases.
+    11. **Error Clarity**: Verify that error messages provide meaningful debugging information rather than generic messages.
+
     Please respond with your feedback in the following JSON format and order
     ```json
     {
-      "execution": "Describe whether the code executed successfully, correctly integrating all components and generating the final submission. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information.",
-      "return_checking": "Verify the generated files, particularly the submission file. Ensure that its format matches the sample submission, checking the index, column names, and CSV content.",
-      "code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide feedback on code quality, readability, adherence to the given specifications, and alignment with competition requirements.",
+      "execution": "Describe whether the code executed successfully, correctly integrating all components and generating the final submission. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, (2) implementation details that can be easily fixed, or (3) environment/dependency problems?",
+      "return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches the sample submission (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
+      "code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and competition requirements? (2) **Effective Components**: What specific parts work well, and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
       "final_decision": <true/false>
     }
     ```
@@ -159,9 +211,9 @@ pipeline_eval:
     Please respond with your feedback in the following JSON format and order
     ```json
     {
-      "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information.",
+      "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, (2) implementation details that can be easily fixed, or (3) environment/dependency problems?",
       "return_checking": "Describe the expected file to be generated.",
-      "code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
+      "code": "Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and requirements? (2) **Effective Components**: What specific parts work well, and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
       "final_decision": <true/false>
     }
     ```
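Reviewer note: best practices 6 and 7 for the coder and guideline 7 for the evaluator all hinge on the same training pattern, so here is a minimal, framework-agnostic sketch of what a compliant loop might look like. The `train_one_epoch` and `validate` callables are placeholders, not part of this PR.

```python
# A minimal sketch (not from the PR) of the pattern the prompts ask for:
# early stopping with a patience window, checkpointing the best model,
# and predicting with the best model rather than the last epoch's.
import copy

def train_with_early_stopping(model, train_one_epoch, validate,
                              max_epochs=10, patience=3, min_delta=1e-4):
    best_loss, best_model, bad_epochs = float("inf"), None, 0
    for epoch in range(max_epochs):
        train_loss = train_one_epoch(model)
        val_loss = validate(model)
        # print() rather than logging, per best practice 5
        print(f"epoch={epoch} train_loss={train_loss:.4f} val_loss={val_loss:.4f}")
        if val_loss < best_loss - min_delta:
            # improvement: checkpoint the best model seen so far
            best_loss, best_model, bad_epochs = val_loss, copy.deepcopy(model), 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:  # no improvement for `patience` epochs
                print(f"early stopping at epoch {epoch}")
                break
    # return the checkpointed best model so predictions never come from a worse final epoch
    return best_model if best_model is not None else model
```

Returning the checkpointed copy, rather than the live model, is what makes "use the best saved model for predictions" (best practice 7) hold even when training ends on a bad epoch.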
7 changes: 6 additions & 1 deletion rdagent/core/proposal.py
@@ -57,9 +57,11 @@ def __init__(
         *,
         code_change_summary: str | None = None,
         decision: bool,
+        refine_decision: bool = False,
         exception: Exception | None = None,
     ) -> None:
         self.decision = decision
+        self.refine_decision = refine_decision
         self.reason = reason
         # Exception not being None means we failed to generate a runnable experiment due to an exception.
         # Runnable results are not always good.
@@ -95,8 +97,11 @@ def __init__(
         *,
         code_change_summary: str | None = None,
         decision: bool,
+        refine_decision: bool = False,
     ) -> None:
-        super().__init__(reason, decision=decision, code_change_summary=code_change_summary)
+        super().__init__(
+            reason, decision=decision, refine_decision=refine_decision, code_change_summary=code_change_summary
+        )
         self.observations = observations
         self.hypothesis_evaluation = hypothesis_evaluation
         self.new_hypothesis = new_hypothesis
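Reviewer note: since the new flag defaults to `False`, existing call sites stay valid. A hypothetical construction showing how the flag travels with the feedback object; the argument names are inferred from the attributes set in the diff above and may not match the real signature exactly, and the string values are invented for illustration.

```python
# Sketch only: HypothesisFeedback lives in rdagent/core/proposal.py per this diff,
# but the positional/keyword layout below is an assumption.
from rdagent.core.proposal import HypothesisFeedback

fb = HypothesisFeedback(
    observations="Validation RMSE improved from 0.412 to 0.398.",
    hypothesis_evaluation="Hypothesis confirmed on the holdout split.",
    new_hypothesis="Tune the learning-rate schedule next.",
    reason="[Effectiveness Analysis] Better than SOTA, but epochs/LR look unoptimized.",
    decision=True,           # replace the best (SOTA) result
    refine_decision=True,    # ...but route the experiment to a refinement pass
)
if fb.decision and fb.refine_decision:
    print("keep result as new SOTA and schedule a refinement iteration")
```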
1 change: 1 addition & 0 deletions rdagent/scenarios/data_science/dev/feedback.py
@@ -123,6 +123,7 @@ def generate_feedback(self, exp: DSExperiment, trace: DSTrace) -> ExperimentFeed
                 resp_dict, "Code Change Summary", "No code change summary provided"
             ),
             decision=convert2bool(dict_get_with_warning(resp_dict, "Replace Best Result", "no")),
+            refine_decision=convert2bool(dict_get_with_warning(resp_dict, "Refine Decision", "no")),
         )
 
         if hypothesis_feedback and DS_RD_SETTING.enable_knowledge_base:
20 changes: 15 additions & 5 deletions rdagent/scenarios/data_science/dev/prompts.yaml
@@ -5,9 +5,9 @@ exp_feedback:
     Below is a detailed description of the current Kaggle competition scenario:
     {{ scenario }}
 
-    Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them with previous experiments and the best previous result (SOTA).
+    Your task is to analyze the current experiment's hypothesis, implementation (code and its changes), and results, explicitly comparing them step by step with the previous best (SOTA) result.
 
-    Step-by-step Analysis Process:
+    # Step-by-step Analysis Process:
 
     Step 1: Verify Submission Format
     - If the submission format check fails:
@@ -57,9 +57,18 @@ exp_feedback:
     - Please examine the code carefully based on the above criteria and provide a detailed analysis of the code.
     - Begin your `reasoning` with `[Code Analysis]`, clearly stating why the current code is better or worse than SOTA, based on the analysis of the code implementation.
     - If the current code is not better than SOTA, set `"Replace Best Result": "no"`. Otherwise, set `"Replace Best Result": "yes"`.
 
-    Provide detailed and constructive feedback structured as follows:
-    Example JSON Structure for Result Analysis:
+    Step 5: Analyze Code Effectiveness
+    - Goal: After confirming that the current code replaces the SOTA, determine whether further refinement is needed in terms of efficiency and hyperparameter optimization. Assess whether the current implementation can be improved for better resource usage, faster running time, or more optimal hyperparameter settings.
+    - If the current code is effective in terms of both efficiency and hyperparameters:
+      - Set `"Replace Best Result": "yes"`.
+      - Set `"Refine Decision": "no"`.
+    - If the current code is NOT effective in terms of efficiency or hyperparameters, and further refinement is necessary:
+      - Set `"Replace Best Result": "yes"`.
+      - Set `"Refine Decision": "yes"`.
+    - Begin your `reasoning` with `[Effectiveness Analysis]`, clearly stating whether the current code is effective enough or needs further refinement, especially regarding hyperparameter choices and efficiency.
+
+    Provide detailed and constructive feedback structured as follows, without anything else:
     {
       "Submission Format Check": "yes or no",
       "First Valid Submission": "yes or no",
@@ -68,6 +77,7 @@
       "Feedback for Hypothesis": "Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences.",
       "Evaluation Aligned With Task": "yes or no",
       "Replace Best Result": "yes or no",
+      "Refine Decision": "yes or no",
      "Reasoning": "Clearly explain the reason for the success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis], depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences."
     }
 
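Reviewer note: taken together with the `Refine Decision` field added to the JSON template above, the commit "use rule-based logic for router" suggests a simple dispatch on the parsed feedback. A sketch under that assumption; this is not the PR's actual router code, and the step names are invented for illustration.

```python
# Toy router over the feedback JSON; the yes/no coercion mirrors the
# convert2bool(dict_get_with_warning(...)) pattern in feedback.py above.
import json

def route_next_step(raw_response: str) -> str:
    def to_bool(value) -> bool:
        return str(value).strip().lower() in {"yes", "true", "1"}

    resp = json.loads(raw_response)
    replace_best = to_bool(resp.get("Replace Best Result", "no"))
    refine = to_bool(resp.get("Refine Decision", "no"))
    if replace_best and refine:
        return "refine"    # new SOTA, but efficiency/hyperparameters still need work
    if replace_best:
        return "propose"   # new SOTA and effective: move on to a fresh hypothesis
    return "fix"           # not better than SOTA: keep iterating on the current idea

print(route_next_step('{"Replace Best Result": "yes", "Refine Decision": "yes"}'))  # refine
```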
5 changes: 2 additions & 3 deletions rdagent/scenarios/data_science/loop.py
@@ -30,7 +30,7 @@
 from rdagent.scenarios.data_science.dev.feedback import DSExperiment2Feedback
 from rdagent.scenarios.data_science.dev.runner import DSCoSTEERRunner
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
-from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace
+from rdagent.scenarios.data_science.proposal.exp_gen import DSExpGen, DSTrace
 from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSKnowledgeBase
 from rdagent.utils.workflow.misc import wait_retry
@@ -113,8 +113,7 @@ def __init__(self, PROP_SETTING: BasePropSetting):
 
         self.ckp_selector = import_class(PROP_SETTING.selector_name)()
         self.sota_exp_selector = import_class(PROP_SETTING.sota_exp_selector_name)()
-
-        self.exp_gen: ExpGen = self._get_exp_gen(PROP_SETTING.hypothesis_gen, scen)
+        self.exp_gen = DSExpGen(scen)
 
         # coders
         self.data_loader_coder = DataLoaderCoSTEER(scen)