1 change: 1 addition & 0 deletions rdagent/components/coder/data_science/pipeline/__init__.py
@@ -83,6 +83,7 @@ def implement_one_task(
package_info=target_task.package_info,
enable_model_dump=DS_RD_SETTING.enable_model_dump,
enable_debug_mode=DS_RD_SETTING.sample_data_by_LLM,
spec=T("scenarios.data_science.share:component_spec.Pipeline").r(metric_name=self.scen.metric_name),
)
user_prompt = T(".prompts:pipeline_coder.user").r(
competition_info=competition_info,
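For readers unfamiliar with the template layer, the effect of the added `spec=` argument (rendering the shared Pipeline spec with the competition's metric name instead of `{% include %}`-ing it unparameterized, as the prompts.yaml change below shows) can be sketched with plain Jinja2. This is a minimal illustration under the assumption that `T(...).r(...)` behaves like a Jinja render; the template strings are stand-ins, not the repository's actual templates.

```python
from jinja2 import Environment

env = Environment()

# Stand-in for component_spec.Pipeline: the spec text can now reference the metric name.
spec_template = env.from_string(
    "Report validation scores using the competition metric: {{ metric_name }}."
)

# Stand-in for pipeline_coder.system: the pre-rendered spec is injected as a plain variable.
system_template = env.from_string(
    "# Specification your code should follow\n{{ spec }}"
)

spec = spec_template.render(metric_name="RMSLE")    # analogous to T(...).r(metric_name=...)
system_prompt = system_template.render(spec=spec)   # analogous to the spec= kwarg added above
print(system_prompt)
```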
15 changes: 7 additions & 8 deletions rdagent/components/coder/data_science/pipeline/eval.py
@@ -100,11 +100,6 @@ def evaluate(
else:
stdout += "Debug mode did not provide debug_time or estimated_time, it's a buggy implementation.\n"

test_eval = get_test_eval()
if test_eval.enabled(self.scen.competition):
submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "

score_fp = implementation.workspace_path / "scores.csv"
score_ret_code = 0
score_check_text = ""
@@ -141,7 +136,11 @@ def evaluate(
score_check_text += f"\n[Error] in checking the scores.csv file: {e}\nscores.csv's content:\n-----\n{score_fp.read_text()}\n-----"
score_ret_code = 1

if not test_eval.is_sub_enabled(self.scen.competition):
test_eval = get_test_eval()
if DS_RD_SETTING.sample_data_by_LLM and test_eval.enabled(self.scen.competition):
submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
elif not test_eval.is_sub_enabled(self.scen.competition):
submission_ret_code = 0
else:
# Check submission file
@@ -167,14 +166,14 @@ def evaluate(
system_prompt = T(".prompts:pipeline_eval.system").r(
is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
debug_mode=DS_RD_SETTING.sample_data_by_LLM,
mle_check=(DS_RD_SETTING.sample_data_by_LLM and test_eval.is_sub_enabled(self.scen.competition)),
mle_check=DS_RD_SETTING.sample_data_by_LLM,
queried_similar_successful_knowledge=queried_similar_successful_knowledge,
)
user_prompt = T(".prompts:pipeline_eval.user").r(
scenario=self.scen.get_scenario_all_desc(eda_output=eda_output),
task_desc=target_task.get_task_information(),
stdout=stdout.strip(),
spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
spec=T("scenarios.data_science.share:component_spec.Pipeline").r(metric_name=self.scen.metric_name),
code=implementation.file_dict["main.py"],
)
wfb = build_cls_from_json_with_retry(
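To make the reordered logic easier to follow, here is a simplified sketch of the branching that eval.py ends up with after this change: the submission check now runs after the scores.csv check, and only in LLM-sampled debug mode when the test evaluator covers the competition. The helper passed in as `check_submission_file` and the other names are simplified placeholders, not the repository's actual signatures.

```python
def run_submission_check(scen, implementation, stdout, sample_data_by_llm, test_eval, check_submission_file):
    """Simplified view of the post-change branching in pipeline/eval.py (placeholder names)."""
    if sample_data_by_llm and test_eval.enabled(scen.competition):
        # Debug-mode runs on sampled data: delegate to the MLE-style validity check.
        out, ret_code = test_eval.valid(scen.competition, implementation)
        stdout += f"\n### Submission check:\n{out}\n"
    elif not test_eval.is_sub_enabled(scen.competition):
        # No target submission format is available: skip the check entirely.
        ret_code = 0
    else:
        # Otherwise check the submission file produced in the workspace directly.
        out, ret_code = check_submission_file(implementation)
        stdout += out
    return stdout, ret_code
```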
75 changes: 28 additions & 47 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -26,7 +26,7 @@ pipeline_coder:
{% include "scenarios.data_science.share:spec.hyperparameter" %}

# Specification your code should follow
{% include "scenarios.data_science.share:component_spec.Pipeline" %}
{{ spec }}

{% if queried_former_failed_knowledge|length != 0 %}
## Previous Failed Attempts
@@ -112,10 +112,10 @@ pipeline_coder:
```
In debug mode, your code should run faster, so the environment will set a shorter time limit than the standard time limit for your code.
For example, you can sample ten percent of the training data and run for one epoch, then the full run with ten epochs will take one hundred times the time taken for the debug run. The scale is calculated by yourself depending on the data sampling and epoch number you choose. If your full run enables early stopping, the scale should be smaller considering the early stopping will stop the training earlier than the full epochs.
Be careful about the train-valid split strategy. StratifiedShuffleSplit is highly risk since the data has some categories with only one sample. If you use StratifiedShuffleSplit, you should consider using a try-except block to catch the error and use a different split strategy if the error occurs. Example code:
Be careful about the train-valid split strategy. Stratification-based splits are highly risky since the data has some categories with only one sample. If you use a stratification-based split, you should consider using a try-except block to catch the error and fall back to a different split strategy if the error occurs. Example code:
```python
try:
fold_indices = StratifiedKFold(...).split(train_X, train_y) or StratifiedShuffleSplit(...).split(train_X, train_y)
fold_indices = StratifiedKFold(...).split(train_X, train_y) or StratifiedShuffleSplit or StratifiedSubsetSampler etc.
except Exception as e:
fold_indices = KFold(...).split(train_X, train_y) or other split strategy
```
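As a concrete illustration of the fallback pattern the prompt asks for, the sketch below tries a stratified splitter and falls back to plain KFold when stratification fails because a class has a single sample. It assumes scikit-learn and uses toy data; a real pipeline would substitute its own `train_X` and `train_y`.

```python
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, KFold

# Toy data in which one class has a single sample, which stratified splitters reject.
train_X = np.arange(20).reshape(10, 2)
train_y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 2])  # class 2 appears only once

try:
    splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    fold_indices = list(splitter.split(train_X, train_y))  # raises ValueError for the singleton class
except ValueError as err:
    print(f"Stratified split failed ({err}); falling back to KFold.")
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_indices = list(splitter.split(train_X))

for fold, (train_idx, valid_idx) in enumerate(fold_indices):
    print(f"fold {fold}: {len(train_idx)} train rows, {len(valid_idx)} valid rows")
```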
@@ -206,10 +206,9 @@ pipeline_eval:
3. A code implementation and its execution output.
Your task is to rigorously evaluate the code implementation against the provided scenario and task description, ensuring it meets all requirements, adheres to the specified structure, and executes successfully.

{% if is_sub_enabled %}
## Evaluation Steps
## Evaluation Aspects

### Step 1: Execution Success
### Execution Success
- Goal: Ensure the code executes successfully without any errors.
- Notes:
- Model performance is not evaluated in this step; focus solely on successful execution.
@@ -219,22 +218,7 @@ pipeline_eval:
- If the code does not execute successfully:
- Set the "final_decision" to false and write complete analysis in the "execution" field.

### Step 2: Submission File Authenticity and Format
- Goal: Verify that the code correctly generates the final submission in the expected format and that the submission is authentic.
- Guidelines:
- The submission file must strictly match the required structure (correct columns, index format, data types). The index names and column names must be identical to the sample submission.
- Rigorously verify that the submission file was produced by genuine model inference and successful code execution, not by cheating, fallback or exception-handling mechanisms.
- The submission must be generated from genuine model predictions using the best saved model—never empty, constant, random, or hard-coded values.
- Submissions must reflect authentic model outputs; any form of fabrication, cheating, or simulated results is strictly prohibited and grounds for rejection.
- Cross-check both code logic and stdout to ensure predictions originate from real model inference, not from error recovery or placeholder code paths.
- Only check the format of the submission since only part of the data is provided; the submission might have a different index than the sample submission data.
- Verify honest failure reporting if training issues occur.
- If the code passes this step:
- Proceed to Step 3.
- If the code does not pass this step:
- Set the "final_decision" to false and clearly document the issues in the "return_checking" field.

### Step 3: Competition Alignment
### Competition Alignment
- Goal: Confirm strict adherence to the competition's evaluation rules and experimental setup.
- Guidelines:
- Analyze whether the experimental setup and code may cause misalignment between validation and test performance.
@@ -251,7 +235,7 @@
- Begin the "code" with `[Evaluation error]`, explicitly document any evaluation alignment issues causing experiment failure.

{% if debug_mode %}
### Step 4: Debug Mode Compliance
### Debug Mode Compliance
- Goal: Ensure the code follows debug mode requirements.
- Guidelines:
- Sufficient debugging information (print statements, clear error messages) should be included to facilitate automatic improvement processes.
@@ -263,15 +247,31 @@
- Debug time should be reasonable and the estimated time should be reasonable based on the debug time.
- Data sampling should only be applied in debug mode. Always use the full data in the full run.
- The label classes number should be the same as the full run even in debug mode.
- If the code passes this step: Finalize evaluation.
- If the code passes this step: Proceed to the next aspects.
- If the code does not pass this step: Clearly document the debug mode compliance issues and reject the implementation.{% endif %}
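For reference, the timing relationship these compliance checks describe (a debug run on sampled data whose measured time is scaled up into an estimated full-run time) can be sketched as follows. The exact output format the prompt requires is defined in a part of prompts.yaml not shown in this diff, so the sampling fraction, epoch counts, and print layout below are placeholders.

```python
import time

DEBUG_MODE = True          # would come from the pipeline's debug flag
SAMPLE_FRACTION = 0.1      # debug mode trains on 10% of the data (assumed value)
DEBUG_EPOCHS, FULL_EPOCHS = 1, 10

start = time.time()
# ... sampled, single-epoch training would run here ...
debug_time = time.time() - start

# Scale factor: inverse of the data fraction times the epoch ratio, as the prompt describes.
scale = (1 / SAMPLE_FRACTION) * (FULL_EPOCHS / DEBUG_EPOCHS)   # 100 in this example
estimated_time = debug_time * scale

if DEBUG_MODE:
    # Placeholder layout: the real prompt specifies the exact lines the evaluator expects.
    print(f"debug_time: {debug_time:.2f} seconds")
    print(f"estimated_time: {estimated_time:.2f} seconds")
```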


### Submission File Format Check
{% if mle_check %}
### Step 5: Test format check
- The user has done a format check for your submission. Since you didn't sample any test data, your debug mode output should be the same format as the full run.
- The user will put the check result in the "Submission check" section of the execution output.
- If the submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should give the conclusion that the code executed successfully. If no other code related issues are found, set the "final_decision" to true.
- If the submission check returns an error message, you should set the "final_decision" to false and clearly document the issues in the "return_checking" field.
{% elif is_sub_enabled %}
- Goal: Verify that the code correctly generates the final submission in the expected format and that the submission is authentic.
- Guidelines:
- The submission file must strictly match the required structure (correct columns, index format, data types). The index names and column names must be identical to the format specified in the Competition Information's '====== Submission Format ======' section.
- Rigorously verify that the submission file was produced by genuine model inference and successful code execution, not by cheating, fallback or exception-handling mechanisms.
- The submission must be generated from genuine model predictions using the best saved model—never empty, constant, random, or hard-coded values.
- Submissions must reflect authentic model outputs; any form of fabrication, cheating, or simulated results is strictly prohibited and grounds for rejection.
- Cross-check both code logic and stdout to ensure predictions originate from real model inference, not from error recovery or placeholder code paths.
- Only check the format of the submission since only part of the data is provided; the submission might have a different index than expected due to data sampling.
- Verify honest failure reporting if training issues occur.
- If the code passes this step: Finalize evaluation.
- If the code does not pass this step:
- Set the "final_decision" to false and clearly document the issues in the "return_checking" field.
{% else %}
Submission File Format Check is not conducted since no target submission format is provided. You should consider the submission file valid.
{% endif %}
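A minimal sketch of the structural check described above (column names and index name match the required submission format, with no judgement of prediction quality) might look like the following; the expected schema and the file path are placeholders.

```python
import pandas as pd

EXPECTED_COLUMNS = ["id", "target"]   # placeholder: taken from the competition's submission format
EXPECTED_INDEX_NAME = None            # placeholder: e.g. "id" if the format uses a named index

submission = pd.read_csv("submission.csv")  # placeholder path inside the workspace

problems = []
if list(submission.columns) != EXPECTED_COLUMNS:
    problems.append(f"columns {list(submission.columns)} != expected {EXPECTED_COLUMNS}")
if submission.index.name != EXPECTED_INDEX_NAME:
    problems.append(f"index name {submission.index.name!r} != expected {EXPECTED_INDEX_NAME!r}")
if submission.isna().any().any():
    problems.append("submission contains missing values")

print("Submission is valid" if not problems else "Submission check failed: " + "; ".join(problems))
```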

{% if queried_similar_successful_knowledge|length != 0 %}
@@ -290,35 +290,16 @@
Please respond with your feedback in the following JSON format without anything else.
```json
{
"execution": "Describe whether the code executed successfully, correctly integrating all components and generating the final submission. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
"return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches sample submission (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
"execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
"return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches required submission format (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
"code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and competition requirements? (2) **Effective Components**: What specific parts work well and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
"final_decision": <true/false>
}
```
{% else %}
## Evaluation Scope
Your focus is to check whether the workflow code executes successfully.

You will be given the execution output (`stdout`) to determine correctness.

[Note]
1. Model performance is NOT a concern in this evaluation—only correct execution and formatting matter.

Please respond with your feedback in the following JSON format and order
```json
{
"execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
"return_checking": "Describe the expected file to be generated.",
"code": "Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and requirements? (2) **Effective Components**: What specific parts work well and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
"final_decision": <true/false>
}
```
{% endif %}
# NOTE: when is_sub_enabled == False, we don't have any checking about the return. So it is just placeholder currently

user: |-
# Competition Scenario
# Competition Information
{{ scenario }}

# Task Description
4 changes: 2 additions & 2 deletions rdagent/scenarios/data_science/dev/runner/eval.py
@@ -165,7 +165,7 @@ def evaluate(

if test_eval.enabled(self.scen.competition):
submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
stdout += f"\nSubmission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "

time_spent_ratio = implementation.running_info.running_time / env.conf.running_timeout_period
if (
@@ -179,12 +179,12 @@

system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
task_desc=target_task.get_task_information(),
enable_hyperparameter_tuning_check=enable_hyperparameter_tuning_check,
)
user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
code=implementation.all_codes,
change_summary=implementation.change_summary,
stdout=shrink_text(stdout),
time_spent=f"{implementation.running_info.running_time:.2f} seconds",
timeout=f"{env.conf.running_timeout_period} seconds",
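The `time_spent_ratio` computed in this file feeds a guard whose condition is cut off in this view. Purely to illustrate the pattern, a hypothetical version is sketched below; the threshold and the flag it drives are assumptions, not the repository's actual logic.

```python
# Hypothetical illustration only: the real condition is not visible in this diff.
running_time = 1800.0              # seconds the experiment actually took (example value)
running_timeout_period = 7200.0    # configured timeout in seconds (example value)

time_spent_ratio = running_time / running_timeout_period

# Assumed pattern: only ask for hyperparameter-tuning feedback when the run used
# a small share of its time budget, so a longer run would still fit the timeout.
ASSUMED_THRESHOLD = 0.5
enable_hyperparameter_tuning_check = time_spent_ratio < ASSUMED_THRESHOLD
print(f"time_spent_ratio={time_spent_ratio:.2f}, tuning check: {enable_hyperparameter_tuning_check}")
```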