diff --git a/rdagent/components/coder/data_science/pipeline/__init__.py b/rdagent/components/coder/data_science/pipeline/__init__.py
index 7c375dc7d..38f99efde 100644
--- a/rdagent/components/coder/data_science/pipeline/__init__.py
+++ b/rdagent/components/coder/data_science/pipeline/__init__.py
@@ -83,6 +83,7 @@ def implement_one_task(
             package_info=target_task.package_info,
             enable_model_dump=DS_RD_SETTING.enable_model_dump,
             enable_debug_mode=DS_RD_SETTING.sample_data_by_LLM,
+            spec=T("scenarios.data_science.share:component_spec.Pipeline").r(metric_name=self.scen.metric_name),
         )
         user_prompt = T(".prompts:pipeline_coder.user").r(
             competition_info=competition_info,
diff --git a/rdagent/components/coder/data_science/pipeline/eval.py b/rdagent/components/coder/data_science/pipeline/eval.py
index 8c6774bc7..3d8cbf4ea 100644
--- a/rdagent/components/coder/data_science/pipeline/eval.py
+++ b/rdagent/components/coder/data_science/pipeline/eval.py
@@ -100,11 +100,6 @@ def evaluate(
         else:
             stdout += "Debug mode did not provide debug_time or estimated_time, it's a buggy implementation.\n"

-        test_eval = get_test_eval()
-        if test_eval.enabled(self.scen.competition):
-            submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
-            stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
-
         score_fp = implementation.workspace_path / "scores.csv"
         score_ret_code = 0
         score_check_text = ""
@@ -141,7 +136,11 @@ def evaluate(
                 score_check_text += f"\n[Error] in checking the scores.csv file: {e}\nscores.csv's content:\n-----\n{score_fp.read_text()}\n-----"
                 score_ret_code = 1

-        if not test_eval.is_sub_enabled(self.scen.competition):
+        test_eval = get_test_eval()
+        if DS_RD_SETTING.sample_data_by_LLM and test_eval.enabled(self.scen.competition):
+            submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
+            stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
+        elif not test_eval.is_sub_enabled(self.scen.competition):
             submission_ret_code = 0
         else:
             # Check submission file
@@ -167,14 +166,14 @@ def evaluate(
         system_prompt = T(".prompts:pipeline_eval.system").r(
             is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
             debug_mode=DS_RD_SETTING.sample_data_by_LLM,
-            mle_check=(DS_RD_SETTING.sample_data_by_LLM and test_eval.is_sub_enabled(self.scen.competition)),
+            mle_check=DS_RD_SETTING.sample_data_by_LLM,
             queried_similar_successful_knowledge=queried_similar_successful_knowledge,
         )
         user_prompt = T(".prompts:pipeline_eval.user").r(
             scenario=self.scen.get_scenario_all_desc(eda_output=eda_output),
             task_desc=target_task.get_task_information(),
             stdout=stdout.strip(),
-            spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
+            spec=T("scenarios.data_science.share:component_spec.Pipeline").r(metric_name=self.scen.metric_name),
             code=implementation.file_dict["main.py"],
         )
         wfb = build_cls_from_json_with_retry(
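Reviewer note: the hunks above move the submission check after the `scores.csv` check and gate it on debug mode. A minimal sketch of the resulting control flow, for illustration only — `get_test_eval`, its `enabled`/`valid`/`is_sub_enabled` methods, and the workspace layout are taken from the diff, while the wrapper function and its parameters are assumed:

```python
from pathlib import Path


def submission_check_order(workspace: Path, competition: str, debug_mode: bool, test_eval) -> str:
    """Illustrative only: mirrors the control flow of the edited evaluate() above."""
    stdout = ""

    # 1) scores.csv is always inspected first, independent of the submission check.
    score_fp = workspace / "scores.csv"
    if not score_fp.exists():
        stdout += "[Error] scores.csv not found.\n"

    # 2) The submission format check is now gated on debug mode (LLM-sampled data)
    #    and on the competition being supported by the offline test evaluator.
    if debug_mode and test_eval.enabled(competition):
        check_out, _ = test_eval.valid(competition, workspace)
        stdout += f"\n### Submission check:\n{check_out}\n"
    elif not test_eval.is_sub_enabled(competition):
        stdout += "\nSubmission check skipped: no submission target for this competition.\n"

    return stdout
```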
" + elif not test_eval.is_sub_enabled(self.scen.competition): submission_ret_code = 0 else: # Check submission file @@ -167,14 +166,14 @@ def evaluate( system_prompt = T(".prompts:pipeline_eval.system").r( is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition), debug_mode=DS_RD_SETTING.sample_data_by_LLM, - mle_check=(DS_RD_SETTING.sample_data_by_LLM and test_eval.is_sub_enabled(self.scen.competition)), + mle_check=DS_RD_SETTING.sample_data_by_LLM, queried_similar_successful_knowledge=queried_similar_successful_knowledge, ) user_prompt = T(".prompts:pipeline_eval.user").r( scenario=self.scen.get_scenario_all_desc(eda_output=eda_output), task_desc=target_task.get_task_information(), stdout=stdout.strip(), - spec=T("scenarios.data_science.share:component_spec.Pipeline").r(), + spec=T("scenarios.data_science.share:component_spec.Pipeline").r(metric_name=self.scen.metric_name), code=implementation.file_dict["main.py"], ) wfb = build_cls_from_json_with_retry( diff --git a/rdagent/components/coder/data_science/pipeline/prompts.yaml b/rdagent/components/coder/data_science/pipeline/prompts.yaml index 86594d938..2b38e771d 100644 --- a/rdagent/components/coder/data_science/pipeline/prompts.yaml +++ b/rdagent/components/coder/data_science/pipeline/prompts.yaml @@ -26,7 +26,7 @@ pipeline_coder: {% include "scenarios.data_science.share:spec.hyperparameter" %} # Specification your code should follow - {% include "scenarios.data_science.share:component_spec.Pipeline" %} + {{ spec }} {% if queried_former_failed_knowledge|length != 0 %} ## Previous Failed Attempts @@ -112,10 +112,10 @@ pipeline_coder: ``` In debug mode, your code should run faster, so the environment will set a shorter time limit than the standard time limit for your code. For example, you can sample ten percent of the training data and run for one epoch, then the full run with ten epochs will take one hundred times the time taken for the debug run. The scale is calculated by yourself depending on the data sampling and epoch number you choose. If your full run enables early stopping, the scale should be smaller considering the early stopping will stop the training earlier than the full epochs. - Be careful about the train-valid split strategy. StratifiedShuffleSplit is highly risk since the data has some categories with only one sample. If you use StratifiedShuffleSplit, you should consider using a try-except block to catch the error and use a different split strategy if the error occurs. Example code: + Be careful about the train-valid split strategy. Stratified related split is highly risk since the data has some categories with only one sample. If you use Stratified related split, you should consider using a try-except block to catch the error and use a different split strategy if the error occurs. Example code: ```python try: - fold_indices = StratifiedKFold(...).split(train_X, train_y) or StratifiedShuffleSplit(...).split(train_X, train_y) + fold_indices = StratifiedKFold(...).split(train_X, train_y) or StratifiedShuffleSplit or StratifiedSubsetSampler etc. except Exception as e: fold_indices = KFold(...).split(train_X, train_y) or other split strategy ``` @@ -206,10 +206,9 @@ pipeline_eval: 3. A code implementation and its execution output. Your task is to rigorously evaluate the code implementation against the provided scenario and task description, ensuring it meets all requirements, adheres to the specified structure, and executes successfully. 
@@ -206,10 +206,9 @@ pipeline_eval:
     3. A code implementation and its execution output.
     Your task is to rigorously evaluate the code implementation against the provided scenario and task description, ensuring it meets all requirements, adheres to the specified structure, and executes successfully.

-    {% if is_sub_enabled %}
-    ## Evaluation Steps
+    ## Evaluation Aspects

-    ### Step 1: Execution Success
+    ### Execution Success
     - Goal: Ensure the code executes successfully without any errors.
     - Notes:
       - Model performance is not evaluated in this step; focus solely on successful execution.
@@ -219,22 +218,7 @@ pipeline_eval:
     - If the code does not execute successfully:
       - Set the "final_decision" to false and write complete analysis in the "execution" field.

-    ### Step 2: Submission File Authenticity and Format
-    - Goal: Verify that the code correctly generates the final submission in the expected format and that the submission is authentic.
-    - Guidelines:
-      - The submission file must strictly match the required structure (correct columns, index format, data types). The index names and column names must be identical to the sample submission.
-      - Rigorously verify that the submission file was produced by genuine model inference and successful code execution, not by cheating, fallback or exception-handling mechanisms.
-      - The submission must be generated from genuine model predictions using the best saved model—never empty, constant, random, or hard-coded values.
-      - Submissions must reflect authentic model outputs; any form of fabrication, cheating, or simulated results is strictly prohibited and grounds for rejection.
-      - Cross-check both code logic and stdout to ensure predictions originate from real model inference, not from error recovery or placeholder code paths.
-      - Only check the format of the submission since only part of the data is provided; the submission might have a different index than the sample submission data.
-      - Verify honest failure reporting if training issues occur.
-    - If the code passes this step:
-      - Proceed to Step 3.
-    - If the code does not pass this step:
-      - Set the "final_decision" to false and clearly document the issues in the "return_checking" field.
-
-    ### Step 3: Competition Alignment
+    ### Competition Alignment
     - Goal: Confirm strict adherence to the competition's evaluation rules and experimental setup.
     - Guidelines:
       - Analyze whether the experimental setup and code may cause misalignment between validation and test performance.
@@ -251,7 +235,7 @@ pipeline_eval:
       - Begin the "code" with `[Evaluation error]`, explicitly document any evaluation alignment issues causing experiment failure.

     {% if debug_mode %}
-    ### Step 4: Debug Mode Compliance
+    ### Debug Mode Compliance
     - Goal: Ensure the code follows debug mode requirements.
     - Guidelines:
       - Sufficient debugging information (print statements, clear error messages) should be included to facilitate automatic improvement processes.
@@ -263,15 +247,31 @@ pipeline_eval:
       - Debug time should be reasonable and the estimated time should be reasonable based on the debug time.
       - Data sampling should only be applied in debug mode. Always use the full data in the full run.
       - The label classes number should be the same as the full run even in debug mode.
-    - If the code passes this step: Finalize evaluation.
+    - If the code passes this step: Proceed to the next aspects.
     - If the code does not pass this step: Clearly document the debug mode compliance issues and reject the implementation.{% endif %}
+
+    ### Submission File Format Check
     {% if mle_check %}
-    ### Step 5: Test format check
-    The user has done a format check for your submission. Since you didn't sample any test data, your debug mode output should be the same format as the full run.
     - The user will put the check result in the "Submission check" section of the execution output.
     - If the submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should give the conclusion that the code executed successfully. If no other code related issues are found, set the "final_decision" to true.
     - If the submission check returns an error message, you should set the "final_decision" to false and clearly document the issues in the "return_checking" field.
+    {% elif is_sub_enabled %}
+    - Goal: Verify that the code correctly generates the final submission in the expected format and that the submission is authentic.
+    - Guidelines:
+      - The submission file must strictly match the required structure (correct columns, index format, data types). The index names and column names must be identical to the format specified in the Competition Information's '====== Submission Format ======' section.
+      - Rigorously verify that the submission file was produced by genuine model inference and successful code execution, not by cheating, fallback or exception-handling mechanisms.
+      - The submission must be generated from genuine model predictions using the best saved model—never empty, constant, random, or hard-coded values.
+      - Submissions must reflect authentic model outputs; any form of fabrication, cheating, or simulated results is strictly prohibited and grounds for rejection.
+      - Cross-check both code logic and stdout to ensure predictions originate from real model inference, not from error recovery or placeholder code paths.
+      - Only check the format of the submission since only part of the data is provided; the submission might have a different index than expected due to data sampling.
+      - Verify honest failure reporting if training issues occur.
+    - If the code passes this step: Finalize evaluation.
+    - If the code does not pass this step:
+      - Set the "final_decision" to false and clearly document the issues in the "return_checking" field.
+    {% else %}
+    The Submission File Format Check is not conducted since no target submission format is provided. You should consider the submission file valid.
     {% endif %}

     {% if queried_similar_successful_knowledge|length != 0 %}
@@ -290,35 +290,16 @@ pipeline_eval:
     Please respond with your feedback in the following JSON format without anything else.
     ```json
     {
-        "execution": "Describe whether the code executed successfully, correctly integrating all components and generating the final submission. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
-        "return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches sample submission (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
+        "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
+        "return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches required submission format (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
         "code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and competition requirements? (2) **Effective Components**: What specific parts work well and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
         "final_decision": <true/false>
     }
     ```
-
-    {% else %}
-    ## Evaluation Scope
-    Your focus is to check whether the workflow code executes successfully.
-    You will be given the execution output (`stdout`) to determine correctness.
-
-    [Note]
-    1. Model performance is NOT a concern in this evaluation—only correct execution and formatting matter.
-
-    Please respond with your feedback in the following JSON format and order
-    ```json
-    {
-        "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
-        "return_checking": "Describe the expected file to be generated.",
-        "code": "Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and requirements? (2) **Effective Components**: What specific parts work well and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
-        "final_decision": <true/false>
-    }
-    ```
-    {% endif %}
-# NOTE: when is_sub_enabled == False, we don't have any checking about the return. So it is just placeholder currently

   user: |-
-    # Competition Scenario
+    # Competition Information
     {{ scenario }}

     # Task Description
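Reviewer note: the `{{ spec }}` placeholder introduced above is filled by rendering the shared `component_spec.Pipeline` template with the competition's metric name. A minimal stand-alone illustration of the same mechanism using plain Jinja2 — the template text below is a made-up stub, not the real shared spec, and `jinja2.Template` stands in for the project's `T(...).r(...)` helper:

```python
from jinja2 import Template

# Stub standing in for scenarios.data_science.share:component_spec.Pipeline
spec_template = Template(
    'The column names in `scores.csv` should be ["{{ metric_name }}"].\n'
    "Only one column is required."
)

# The rendered text is what ends up interpolated into the coder/eval prompts as {{ spec }}.
spec = spec_template.render(metric_name="AUC")
print(spec)
```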
diff --git a/rdagent/scenarios/data_science/dev/runner/eval.py b/rdagent/scenarios/data_science/dev/runner/eval.py
index 5c8bbb44e..9121f0730 100644
--- a/rdagent/scenarios/data_science/dev/runner/eval.py
+++ b/rdagent/scenarios/data_science/dev/runner/eval.py
@@ -165,7 +165,7 @@ def evaluate(

         if test_eval.enabled(self.scen.competition):
             submission_check_out, submission_ret_code = test_eval.valid(self.scen.competition, implementation)
-            stdout += f"\nSubmission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "
+            stdout += f"\n### Submission check:\n{submission_check_out}\nIf Submission check returns a 'Submission is valid' or similar message, despite some warning messages, you should still consider the submission as valid and give a positive final decision. "

         time_spent_ratio = implementation.running_info.running_time / env.conf.running_timeout_period
         if (
@@ -179,12 +179,12 @@ def evaluate(

         system_prompt = T(".prompts:DSCoSTEER_eval.system").r(
             scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
-            is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
             task_desc=target_task.get_task_information(),
             enable_hyperparameter_tuning_check=enable_hyperparameter_tuning_check,
         )
         user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
             code=implementation.all_codes,
+            change_summary=implementation.change_summary,
             stdout=shrink_text(stdout),
             time_spent=f"{implementation.running_info.running_time:.2f} seconds",
             timeout=f"{env.conf.running_timeout_period} seconds",
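Reviewer note: the new `change_summary` argument above is surfaced to the evaluator through the `{% if change_summary is not none %}` block added to the runner prompts later in this diff. A tiny, self-contained illustration of that conditional rendering — plain Jinja2 is used here as a stand-in for the project's template helper, and the code/summary strings are placeholders:

```python
from jinja2 import Template

user_tpl = Template(
    "# Current Code Base\n{{ code }}\n"
    "{% if change_summary is not none %}"
    "# Current Code Change Summary\n{{ change_summary }}{% endif %}"
)

# When the workspace carries a summary of the latest edit, it is shown to the evaluator;
# otherwise the section is omitted entirely.
print(user_tpl.render(code="print('main.py ...')", change_summary="Tuned n_estimators from 100 to 500"))
print(user_tpl.render(code="print('main.py ...')", change_summary=None))
```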
diff --git a/rdagent/scenarios/data_science/dev/runner/prompts.yaml b/rdagent/scenarios/data_science/dev/runner/prompts.yaml
index 7ae6deea7..412e7acdc 100644
--- a/rdagent/scenarios/data_science/dev/runner/prompts.yaml
+++ b/rdagent/scenarios/data_science/dev/runner/prompts.yaml
@@ -1,7 +1,6 @@
 DSCoSTEER_eval:
   system: |-
     {% include "scenarios.data_science.share:scen.role" %}
-    {% if is_sub_enabled %}
     You will be provided with:
     1. `Code base`: The code base of the solution
     2. `The stdout of code execution and testing`: The generated stdout when executing the code base and corresponding testing
@@ -10,7 +9,7 @@ DSCoSTEER_eval:
     5. `The percent of timeout used`: the percentage of the time limitation used

     Your task is to perform the following evaluation(s):
-    # Evalution 1: Code Correctness
+    # Evaluation 1: Code Correctness
     ## Scenario
     The code is focusing on the following scenario:
     {{ scenario }}
@@ -22,7 +21,7 @@ DSCoSTEER_eval:
     ## Evaluation Guidelines
     1. Evaluate the code base based on several aspects, including execution correctness, return checking, and code quality.
     2. Ensure the code does not contain any incorrect, fabricated, or deceptive operations, such as mocking data, scores, or results.
-    3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission.
+    3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission. Please refer to the Submission check section, which includes the format check for the submission.
     If the code does not satisfy the requirements:
     - Set "acceptable" to false.
     If the code satisfy the requirements:
@@ -33,16 +32,18 @@ DSCoSTEER_eval:
     ## Evaluation Description
     The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
     For example, if the code uses only a very small portion of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
-    You should also notice other resources utilization hyper-parameters,
+    You should also notice other resource utilization hyperparameters.
     For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.

     ## Evaluation Guidelines
     1. The code execution time or resource utilization suggest that there is room for improvement in the hyperparameters.
     2. The code must apply early stopping strategy already (in order to prevent overfitting).
     3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
+    4. Only include the suggestions in your response, without leaking any time limit information, because the user might overfit the model to the time limit.
+    5. Never make your judgment based only on the time spent; you should also consider the code and the stdout.
     If the code satisfy the requirements:
     - Set "hyperparameter_tuning_decision" to true.
-    - In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only 15% of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
+    - In "hyperparameter_tuning_suggestion", provide a clear, specific, and actionable suggestion. Begin with a concrete observation, then state a direct action to take. Do not use vague language, options, or uncertainty (avoid words like "A or B"). For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still decreasing and early stopping was not activated. Only a small portion of the allowed time was used. [Suggestion] Increase epochs to 100 to avoid underfitting and further improve model performance."
     If the code does not satisfy the requirements:
     - Set "hyperparameter_tuning_decision" to false.
    - Set "hyperparameter_tuning_suggestion" to an empty string.
@@ -53,55 +54,13 @@ DSCoSTEER_eval:
     ```json
     {
         "execution": "Describe whether the whole code base executed successfully and generating the final submission. Include any errors or issues encountered, and retain all error messages and traceback details.",
-        "return_checking": "Verify the generated files, particularly the submission file. Ensure that its format matches the sample submission",
+        "return_checking": "Verify the generated files, particularly the submission file. Ensure that its format is valid",
         "code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
         "acceptable": <true/false>,
-        {% if enable_hyperparameter_tuning_check %}
-        "hyperparameter_tuning_decision": <true/false>,
-        "hyperparameter_tuning_suggestion": <suggestion>,
-        {% endif %}
-    }
-    ```
-    {% else %}
-    The user will provide you the whole code base, some logs generated during the execution of the whole workflow. Your evaluation scope includes whether the workflow code executes successfully.
-    No need to check the detail of submission file.
-    {% if enable_hyperparameter_tuning_check %}
-
-    # Evaluation: Hyperparameter
-    ## Evaluation Description
-    The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
-    If the code uses only a very small portion (below 25%) of the allowed time, and hyperparameters like `n_estimators` or `epochs` have low values, with early stopping not being triggered and possible signs of underfitting, you should suggest increasing these hyperparameters.
-    You should also notice other resources utilization hyper-parameters.
-    For example, if you are using a GPU with large memory, and the batch size is set very low, you should suggest increasing the batch size if it is not reasonable.
-
-    ## Evaluation Guidelines
-    1. The code execution time or resource utilization suggest that there is room for improvement in the hyperparameters.
-    2. The code must apply early stopping strategy already (in order to prevent overfitting).
-    3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
-
-    If the code satisfy the requirements:
-    - Set "hyperparameter_tuning_decision" to true.
-    - Provide a reasonable suggestion in "hyperparameter_tuning_suggestion". The "hyperparameter_tuning_suggestion" should begin with a clear observation, followed by your suggestion. For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still going down and early stopping was not activated. Only 15% of the allowed time was used. [Suggestion] We recommend increasing epochs to 100 to avoid underfitting and further improve model performance."
-    - Set "final_decision" to false.
-
-    If the code does not satisfy the requirements:
-    - Set "hyperparameter_tuning_decision" to false.
-    - Set "hyperparameter_tuning_suggestion" to an empty string.
-    {% endif %}
-
-    Please respond with your feedback in the following JSON format and order
-    ```json
-    {
-        "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information.",
-        "return_checking": "Describe the expected file to be generated.",
-        "code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
-        "acceptable": <true/false>,
         {% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_decision": <true/false>,
         "hyperparameter_tuning_suggestion": <suggestion>,{% endif %}
     }
     ```
-    {% endif %}
-# NOTE: when is_sub_enabled == False, we don't have any checking about the return. So it is just placeholder currently

   user: |-
     # Current Code base
@@ -190,6 +149,9 @@ DSCoSTEER:
   user: |-
     # Current Code Base
     {{ code }}
+    {% if change_summary is not none %}
+    # Current Code Change Summary
+    {{ change_summary }}{% endif %}

     ## Feedback of Current Code Base
     {{ feedback }}
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
index c7e222e2e..4959ec4c3 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -402,6 +402,7 @@ task_gen:
       - Implement robust handling of file encodings and delimiters.
       - Input files are under `{% include "scenarios.data_science.share:scen.input_path" %}`. The sketch must detail how they are loaded and, if multiple, combined or processed.
       - Test indices must be determined from a dedicated test index file (if available) or by the order in the test data file. **Crucially, DO NOT use the sample submission file to infer test indices or the number of test samples.**
+      - **CRITICAL: DO NOT read, load, or access the sample_submission.csv file in any part of the code implementation. The code must never contain pd.read_csv('sample_submission.csv') or similar file reading operations.**
      - Ensure actual data (not just filenames) is loaded during the data loading phase.
       - If data is in zip files, the sketch should advise on robust loading, e.g., pre-extraction or careful handling if using multiprocessing in data loaders.
     3. **Data Preprocessing**:
@@ -410,23 +411,23 @@ task_gen:
       - Implement domain-specific preprocessing relevant to the hypothesis (e.g., text tokenization, image resizing/augmentation).
     4. **Code Standards**:
       - The pipeline must **NOT** use progress bars (e.g., `tqdm`) in the submission code.
-      - Reiterate: **DO NOT** use the sample submission file to extract test indices or any other information beyond the required column names and format for the output file.
+      - **CRITICAL: DO NOT read or access the sample_submission.csv file in the code. Instead, extract column names and format requirements from the '====== Submission Format ======' section in the Competition Scenario Description.**
       - Ensure no features are inadvertently excluded during processing.
     5. **General Data Science Considerations**:
       - Design for scalability.
       - Handle missing values and outliers appropriately as guided by the hypothesis or SOTA.
       - Ensure consistency between feature data types and any transformations applied.
       - Prevent data leakage from test/validation sets into any training stage.
-      - Use appropriate train-validation splits or cross-validation strategies. Some dataset might not be suitable for StratifiedShuffleSplit since some categories may not be present in the test set. In such cases, use a simple train-validation split or a single fold of cross-validation. Implement a try except block to handle potential errors if you are using StratifiedShuffleSplit.
+      - Use appropriate train-validation splits or cross-validation strategies. Some datasets might not be suitable for stratified splits since some categories may not be present in the test set. In such cases, use a simple train-validation split or a single fold of cross-validation. Implement a try-except block to handle potential errors if you are using a stratified split.
       - Use appropriate cross-validation strategies. Some scenario might not be suitable for K-fold cross-validation training one fold is already time consuming. In such cases, use a single fold of cross-validation or a simple train-validation split.
     6. **Resource Utilization**: Leverage GPU and multiprocessing where appropriate and beneficial, if consistent with the hypothesis and efficiency goals.
     7. **Metric Calculation and Storage (`scores.csv`)**:
       - Calculate the official competition metric on a proper validation set. Save results to `scores.csv`.
       - The sketch must ensure this step is included. A successful run should always produce scores.
-      - `scores.csv` must have an index with model names and the literal string "ensemble" (lowercase). Columns should be the exact metric name (e.g., "AUC").
+      - `scores.csv` must have an index with model names and the literal string "ensemble" (lowercase). **Columns should be a single column with the exact metric name: "{{ metric_name }}".**
       - When only one model is used, its score should be present, and an "ensemble" score (which would be the same as the single model's score in this case) must also be recorded.
       - Ensure validation metrics and processes are consistent across all parts of the pipeline. Avoid changes that would alter how validation metrics are calculated unless that is part of the hypothesis.
-    8. **Submission File (`submission.csv`)**: Generate `submission.csv` in the **exact format** required (column names, order, data types), as detailed by `sample_submission.csv` in the `Competition Scenario Description`. This is a critical step.
+    8. **Submission File (`submission.csv`)**: Generate `submission.csv` in the **exact format** required (column names, order, data types), as detailed in the '====== Submission Format ======' section of the Competition Scenario Description (DO NOT read the sample_submission.csv file directly in the code). This is a critical step.
     9. **Preferred Packages Notes**:
       - You can choose the most proper packages for the task to best achieve the hypothesis.
       - When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding. Especially those you are not familiar with.
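Reviewer note: a short sketch of output files that satisfy requirements 7 and 8 above. The file names, the single metric column, and the "ensemble" index row follow the spec in this diff; the metric value, model name, and submission columns are placeholders:

```python
import pandas as pd

metric_name = "AUC"  # must match the metric announced in the scenario description

# scores.csv: model names plus the literal "ensemble" row, one column named after the metric.
scores = pd.DataFrame(
    {metric_name: [0.91, 0.91]},
    index=["xgboost", "ensemble"],
)
scores.to_csv("scores.csv")

# submission.csv: columns taken from the '====== Submission Format ======' section of the
# competition description -- the code never opens sample_submission.csv itself.
submission = pd.DataFrame({"id": [0, 1, 2], "target": [0.4, 0.7, 0.1]})
submission.to_csv("submission.csv", index=False)
```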
@@ -442,6 +443,12 @@ task_gen:

     # Guidelines for Sketching the `main.py` Workflow
     YOUR TASK IS TO create a conceptual sketch for drafting or updating the `main.py` workflow. This is a plan, not code.
+
+    ## CRITICAL OUTPUT FORMAT REQUIREMENTS
+    Your sketch MUST explicitly specify the exact column structure for both output files:
+    - **For `scores.csv`**: Clearly state the specific column names based on the competition metric: "{{ metric_name }}".
+    - **For `submission.csv`**: Extract and explicitly list the exact column names from the Competition Scenario Description's '====== Submission Format ======' section.
+    - Do NOT use vague descriptions; provide the actual column names in your sketch.

     1. **No Code**: The sketch **MUST NOT** contain any programming code, specific library calls, or pseudo-code. Describe steps conceptually (e.g., "Load training data from {% include "scenarios.data_science.share:scen.input_path" %}/train.csv"). List specific algorithm names where appropriate (e.g., "Apply XGBoost classifier," "Use Isotonic Regression for calibration").
     2. **Structure and Conciseness**:
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index d290fde11..bea8b34dc 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -913,6 +913,7 @@ def task_gen(
             task_output_format=component_info["task_output_format"] if not self.supports_response_schema else None,
             component_desc=component_desc,
             workflow_check=workflow_check,
+            metric_name=self.scen.metric_name,
         )
         user_prompt = T(".prompts_v2:task_gen.user").r(
             scenario_desc=scenario_desc,
diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml
index 5d2a87164..ef9df8a21 100644
--- a/rdagent/scenarios/data_science/scen/prompts.yaml
+++ b/rdagent/scenarios/data_science/scen/prompts.yaml
@@ -24,9 +24,7 @@ scenario_description: |-
   - Do not manipulate data or return values solely to pass preliminary tests, as this will not lead to successful final evaluation.

   ====== Evaluation ======
-  {% if metric_name %}
-  The primary evaluation metric for this task is: **{{ metric_name }}**.
-  {% endif %}
+  {% if metric_name %}The primary evaluation metric for this task is: **{{ metric_name }}**, **which should be the column name in `scores.csv`**.{% endif %}
   This metric is considered better when it is **{% if metric_direction %}larger{% else %}smaller{% endif %}**.

   {% if evaluation is not none %}
diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml
index 92c561845..2bda21e00 100644
--- a/rdagent/scenarios/data_science/share.yaml
+++ b/rdagent/scenarios/data_science/share.yaml
@@ -2,8 +2,7 @@ describe: # some template to describe some object
   # exp is a template used fo
   exp: |-
     ## {{ heading | default('Best solution of previous exploration of the scenario') }}

-    {% if exp %}
-    ### Code
+    {% if exp %}### Code
     Here is the complete code of the solution.
     {{ exp.experiment_workspace.all_codes }}
@@ -27,8 +26,7 @@ describe: # some template to describe some object
     {% endif %}
     {% endif %}

-    {% else %}
-    No previous complete experiment available.
+    {% else %}No previous complete experiment available.
     {% endif %}

   feedback: |-
@@ -241,7 +239,7 @@ component_spec:
     ```

     4. Submission File:
-      - Save the final predictions as `submission.csv`, ensuring the format matches the competition requirements (refer to `sample_submission` in the Folder Description for the correct structure).
+      - Save the final predictions as `submission.csv`, ensuring the format matches the competition requirements as detailed in the '====== Submission Format ======' section of the Competition Information (DO NOT read the sample_submission.csv file directly in the code).
       - Present the required submission format explicitly and ensure the output adheres to it.

     5. Code Standards:
@@ -285,7 +283,7 @@ component_spec:

     4. Code Standards:
       - DO NOT use progress bars (e.g., `tqdm`).
-      - DO NOT use the sample submission file to extract test index information.
+      - **CRITICAL: DO NOT read, load, or access the sample_submission.csv file in the code. Extract column names and format requirements from the '====== Submission Format ======' section of the Competition Information instead.**
      - DO NOT exclude features inadvertently during this process.

     5. NOTES
@@ -306,13 +304,12 @@ component_spec:
       - The evaluation should be based on k-fold cross-validation but only if that's an appropriate evaluation for the task at hand. Store the mean validation score of k-fold cross-validation in `scores.csv` on each model. Refer to the hyperparameter specification for rules to set the CV folds.
       - Even if only one model is present, compute the ensemble score and store it under `"ensemble"`.
      - The index of `scores.csv` should include the model name and the "ensemble" strategy. "ensemble" should be exactly in the index with all lower case letters. Ensemble is the result from several models. If only one model is present, the ensemble score should be the same as the model score.
-      - The column names in `scores.csv` should be:
-        - Model: The name of the model or ensemble strategy.
-        - <metric_name>: The calculated metric value for that model or ensemble strategy. The metric name can be found in the scenario description. The metric name should be exactly the same as the one in the scenario description since user will use it to check the result.
+      - The column names in `scores.csv` should be ["{{ metric_name }}"], where `metric_name` is the name of the metric used for evaluation. Only one column is required.
+      - The column name should be exactly the same as "{{ metric_name }}", since the user will use it to pick the result.
       - Validation metrics should be aligned across all ideas and implementations. Avoid proposing ideas that might affect the validation metrics and modifying the related code.

     9. Submission File:
-      - Save the final predictions as `submission.csv`, ensuring the format matches the competition requirements (refer to `sample_submission` in the Folder Description for the correct structure).
+      - Save the final predictions as `submission.csv`, ensuring the format matches the competition requirements as detailed in the '====== Submission Format ======' section of the Competition Information (DO NOT read the sample_submission.csv file directly in the code).
       - Present the required submission format explicitly and ensure the output adheres to it.

     10. Preferred Packages:
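Reviewer note: to close, a hedged sketch of the kind of check an evaluator could run against the rules above (illustrative only; the real checks live in the eval code touched earlier in this diff, and the metric and submission columns below are placeholders):

```python
import pandas as pd


def check_outputs(metric_name: str, submission_columns: list[str]) -> list[str]:
    """Return a list of problems found in scores.csv / submission.csv (empty list = OK)."""
    problems = []

    scores = pd.read_csv("scores.csv", index_col=0)
    if list(scores.columns) != [metric_name]:
        problems.append(f"scores.csv must have exactly one column named {metric_name!r}")
    if "ensemble" not in scores.index:
        problems.append('scores.csv index must contain the literal string "ensemble"')

    submission = pd.read_csv("submission.csv")
    if list(submission.columns) != submission_columns:
        problems.append(f"submission.csv columns {list(submission.columns)} != {submission_columns}")

    return problems


# Column names here come from the documented submission format, not from sample_submission.csv.
print(check_outputs("AUC", ["id", "target"]))
```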