fix: refine prompts and add additional package info (#1179)

Hoder-zyf · you-n-g · web-flow · commit 22428a45053b · 2025-08-13T23:00:23.000+08:00
* refine prompts and add additional package info

* refine prompts to be specific for GBDT models

* minor refine prompts

* use include to replace duplicate info

* refine prompts

* refactor: import DSTrace from base and remove exp_gen __init__

* lint

---------

Co-authored-by: Young <afe.young@gmail.com>
diff --git a/rdagent/components/coder/data_science/pipeline/prompts.yaml b/rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -13,9 +13,9 @@ pipeline_coder:
     {{ runtime_environment }}
 
     {% if package_info is not none %}
-    To help you write the runnable code, the user has provided the package information which contains the package names and versions.
-    You should be careful about the package versions, as the code will be executed in the environment with the specified version and the api might be different from the latest version.
-    The user might provide the packages the environment doesn't have, you should avoid using any of them.
+    - To help you write the runnable code, the user has provided the package information which contains the package names and versions.
+    - You should be careful about the package versions, as the code will be executed in the environment with the specified version and the api might be different from the latest version.
+    - While the environment is fixed, you should not limit yourself to only the provided packages - feel free to explore other libraries that might better suit the task. However, prioritize using the available packages first, and only suggest alternatives when they would provide significant improvements or are more appropriate for the specific problem. 
     ## Package Information
     {{ package_info }}
     {% endif %}
diff --git a/rdagent/scenarios/data_science/dev/feedback.py b/rdagent/scenarios/data_science/dev/feedback.py
@@ -13,7 +13,7 @@
 from rdagent.log.utils import dict_get_with_warning
 from rdagent.oai.llm_utils import APIBackend
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
-from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace
+from rdagent.scenarios.data_science.proposal.exp_gen.base import DSTrace
 from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSIdea
 from rdagent.utils import convert2bool
 from rdagent.utils.agent.tpl import T
diff --git a/rdagent/scenarios/data_science/loop.py b/rdagent/scenarios/data_science/loop.py
@@ -30,8 +30,10 @@
 from rdagent.scenarios.data_science.dev.feedback import DSExperiment2Feedback
 from rdagent.scenarios.data_science.dev.runner import DSCoSTEERRunner
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
-from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace
-from rdagent.scenarios.data_science.proposal.exp_gen.base import DataScienceScen
+from rdagent.scenarios.data_science.proposal.exp_gen.base import (
+    DataScienceScen,
+    DSTrace,
+)
 from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSKnowledgeBase
 from rdagent.scenarios.data_science.proposal.exp_gen.proposal import DSProposalV2ExpGen
 from rdagent.utils.workflow.misc import wait_retry
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/__init__.py b/rdagent/scenarios/data_science/proposal/exp_gen/__init__.py
@@ -1,3 +0,0 @@
-from rdagent.scenarios.data_science.proposal.exp_gen.base import DSTrace
-
-__all__ = ["DSTrace"]
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/package_info.py b/rdagent/scenarios/data_science/proposal/exp_gen/package_info.py
@@ -1,6 +1,71 @@
 import sys
 from importlib.metadata import distributions
 
+# Kaggle competition packages - based on usage frequency
+PYTHON_BASE_PACKAGES = ["catboost", "lightgbm", "numpy", "optuna", "pandas", "scikit-learn", "scipy", "shap", "xgboost"]
+
+PYTHON_ADVANCED_PACKAGES = [
+    "accelerate",
+    "albumentations",
+    "bayesian-optimization",
+    "category_encoders",
+    "datasets",
+    "featuretools",
+    "imbalanced-learn",
+    "nltk",
+    "opencv-python",
+    "pillow",
+    "polars",
+    "sentence-transformers",
+    "spacy",
+    "tensorflow",
+    "timm",
+    "tokenizers",
+    "torch",
+    "torchvision",
+    "transformers",
+]
+
+
+def get_all_excepted_packages():
+    """Get flattened list of all packages"""
+    all_packages = PYTHON_BASE_PACKAGES + PYTHON_ADVANCED_PACKAGES
+    return sorted(set(all_packages))
+
+
+def get_available_recommended_packages_prompt():
+    """Generate prompt template for dynamically detected available packages"""
+    installed_packages = get_installed_packages()
+
+    # Check which packages are actually installed
+    base_available = [pkg for pkg in PYTHON_BASE_PACKAGES if pkg.lower() in installed_packages]
+    advanced_available = [pkg for pkg in PYTHON_ADVANCED_PACKAGES if pkg.lower() in installed_packages]
+
+    # Build prompt
+    prompt_parts = ["# Available packages in environment:\n"]
+
+    if base_available:
+        prompt_parts.append("## [Basic Libraries] (general tools for data science tasks):")
+        prompt_parts.append(f"- {', '.join(base_available)}")
+        prompt_parts.append("")
+
+    if advanced_available:
+        prompt_parts.append("## [Advanced Tools] (specialized for specific domains):")
+        prompt_parts.append(f"- {', '.join(advanced_available)}")
+        prompt_parts.append("")
+
+    prompt_parts.append(
+        "You should choose appropriate tool combinations based on the specific context and current situation. Feel free to use any other packages you think are necessary to achieve the best performance."
+    )
+
+    return "\n".join(prompt_parts).strip()
+
+
+def print_available_packages_prompt():
+    """Print the available packages prompt to stdout for external consumption"""
+    prompt = get_available_recommended_packages_prompt()
+    print(prompt)
+
 
 def get_installed_packages():
     return {dist.metadata["Name"].lower(): dist.version for dist in distributions()}
@@ -26,24 +91,7 @@ def get_python_packages():
     # Example: `python package_info.py pandas torch scikit-learn`
     # If no extra arguments are provided we fall back to the original default list
     # to keep full backward-compatibility.
-    packages_list = [  # default packages
-        "transformers",
-        "accelerate",
-        "torch",
-        "tensorflow",
-        "pandas",
-        "numpy",
-        "scikit-learn",
-        "scipy",
-        "xgboost",
-        "sklearn",
-        "lightgbm",
-        "vtk",
-        "opencv-python",
-        "keras",
-        "matplotlib",
-        "pydicom",
-    ]
+    packages_list = get_all_excepted_packages()
     if len(sys.argv) > 1:
         packages_list = list(set(packages_list) | set(sys.argv[1:]))
 
@@ -61,4 +109,8 @@ def get_python_packages():
 
 
 if __name__ == "__main__":
-    get_python_packages()
+    # Check if we should print available packages prompt
+    if len(sys.argv) > 1 and sys.argv[1] == "--packages-prompt":
+        print_available_packages_prompt()
+    else:
+        get_python_packages()
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -250,6 +250,7 @@ hypothesis_critique:
     - **Metric Impact**: Will this meaningfully improve the competition's evaluation metric?
     - **Historical Context**: Has similar approaches been tried? Key learnings from past attempts?
     - **Innovation vs History Balance**: Distinguish between implementation failures (worth retrying with improvements) vs fundamental approach failures (multiple attempts failed due to core unsuitability - should avoid)
+    - **Tool Selection Appropriateness**: Are the suggested tools/packages well-suited for the problem? Consider both modern capabilities and traditional reliability
     
     ### 3. Improvement Direction
     - **Clarity Issues**: If vague, identify specific methods or strategies that address the core problem
@@ -268,11 +269,13 @@ hypothesis_critique:
     **Good Critiques:**
     - "The hypothesis lacks specificity about which ensemble method to use. Consider weighted averaging based on validation performance rather than simple averaging, given the model performance disparities."
     - "This hypothesis proposes LSTM for tabular data. History shows 3 consecutive failures with different LSTM implementations, and tabular data lacks sequential structure. Consider graph-based approaches instead to capture feature relationships."
+    - "The hypothesis jumps to LightGBM without establishing a baseline. Consider starting with XGBoost to ensure a working solution, then explore LightGBM for potential improvements if the baseline performs adequately."
     
     **Poor Critiques:**
     - "Set max_depth=10, learning_rate=0.05, and use 500 trees." (too specific)
     - "This might not work." (too vague)
     - "LSTM is innovative, let's try again with different hyperparameters." (ignores fundamental mismatch)
+    - "Use the latest deep learning model because it's new." (ignores problem-solution fit)
     
     {% if critique_output_format is not none %}
     ## Output Format
@@ -320,6 +323,12 @@ hypothesis_rewrite:
     {% endif %}
     
     ## Guidelines for Writing Rewritten Hypotheses
+
+    ### Available Tools Consideration
+    - When rewriting, consider if the hypothesis leverages appropriate tools from the available packages
+    - Balance innovation with practical tool selection - prefer modern packages when they offer clear advantages
+    - Ensure tool choices align with the problem requirements and constraints
+    - Be pragmatic: use whatever works best for the task - whether it's a cutting-edge transformer or traditional logistic regression
     
     1. **Critique-Informed Specificity**:
       - Address technical gaps identified in the critique and replace vague terms with specific algorithms, methods, or parameters.
@@ -379,6 +388,10 @@ hypothesis_rewrite:
     {{ time_status }}
     {% endif %}
 
+    {% if packages_prompt is not none %}
+    {{ packages_prompt }}
+    {% endif %}
+
 
 task_gen:
   system: |-
@@ -429,12 +442,8 @@ task_gen:
       - Ensure validation metrics and processes are consistent across all parts of the pipeline. Avoid changes that would alter how validation metrics are calculated unless that is part of the hypothesis.
     8. **Submission File (`submission.csv`)**: Generate `submission.csv` in the **exact format** required (column names, order, data types), as detailed in the '====== Submission Format ======' section of the Competition Scenario Description (DO NOT read the sample_submission.csv file directly in the code). This is a critical step.
     9. **Preferred Packages Notes**:
-      - You can choose the most proper packages for the task to best achieve the hypothesis.
-      - When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding. Especially those you are not familiar with.
-      - For GBDT models, prefer XGBoost or RandomForest over LightGBM unless the SOTA or hypothesis dictates otherwise. Prefer not using GPU for GBDT models unless the SOTA or hypothesis dictates otherwise.
-      - For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
-      - For neural networks, prefer fine-tuning pre-trained models over training from scratch.
-
+      {% include "scenarios.data_science.share:guidelines.package_selection" %}
+    
     ## Package Declaration
     At the end of your design, **you MUST** provide a key `packages` in the final JSON output.  
     It should be an **array of PyPI package names** (strings) that you expect to `import` in the forthcoming implementation.  
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -27,7 +27,10 @@
     DSExperimentPlan,
     RD_Agent_TIMER_wrapper,
 )
-from rdagent.scenarios.data_science.proposal.exp_gen.utils import get_packages
+from rdagent.scenarios.data_science.proposal.exp_gen.utils import (
+    get_available_packages_prompt,
+    get_packages,
+)
 from rdagent.utils.agent.tpl import T
 from rdagent.utils.repo.diff import generate_diff_from_dict
 from rdagent.utils.workflow import wait_retry
@@ -588,16 +591,22 @@ def hypothesis_gen(
         enable_idea_pool: bool,
         inject_diverse: bool = False,
         exp_gen_plan: Optional[Dict] = None,
+        packages_prompt: str = "",
     ) -> Dict:
         problem_formatted_str = ""
         for i, (problem_name, problem_dict) in enumerate(problems.items()):
             problem_formatted_str += f"## {i+1}. {problem_name}\n"
-            problem_formatted_str += f"{problem_dict['problem']}\n"
+            problem_formatted_str += f"Statement: {problem_dict['problem']}\n"
+            problem_formatted_str += f"Reason: {problem_dict['reason']}\n"
             if "idea" in problem_dict:
                 idea_formatted_str = DSIdea(problem_dict["idea"]).to_formatted_str()
                 problem_formatted_str += f"Sampled Idea by user: \n{idea_formatted_str}\n"
             problem_formatted_str += "\n\n"
 
+        # add available packages prompt
+        if packages_prompt:
+            problem_formatted_str += f"\n{packages_prompt}\n"
+
         sys_prompt = T(".prompts_v2:hypothesis_gen.system").r(
             hypothesis_output_format=(
                 T(".prompts_v2:output_format.hypothesis").r(pipeline=pipeline, enable_idea_pool=enable_idea_pool)
@@ -731,6 +740,7 @@ def hypothesis_rewrite(
         scenario_desc: str,
         sota_exp_desc: str,
         exp_feedback_list_desc: str,
+        packages_prompt: str = "",
     ) -> Dict:
         """
         Generate improved hypotheses based on critique feedback for each original hypothesis.
@@ -769,6 +779,7 @@ def hypothesis_rewrite(
             sota_exp_desc=sota_exp_desc,
             hypothesis_critique_pairs=hypothesis_critique_pairs,
             time_status=time_status,
+            packages_prompt=packages_prompt,
         )
 
         response = APIBackend().build_messages_and_create_chat_completion(
@@ -1056,6 +1067,9 @@ def gen(
         else:
             inject_diverse = False
 
+        # add available packages prompt
+        packages_prompt = get_available_packages_prompt()
+
         # Step 1: Identify problems
         all_problems = self.identify_problem(
             current_sub_trace=trace.get_parent_exps(),
@@ -1087,6 +1101,7 @@ def gen(
             enable_idea_pool=DS_RD_SETTING.enable_knowledge_base,
             inject_diverse=inject_diverse,
             exp_gen_plan=plan.get("exp_gen") if plan else None,
+            packages_prompt=packages_prompt,
         )
         if not pipeline:
             sota_exp_model_file_count = len(
@@ -1130,6 +1145,7 @@ def gen(
                     scenario_desc=scenario_desc,
                     sota_exp_desc=sota_exp_desc,
                     exp_feedback_list_desc=exp_feedback_list_desc,
+                    packages_prompt=packages_prompt,
                 )
                 logger.info(f"Successfully completed hypothesis critique and rewrite process")
             except Exception as e:
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/utils.py b/rdagent/scenarios/data_science/proposal/exp_gen/utils.py
@@ -103,3 +103,16 @@ def get_packages(pkgs: list[str] | None = None) -> str:
     pkg_args = " ".join(pkgs) if pkgs else ""
     stdout = implementation.execute(env=env, entry=f"python {fname} {pkg_args}")
     return stdout
+
+
+def get_available_packages_prompt() -> str:
+    """Generate prompt template for dynamically detected available packages."""
+    # Use the same approach as get_packages but call the packages prompt functionality
+
+    env = get_ds_env()
+    implementation = FBWorkspace()
+    fname = "package_info.py"
+    implementation.inject_files(**{fname: (Path(__file__).absolute().resolve().parent / "package_info.py").read_text()})
+
+    stdout = implementation.execute(env=env, entry=f"python {fname} --packages-prompt")
+    return stdout.strip()
diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py
@@ -10,6 +10,9 @@
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import APIBackend
 from rdagent.scenarios.data_science.debug.data import create_debug_data
+from rdagent.scenarios.data_science.proposal.exp_gen.utils import (
+    get_available_packages_prompt,
+)
 from rdagent.scenarios.data_science.scen.utils import describe_data_folder_v2
 from rdagent.scenarios.kaggle.kaggle_crawler import (
     crawl_descriptions,
@@ -209,6 +212,7 @@ def get_scenario_all_desc(self, eda_output=None) -> str:
                 f"{self.recommend_debug_timeout() / 60 : .2f} minutes" if DS_RD_SETTING.sample_data_by_LLM else None
             ),
             runtime_environment=self.get_runtime_environment(),
+            available_packages_prompt=get_available_packages_prompt(),
         )
 
     def get_runtime_environment(self) -> str:
diff --git a/rdagent/scenarios/data_science/scen/prompts.yaml b/rdagent/scenarios/data_science/scen/prompts.yaml
@@ -54,6 +54,11 @@ scenario_description: |-
   {{ runtime_environment }}
   {% endif %}
 
+  {% if available_packages_prompt is not none %}
+  ====== Available Packages ======
+  {{ available_packages_prompt }}
+  {% endif %}
+
 competition_description_template:
   system: |-
     You are a data science assistant that extracts structured information from unstructured text.
diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml
@@ -349,17 +349,19 @@ component_spec:
       - Present the required submission format explicitly and ensure the output adheres to it.
     
     10. Preferred Packages:
-      - You can choose the most proper packages to achieve the task.
-      - When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding. Especially those you are not familiar with.
-      - For GBDT models, prefer XGBoost or RandomForest over LightGBM unless the SOTA or hypothesis dictates otherwise.
-      - To use GPU in training, always implement a check to ensure that the GPU is available and use it if possible. Fallback to CPU if GPU is not available. Especially in GBDT models, you might get error when you call `fit` method without checking the GPU availability. Add a try except block to handle this case.
-      - For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
-      - For neural networks, prefer fine-tuning pre-trained models over training from scratch.
+      {% include "scenarios.data_science.share:guidelines.package_selection" %}
 
 guidelines:
   coding: |-
     You might receive exploratory data analysis (EDA) details about the source data. **Do not use this EDA information to create assertions, hard-coded values, or raise errors.** We might generate sample data for quick coding (so your code may run on sample data which is part of the full-size data), but remember that the EDA details are based on the full-size data.
-
+  package_selection: |-
+    - The `Available Packages` section in the Competition Scenario Description includes general and specific recommendations. Choose packages that best support the hypothesis and constraints; you may deviate with clear justification grounded in data, code reuse, and efficiency.
+    - Do not select packages solely because you are familiar with them.
+    - When facing a choice between two packages which both can achieve the same goal, you should choose the one which is more commonly used and less likely to cause bugs in coding.
+    - For GBDT, prefer CPU. Enable GPU for GBDT if profiling on this environment shows clear, reproducible speedups without stability regressions; document the versions and settings used.
+    - For deep learning frameworks, align with the current codebase and available pretrained assets. If unconstrained, prefer PyTorch or PyTorch‑based libraries given ecosystem and template support; prioritize consistency and reuse over brand preferences.
+    - For neural networks, favor fine-tuning well‑validated pretrained models over training from scratch when applicable to the task.
+    
 spec:
   hyperparameter: |-
     1. Hyperparameters Requiring Tuning (e.g., learning rate, weight decay, optimizer, etc.)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,3 +0,0 @@` |
| `1` | | `-from rdagent.scenarios.data_science.proposal.exp_gen.base import DSTrace` |
| `2` | | `-` |
| `3` | | `-__all__ = ["DSTrace"]` |