Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions rdagent/components/coder/data_science/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
)
from rdagent.components.coder.data_science.conf import DSCoderCoSTEERSettings
from rdagent.components.coder.data_science.pipeline.eval import PipelineCoSTEEREvaluator
from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
from rdagent.components.coder.data_science.pipeline.exp import PipelineTask
from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
from rdagent.core.exception import CoderError
from rdagent.core.experiment import FBWorkspace
Expand All @@ -53,7 +53,7 @@
class PipelineMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
def implement_one_task(
self,
target_task: DataLoaderTask,
target_task: PipelineTask,
queried_knowledge: CoSTEERQueriedKnowledge | None = None,
workspace: FBWorkspace | None = None,
prev_task_feedback: CoSTEERSingleFeedback | None = None,
Expand Down Expand Up @@ -86,6 +86,7 @@ def implement_one_task(
queried_former_failed_knowledge=queried_former_failed_knowledge[0],
out_spec=PythonAgentOut.get_spec(),
runtime_environment=runtime_environment,
package_info=target_task.package_info,
enable_model_dump=DS_RD_SETTING.enable_model_dump,
enable_debug_mode=DS_RD_SETTING.sample_data_by_LLM,
)
Expand Down
3 changes: 2 additions & 1 deletion rdagent/components/coder/data_science/pipeline/exp.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@

# Because we use isinstance to distinguish between different types of tasks, we need to use subclasses to represent different types of tasks.
class PipelineTask(CoSTEERTask):
def __init__(self, name: str = "Pipeline", *args, **kwargs) -> None:
def __init__(self, name: str = "Pipeline", *args, package_info: str | None = None, **kwargs) -> None:
    """Create a pipeline task, optionally carrying package information.

    Args:
        name: Task name, forwarded to the base class as ``name=``.
        *args: Extra positional arguments, forwarded unchanged to the base
            class initializer.
        package_info: Optional text describing the packages available to the
            implementation; stored on the instance as ``self.package_info``.
            Keyword-only, so that extra positional arguments keep flowing to
            the base class exactly as they did before this parameter existed
            instead of being captured here by accident.
        **kwargs: Extra keyword arguments, forwarded unchanged to the base
            class initializer.
    """
    super().__init__(name=name, *args, **kwargs)
    self.package_info = package_info
8 changes: 8 additions & 0 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ pipeline_coder:

## The runtime environment your code will be running on
{{ runtime_environment }}

{% if package_info is not none %}
To help you write runnable code, the user has provided package information containing the package names and versions.
Pay close attention to the package versions: the code will be executed in an environment with the specified versions, and their APIs might differ from the latest releases.
The user might also list packages that the environment does not have; you should avoid using any of them.
## Package Information
{{ package_info }}
{% endif %}

## Hyperparameters Specification
Follow the hyperparameter choices if they are specified in the task description, unless they are unreasonable or incorrect.
Expand Down
2 changes: 1 addition & 1 deletion rdagent/scenarios/data_science/dev/runner/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def evaluate(
scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
task_desc=target_task.get_task_information(),
runtime_environment=self.scen.get_runtime_environment(),
runtime_environment=self.scen.runtime_environment,
)
user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
code=implementation.all_codes,
Expand Down
66 changes: 66 additions & 0 deletions rdagent/scenarios/data_science/proposal/exp_gen/package_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import sys
from importlib.metadata import distributions


def get_installed_packages():
    """Map each installed distribution's lower-cased name to its version.

    Returns:
        dict: ``{name.lower(): version}`` for every distribution yielded by
        ``importlib.metadata.distributions()``. If several distributions share
        a lower-cased name, the one yielded last wins.

    Distributions whose metadata has no ``Name`` field (for example a damaged
    ``*.dist-info`` directory) are skipped rather than failing on
    ``None.lower()``.
    """
    installed = {}
    for dist in distributions():
        # ``get`` returns None for an absent field instead of raising.
        name = dist.metadata.get("Name")
        if not name:
            continue
        installed[name.lower()] = dist.version
    return installed


def print_filtered_packages(installed_packages, filtered_packages):
    """Print the requested packages that are installed, with their versions.

    Args:
        installed_packages: Mapping of lower-cased package name to version.
        filtered_packages: Iterable of package names to look up; each name is
            lower-cased for the lookup but printed as given.

    Prints an "Installed Packages" header followed by one ``name==version``
    line per match, or a single "No matching packages found" line when
    nothing matched (a falsy version counts as no match).
    """
    lookups = ((name, installed_packages.get(name.lower())) for name in filtered_packages)
    matches = [(name, found) for name, found in lookups if found]

    if not matches:
        print("=== No matching packages found ===")
        return

    print("=== Installed Packages ===")
    for name, found in matches:
        # Emit the pip-style "package_name==version" form.
        print(f"{name}=={found}")


def get_python_packages():
    """Report which of the requested packages are installed and which are not.

    The package names are taken from the command line, e.g.
    ``python package_info.py pandas torch scikit-learn``. When no arguments
    are given, the built-in default list below is used, which keeps the
    script backward-compatible. Exact duplicate names are reported only once,
    in first-seen order.

    Output (stdout):
        1. The section printed by ``print_filtered_packages``.
        2. If any requested name (lower-cased) is not a key of
           ``get_installed_packages()``, a "Missing Packages" section with
           one name per line.
    """
    default_packages = [
        "transformers",
        "accelerate",
        "torch",
        "tensorflow",
        "pandas",
        "numpy",
        "scikit-learn",
        "scipy",
        "xgboost",
        # NOTE(review): "sklearn" is scikit-learn's import name, not its
        # distribution name; unless a distribution literally named "sklearn"
        # is installed it ends up under "Missing Packages" - confirm intended.
        "sklearn",
        "lightgbm",
        "vtk",
        "opencv-python",
        "keras",
        "matplotlib",
        "pydicom",
    ]
    # NOTE(review): a reviewer suggested always reporting the union of the
    # default list and the caller-supplied names (``list(set(A) | set(B))``).
    # Not done here, to keep the existing "arguments replace defaults" behavior.
    requested = sys.argv[1:] or default_packages
    # dict.fromkeys drops exact duplicates while preserving order, so a name
    # passed twice is not printed twice.
    filtered_packages = list(dict.fromkeys(requested))

    installed_packages = get_installed_packages()

    print_filtered_packages(installed_packages, filtered_packages)

    # Report packages that were requested (e.g. by the LLM) but are not installed.
    missing_pkgs = [pkg for pkg in filtered_packages if pkg.lower() not in installed_packages]
    if missing_pkgs:
        print("\n=== Missing Packages (Avoid using these packages) ===")
        for pkg in missing_pkgs:
            print(pkg)


if __name__ == "__main__":
get_python_packages()
Original file line number Diff line number Diff line change
Expand Up @@ -345,4 +345,5 @@ output_format:
The output should follow JSON format. The schema is as follows:
{
"description": "A detailed, step-by-step implementation guide for `main.py` that synthesizes planned modifications and code structure into a comprehensive coding plan. Must be formatted in Markdown with level-3 headings (###) organizing logical sections, key decision points, and implementation steps. Should provide sufficient detail covering implementation flow, algorithms, data handling, and key logic points for unambiguous developer execution.",
"packages": ["package1", "package2", ...] # Optional, list of packages needed for the task. If no packages are needed, leave it empty.
}
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,11 @@ task_gen:
- For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
- For neural networks, prefer fine-tuning pre-trained models over training from scratch.

## Package Declaration
At the end of your design, **you MUST** provide a key `packages` in the final JSON output.
It should be an **array of PyPI package names** (strings) that you expect to `import` in the forthcoming implementation.
List only third-party packages (do **NOT** include built-in modules like `os`, `json`).

# Guidelines for Sketching the `main.py` Workflow

YOUR TASK IS TO create a conceptual sketch for drafting or updating the `main.py` workflow. This is a plan, not code.
Expand Down
15 changes: 15 additions & 0 deletions rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
DSDraftExpGen, # TODO: DSDraftExpGen should be moved to router in the further
)
from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSIdea
from rdagent.scenarios.data_science.proposal.exp_gen.utils import get_packages
from rdagent.utils.agent.tpl import T
from rdagent.utils.repo.diff import generate_diff_from_dict
from rdagent.utils.workflow import wait_retry
Expand Down Expand Up @@ -274,6 +275,11 @@ class CodingSketch(BaseModel):
"The content **must** be formatted using Markdown, with logical sections, key decision points, or implementation steps clearly organized by level-3 headings (i.e., `###`). "
"This field should provide sufficient detail for a developer to understand the implementation flow, algorithms, data handling, and key logic points without ambiguity."
)
packages: List[str] = Field(
default=None,
description="A list of third-party package names (PyPI) that the planned implementation will import. "
"Used to query the runtime environment dynamically. Leave `null` or omit if not applicable.",
)


def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraftExpGen:
Expand Down Expand Up @@ -775,6 +781,15 @@ def task_gen(
name=task_name,
description=task_desc,
)

assert isinstance(task, PipelineTask), f"Task {task_name} is not a PipelineTask, got {type(task)}"
# Only for LLMs with response-schema support. (TODO: support non-schema LLMs?)
# If the LLM provides a "packages" field (list[str]), compute runtime environment now and cache it for subsequent prompts in later loops.
if isinstance(task_dict, dict) and "packages" in task_dict and isinstance(task_dict["packages"], list):
pkgs: list[str] = [str(p) for p in task_dict["packages"]]
# Persist for later stages
task.package_info = get_packages(pkgs)

exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=hypotheses[0])
if sota_exp is not None:
exp.experiment_workspace.inject_code_from_file_dict(sota_exp.experiment_workspace)
Expand Down
20 changes: 20 additions & 0 deletions rdagent/scenarios/data_science/proposal/exp_gen/utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from pydantic import BaseModel, Field

from rdagent.components.coder.data_science.conf import get_ds_env
from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
from rdagent.components.coder.data_science.feature.exp import FeatureTask
from rdagent.components.coder.data_science.model.exp import ModelTask
from rdagent.components.coder.data_science.pipeline.exp import PipelineTask
from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
from rdagent.components.coder.data_science.workflow.exp import WorkflowTask
from rdagent.core.experiment import FBWorkspace
from rdagent.utils.agent.tpl import T

_COMPONENT_META: Dict[str, Dict[str, Any]] = {
Expand Down Expand Up @@ -86,3 +89,20 @@ class CodingSketch(BaseModel):
"The content **must** be formatted using Markdown, with logical sections, key decision points, or implementation steps clearly organized by level-3 headings (i.e., `###`). "
"This field should provide sufficient detail for a developer to understand the implementation flow, algorithms, data handling, and key logic points without ambiguity."
)


def get_packages(self, pkgs: list[str] | None = None) -> str:
    """Run ``package_info.py`` in the data-science environment and return its output.

    Args:
        self: Historical first parameter of this module-level function. The
            caller in ``proposal.py`` invokes it as ``get_packages(pkgs)``, so
            the package list actually arrives in this position; a list or
            tuple passed here is therefore used as ``pkgs``. Any other object
            is only consulted for an optional ``required_packages`` attribute.
        pkgs: Package names to query. When no names are available the script
            is run without arguments and falls back to its own default list.

    Returns:
        The value returned by ``FBWorkspace.execute`` for the script run.
    """
    # TODO: add it into base class. Environment (i.e. `DSDockerConf`) should be part of the scenario class.
    import shlex  # function-scope import: only this helper needs it

    if pkgs is None:
        if isinstance(self, (list, tuple)):
            # Called as ``get_packages(pkgs)``: the list was bound to ``self``.
            pkgs = list(self)
        elif hasattr(self, "required_packages"):
            # Reuse package list cached during Draft stage when available.
            pkgs = getattr(self, "required_packages")  # type: ignore[arg-type]

    env = get_ds_env()
    implementation = FBWorkspace()
    fname = "package_info.py"
    script_path = Path(__file__).absolute().resolve().parent / fname
    implementation.inject_files(**{fname: script_path.read_text()})

    # Package names originate from LLM output. Quote each one so characters
    # such as ">", ";" or spaces (e.g. "torch>=2.0") cannot be interpreted as
    # shell syntax - presumably ``entry`` is run through a shell; TODO confirm.
    # Blank names are dropped so they do not become empty arguments.
    cleaned = (str(pkg).strip() for pkg in (pkgs or ()))
    quoted = [shlex.quote(name) for name in cleaned if name]
    entry = " ".join(["python", fname, *quoted])
    return implementation.execute(env=env, entry=entry)
1 change: 1 addition & 0 deletions rdagent/scenarios/data_science/scen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def get_scenario_all_desc(self, eda_output=None) -> str:

def get_runtime_environment(self) -> str:
# TODO: add it into base class. Environment (i.e. `DSDockerConf`) should be part of the scenario class.
"""Return runtime environment information."""
env = get_ds_env()
implementation = FBWorkspace()
fname = "runtime_info.py"
Expand Down
31 changes: 0 additions & 31 deletions rdagent/scenarios/data_science/scen/runtime_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,6 @@ def print_runtime_info():
print(f"Python {sys.version} on {platform.system()} {platform.release()}")


def get_installed_packages():
    """Map each installed distribution's lower-cased name to its version.

    Returns:
        dict: ``{name.lower(): version}`` for every distribution yielded by
        ``distributions()``.

    Distributions whose metadata has no ``Name`` field are skipped rather
    than failing on ``None.lower()``.
    """
    installed = {}
    for dist in distributions():
        # ``get`` returns None for an absent field instead of raising.
        name = dist.metadata.get("Name")
        if not name:
            continue
        installed[name.lower()] = dist.version
    return installed


def print_filtered_packages(installed_packages, filtered_packages):
    """Print ``name==version`` for every requested package that is installed.

    Args:
        installed_packages: Mapping of lower-cased package name to version.
        filtered_packages: Iterable of package names to look up; each name is
            lower-cased for the lookup but printed as given.

    Names with no (or a falsy) version in ``installed_packages`` print nothing.
    """
    for requested in filtered_packages:
        found_version = installed_packages.get(requested.lower())
        if not found_version:
            continue
        print(f"{requested}=={found_version}")


def get_gpu_info():
try:
# Option 1: Use PyTorch
Expand Down Expand Up @@ -53,24 +42,4 @@ def get_gpu_info():

if __name__ == "__main__":
print_runtime_info()
filtered_packages = [
"transformers",
"accelerate",
"torch",
"tensorflow",
"pandas",
"numpy",
"scikit-learn",
"scipy",
"xgboost",
"sklearn",
"lightgbm",
"vtk",
"opencv-python",
"keras",
"matplotlib",
"pydicom",
]
installed_packages = get_installed_packages()
print_filtered_packages(installed_packages, filtered_packages)
get_gpu_info()