Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions rdagent/components/coder/data_science/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@ class Config:
def get_ds_env(
conf_type: Literal["kaggle", "mlebench"] = "kaggle",
extra_volumes: dict = {},
running_timeout_period: int = (
DS_RD_SETTING.debug_timeout if not DS_RD_SETTING.sample_data_by_LLM else DS_RD_SETTING.full_timeout
),
running_timeout_period: int = DS_RD_SETTING.debug_timeout,
) -> Env:
"""
Retrieve the appropriate environment configuration based on the env_type setting.
Expand Down
11 changes: 1 addition & 10 deletions rdagent/components/coder/data_science/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,7 @@
- Each coder could be tested.
"""

import json
import re
from pathlib import Path
from typing import Dict

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.components.coder.CoSTEER import CoSTEER
Expand All @@ -39,14 +36,8 @@
from rdagent.components.coder.CoSTEER.knowledge_management import (
CoSTEERQueriedKnowledge,
)
from rdagent.components.coder.data_science.conf import (
DSCoderCoSTEERSettings,
get_ds_env,
)
from rdagent.components.coder.data_science.conf import DSCoderCoSTEERSettings
from rdagent.components.coder.data_science.pipeline.eval import PipelineCoSTEEREvaluator
from rdagent.components.coder.data_science.raw_data_loader.eval import (
DataLoaderCoSTEEREvaluator,
)
from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
from rdagent.core.exception import CoderError
Expand Down
5 changes: 3 additions & 2 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,10 @@ pipeline_coder:
```bash
python main.py --debug
```
In debug mode, you should only sample ten percent of the data and run the minimum epochs to quickly test the correctness of the code.
In debug mode, you should only sample ten percent of the training data and run the minimum epochs to quickly test the correctness of the code.
In debug mode, you should implement a timer to measure the time taken for your debug configuration and estimate the time required for the full run.
For example, you can sample ten percent of the data and run for one epoch, then the full run with ten epochs will take one hundred times the time taken for the debug run. The scale is calculated by yourself depending on the data sampling and epoch number you choose. If your full run enables early stopping, the scale should be smaller considering the early stopping will stop the training earlier than the full epochs.
For example, if you sample ten percent of the training data and run for one epoch, then the full run with ten epochs will take about one hundred times the time taken for the debug run. You should calculate the scale yourself, depending on the data sampling and epoch number you choose. If your full run enables early stopping, the scale should be smaller, because early stopping will end the training before the full number of epochs.
You should sample the data after the train/validation split. If you split the data after sampling, you might get a class with only one sample, which might cause the split strategy to fail.
Your debug code should run exactly the same as the full run, except for the data sampling and epoch number, to ensure the correctness of the code.
You should print total time and estimated time in standard output using print function in the following schema:
=== Start of Debug Information ===
Expand Down
5 changes: 0 additions & 5 deletions rdagent/scenarios/data_science/dev/runner/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
from pathlib import Path
from typing import Dict

import pandas as pd

from rdagent.app.data_science.conf import DS_RD_SETTING
Expand All @@ -17,7 +14,6 @@
MultiProcessEvolvingStrategy,
)
from rdagent.components.coder.CoSTEER.task import CoSTEERTask
from rdagent.components.coder.data_science.conf import get_ds_env
from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
from rdagent.core.exception import RunnerError
from rdagent.core.scenario import Scenario
Expand All @@ -26,7 +22,6 @@
from rdagent.scenarios.data_science.dev.runner.eval import DSCoSTEERCoSTEEREvaluator
from rdagent.utils.agent.ret import PythonBatchEditOut
from rdagent.utils.agent.tpl import T
from rdagent.utils.env import DockerEnv, MLEBDockerConf


class DSRunnerMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
Expand Down
Loading