1 change: 1 addition & 0 deletions rdagent/app/data_science/conf.py
@@ -196,6 +196,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):

user_interaction_wait_seconds: int = 6000 # seconds to wait for user interaction
user_interaction_mid_folder: Path = Path.cwd() / "git_ignore_folder" / "RD-Agent_user_interaction"
    review_model: str | None = None  # model name used for LLM-based hypothesis review; None disables the review step


DS_RD_SETTING = DataScienceBasePropSetting()
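The new `review_model` setting gates the hypothesis-review step added in `proposal.py` below. A minimal consumption sketch; the environment-variable name is an assumption based on how sibling settings are typically loaded, not something this diff shows:

```python
# Sketch: gate the optional review step on the new setting.
# Assumption: like other DataScienceBasePropSetting fields, this can be set via
# an environment variable (plausibly DS_REVIEW_MODEL); the prefix is not shown here.
from rdagent.app.data_science.conf import DS_RD_SETTING

if DS_RD_SETTING.review_model is not None:
    print(f"hypothesis review enabled with model: {DS_RD_SETTING.review_model}")
else:
    print("hypothesis review disabled")
```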
5 changes: 4 additions & 1 deletion rdagent/oai/backend/base.py
@@ -331,6 +331,7 @@ def _build_messages(
user_prompt: str,
system_prompt: str | None = None,
former_messages: list[dict[str, Any]] | None = None,
system_prompt_role: str | None = None,
*,
shrink_multiple_break: bool = False,
) -> list[dict[str, Any]]:
@@ -350,7 +351,7 @@ def _build_messages(
system_prompt = LLM_SETTINGS.default_system_prompt if system_prompt is None else system_prompt
messages = [
{
"role": LLM_SETTINGS.system_prompt_role,
"role": system_prompt_role or LLM_SETTINGS.system_prompt_role,
"content": system_prompt,
},
]
@@ -381,6 +382,7 @@ def build_messages_and_create_chat_completion( # type: ignore[no-untyped-def]
former_messages: list | None = None,
chat_cache_prefix: str = "",
shrink_multiple_break: bool = False,
system_prompt_role: str | None = None,
*args,
**kwargs,
) -> str:
@@ -407,6 +409,7 @@ def build_messages_and_create_chat_completion( # type: ignore[no-untyped-def]
user_prompt,
system_prompt,
former_messages,
system_prompt_role,
shrink_multiple_break=shrink_multiple_break,
)

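The threaded-through `system_prompt_role` lets a single call override `LLM_SETTINGS.system_prompt_role`, e.g. for backends that reject a `system` message. A hedged call sketch; the prompt strings are illustrative:

```python
from rdagent.oai.llm_utils import APIBackend

# Per-call role override; with system_prompt_role=None the message builder falls
# back to LLM_SETTINGS.system_prompt_role, so existing callers are unaffected.
answer = APIBackend().build_messages_and_create_chat_completion(
    user_prompt="Summarize the experiment results in two sentences.",  # illustrative
    system_prompt="You are a concise reviewer.",  # illustrative
    system_prompt_role="user",  # for models that only accept user/assistant roles
)
```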
7 changes: 6 additions & 1 deletion rdagent/oai/backend/litellm.py
@@ -150,7 +150,12 @@ def _create_chat_completion_inner_function( # type: ignore[no-untyped-def] # no
logger.info(self._build_log_messages(messages), tag="llm_messages")

complete_kwargs = self.get_complete_kwargs()
model = complete_kwargs["model"]
        if kwargs.get("model"):
            # Per-call override: swap in the requested model and clear reasoning_effort,
            # which the substitute model may not support.
            complete_kwargs["model"] = kwargs.pop("model")
            complete_kwargs["reasoning_effort"] = None
        model = complete_kwargs.get("model")
        logger.info(f"complete_kwargs: {complete_kwargs}")
        logger.info(f"kwargs: {kwargs}")

response = completion(
messages=messages,
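With this change, a `model` kwarg reaching `_create_chat_completion_inner_function` replaces the configured model for that one call and clears `reasoning_effort`, which the substitute model may not support. A usage sketch, assuming the remaining kwargs (`api_base`, `api_key`) are forwarded on to `litellm.completion` as the `kwargs.pop` pattern implies; the endpoint values are the ones used later in this PR:

```python
from rdagent.oai.llm_utils import APIBackend

# One-off call routed to a local vLLM endpoint instead of the configured model.
response = APIBackend().build_messages_and_create_chat_completion(
    user_prompt="Will this change improve the score? Answer yes or no.",  # illustrative
    model="hosted_vllm/qwen3-8b",
    api_base="http://127.0.0.1:8091/v1",
    api_key="sk-vllm",
)
```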
36 changes: 34 additions & 2 deletions rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -639,7 +639,6 @@ hypothesis_select:
{% else %}
    Please respond in JSON format.
{% endif %}


user: |-
# Scenario Description
@@ -651,6 +650,40 @@ hypothesis_select:
# Current SOTA Implementation
{{ sota_exp_desc }}

simulate_task:
system: |-
    You are given base code for a Kaggle competition and a hypothesis (a proposed modification to this code intended to improve its score).
    Please summarize this code, retain the key parts, and mark the parts that the hypothesis will modify. If no base code is provided, generate a pseudo-code outline based on the hypothesis that shows the main structure and key components needed to implement it.
    Respond in markdown format, and return only the markdown text without any other content.
user: |-
## Hypothesis
{{ hypothesis }}

## Base Code
```python
{{ base_code }}
```

predict_feedback:
system:
qwen: |-
Given a Kaggle competition scenario and a hypothesis, you are provided with a task derived from the hypothesis that specifies how the base code would be modified (the actual base code is not provided).
Predict whether, after modifying the base code according to this task, the code's performance will improve ("yes") or worsen ("no"). Base your judgment solely on the scenario, hypothesis, and task.
Answer with only "yes" or "no".
base: |-
Given a Kaggle competition scenario and a hypothesis, you are provided with a task derived from the hypothesis that specifies how the base code would be modified (the actual base code is not provided).
Predict whether, after modifying the base code according to this task, the code's performance will improve or worsen. Base your judgment solely on the scenario, hypothesis, and task.
Provide your judgment and explanation.
user: |-
## Scenario
{{ scenario }}

## Hypothesis
{{ hypothesis }}

## Task
{{ rewrite_task }}
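These two templates are consumed in `proposal.py` below through the `T` helper. A render-and-call sketch; note the leading `.` in the template path resolves relative to the calling module, as in `proposal.py`, and the hypothesis/code values here are illustrative:

```python
from rdagent.oai.llm_utils import APIBackend
from rdagent.utils.agent.tpl import T

# Render simulate_task to turn base code + hypothesis into a rewrite task;
# predict_feedback is then asked to judge that task (see hypothesis_review below).
rewrite_task = APIBackend().build_messages_and_create_chat_completion(
    system_prompt=T(".prompts_v2:simulate_task.system").r(),
    user_prompt=T(".prompts_v2:simulate_task.user").r(
        hypothesis="Add target encoding for high-cardinality categorical features.",  # illustrative
        base_code="...",  # the SOTA main.py source would go here
    ),
)
```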


task_gen:
system: |-
@@ -969,4 +1002,3 @@ output_format:
"hypothesis": "...",
"component": "..." // Must be one of: 'DataLoadSpec', 'FeatureEng', 'Model', 'Workflow', 'Ensemble'
}

90 changes: 90 additions & 0 deletions rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1,4 +1,5 @@
import json
import re
import math
from datetime import timedelta
from enum import Enum
@@ -303,6 +304,19 @@ class CodingSketch(BaseModel):
)


class HypothesisReview(BaseModel):
acceptable: str = Field(description="yes or no")
reason: str = Field(
description="Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences."
)
observations: str = Field(
description="Clearly summarize current and SOTA ensemble results with exact scores and notable patterns. Limit to no more than three concise, data-focused sentences. Your observation must be grounded by explicit evidence from scenario description or code implementation, not just validation scores."
)
feedback: str = Field(
description="Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences."
)
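`HypothesisReview` is declared here but not referenced elsewhere in this diff; presumably the review response is meant to be parsed into it. A parsing sketch, assuming pydantic v2 and a JSON-mode response (payload is illustrative):

```python
# Hypothetical JSON payload matching the HypothesisReview fields.
raw = (
    '{"acceptable": "yes",'
    ' "reason": "[Experiment Analysis] ...",'
    ' "observations": "...",'
    ' "feedback": "..."}'
)
review = HypothesisReview.model_validate_json(raw)  # pydantic v2 API
assert review.acceptable in {"yes", "no"}
```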


def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraftExpGen:
next_missing_component = trace.next_incomplete_component()
if next_missing_component is not None:
@@ -1157,6 +1171,79 @@ def hypothesis_select_with_llm(

# END: for support llm-based hypothesis selection -----

@wait_retry(retry_n=3)
def hypothesis_review(
self,
base_code: str,
scenario: str,
hypothesis_dict: dict,
) -> dict:
"""
Selects the best hypothesis by scoring each candidate using a local model.

Args:
hypothesis_candidates: A dictionary where keys are hypothesis IDs and
values are dicts containing 'hypothesis',
'component', and 'code'.

Returns:
The dictionary of the selected hypothesis.
"""
for problem_name, data in hypothesis_dict.items():
try:
                # Generate a rewrite task from the base code and hypothesis
hypothesis_str = f"{data['hypothesis']}\nBecause:\n{data['reason']}"
rewrite_task = APIBackend().build_messages_and_create_chat_completion(
system_prompt=T(".prompts_v2:simulate_task.system").r(),
user_prompt=T(".prompts_v2:simulate_task.user").r(
hypothesis=hypothesis_str,
base_code=base_code,
)
)

                # Generate the expert review of the proposed modification
                if "qwen" in DS_RD_SETTING.review_model:
                    from rdagent.oai.llm_conf import LLM_SETTINGS

                    # Temporarily cap retries for the local review model; restore it even if the call fails.
                    old_max_retry = LLM_SETTINGS.max_retry
                    LLM_SETTINGS.max_retry = 3
                    try:
                        system_prompt = T(".prompts_v2:predict_feedback.system.qwen").r()
                        response = APIBackend().build_messages_and_create_chat_completion(
                            system_prompt=system_prompt,
                            user_prompt=T(".prompts_v2:predict_feedback.user").r(
                                scenario=scenario,
                                hypothesis=hypothesis_str,
                                rewrite_task=rewrite_task,
                            ),
                            # system_prompt_role="assistant",
                            model="hosted_vllm/qwen3-8b",  # TODO: litellm-proxied vllm server cannot use completion calls
                            api_base="http://127.0.0.1:8091/v1",
                            api_key="sk-vllm",
                        )
                    finally:
                        LLM_SETTINGS.max_retry = old_max_retry
                    # Split the response into the reasoning inside <think> tags and the verdict outside them.
                    match = re.search(r"<think>(.*?)</think>(.*)", response, re.DOTALL)
                    if match is None:
                        raise ValueError(f"Review response missing <think> tags: {response[:200]}")
                    think_content = match.group(1).strip()
                    outside_content = match.group(2).strip()
                    verdict = (
                        "The hypothesis is acceptable"
                        if "yes" in outside_content.lower()
                        else "The hypothesis is not acceptable"
                    )
                    hypothesis_dict[problem_name]["expert_review"] = f"{verdict}. Because:\n{think_content}"
                else:
                    system_prompt = T(".prompts_v2:predict_feedback.system.base").r()
                    response = APIBackend().build_messages_and_create_chat_completion(
                        system_prompt=system_prompt,
                        user_prompt=T(".prompts_v2:predict_feedback.user").r(
                            scenario=scenario,
                            hypothesis=hypothesis_str,
                            rewrite_task=rewrite_task,
                        ),
                        # system_prompt_role="assistant",
                        model=DS_RD_SETTING.review_model,
                    )
                    hypothesis_dict[problem_name]["expert_review"] = response
except Exception as e:
logger.warning(f"Failed to review hypothesis for problem {problem_name}: {e}")

return hypothesis_dict

def hypothesis_rank(
self, hypothesis_dict: dict, problem_dict: dict, selected_idx: Optional[int] = None
) -> Tuple[str, DSHypothesis]:
@@ -1459,6 +1546,9 @@ def gen(

# Step 3: Select the best hypothesis
        if DS_RD_SETTING.llm_select_hypothesis:
            if DS_RD_SETTING.review_model is not None:
                base_code = sota_exp.experiment_workspace.get_codes("main.py") if sota_exp else ""
                hypothesis_dict = self.hypothesis_review(
                    base_code=base_code, scenario=scenario_desc, hypothesis_dict=hypothesis_dict
                )
            response_dict = self.hypothesis_select_with_llm(
scenario_desc=scenario_desc,
exp_feedback_list_desc=exp_feedback_list_desc,