diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 2d8ec6262..33956881b 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -196,6 +196,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
 
     user_interaction_wait_seconds: int = 6000  # seconds to wait for user interaction
     user_interaction_mid_folder: Path = Path.cwd() / "git_ignore_folder" / "RD-Agent_user_interaction"
+    review_model: str | None = None
 
 
 DS_RD_SETTING = DataScienceBasePropSetting()
diff --git a/rdagent/oai/backend/base.py b/rdagent/oai/backend/base.py
index 79664b9c6..5238b11b3 100644
--- a/rdagent/oai/backend/base.py
+++ b/rdagent/oai/backend/base.py
@@ -331,6 +331,7 @@ def _build_messages(
         user_prompt: str,
         system_prompt: str | None = None,
         former_messages: list[dict[str, Any]] | None = None,
+        system_prompt_role: str | None = None,
         *,
         shrink_multiple_break: bool = False,
     ) -> list[dict[str, Any]]:
@@ -350,7 +351,7 @@ def _build_messages(
         system_prompt = LLM_SETTINGS.default_system_prompt if system_prompt is None else system_prompt
         messages = [
             {
-                "role": LLM_SETTINGS.system_prompt_role,
+                "role": system_prompt_role or LLM_SETTINGS.system_prompt_role,
                 "content": system_prompt,
             },
         ]
@@ -381,6 +382,7 @@ def build_messages_and_create_chat_completion(  # type: ignore[no-untyped-def]
         former_messages: list | None = None,
         chat_cache_prefix: str = "",
         shrink_multiple_break: bool = False,
+        system_prompt_role: str | None = None,
         *args,
         **kwargs,
     ) -> str:
@@ -407,6 +409,7 @@ def build_messages_and_create_chat_completion(  # type: ignore[no-untyped-def]
             user_prompt,
             system_prompt,
             former_messages,
+            system_prompt_role,
             shrink_multiple_break=shrink_multiple_break,
         )
 
diff --git a/rdagent/oai/backend/litellm.py b/rdagent/oai/backend/litellm.py
index 514c5aaae..7fb07be4d 100644
--- a/rdagent/oai/backend/litellm.py
+++ b/rdagent/oai/backend/litellm.py
@@ -150,7 +150,12 @@ def _create_chat_completion_inner_function(  # type: ignore[no-untyped-def] # no
             logger.info(self._build_log_messages(messages), tag="llm_messages")
 
         complete_kwargs = self.get_complete_kwargs()
-        model = complete_kwargs["model"]
+        if kwargs.get("model"):
+            # Per-call model override (e.g. a dedicated review model); disable reasoning_effort,
+            # which the override model may not support.
+            complete_kwargs["model"] = kwargs.pop("model")
+            complete_kwargs["reasoning_effort"] = None
+        model = complete_kwargs.get("model")
+        logger.info(f"complete_kwargs: {complete_kwargs}")
+        logger.info(f"kwargs: {kwargs}")
 
         response = completion(
             messages=messages,
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
index 144485d6a..3ce4bafa2 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -639,7 +639,6 @@ hypothesis_select:
     {% else %}
     Please response in json format.
     {% endif %}
-
   user: |-
     # Scenario Description
     {{ scenario_desc }}
@@ -651,6 +650,40 @@ hypothesis_select:
 
     # Current SOTA Implementation
     {{ sota_exp_desc }}
 
+simulate_task:
+  system: |-
+    You are given the base code for a Kaggle competition and a hypothesis (a proposed modification to this code intended to improve its score).
+    Summarize the code, retain its key parts, and mark the parts that the hypothesis would modify. If no base code is provided, generate a pseudo-code outline based on the hypothesis that shows the main structure and key components needed to implement it.
+    Respond in markdown format and return only the markdown text, without any other content.
+  user: |-
+    ## Hypothesis
+    {{ hypothesis }}
+
+    ## Base Code
+    ```python
+    {{ base_code }}
+    ```
+
+predict_feedback:
+  system:
+    qwen: |-
+      Given a Kaggle competition scenario and a hypothesis, you are provided with a task derived from the hypothesis that specifies how the base code would be modified (the actual base code is not provided).
+      Predict whether, after modifying the base code according to this task, the code's performance will improve ("yes") or worsen ("no"). Base your judgment solely on the scenario, hypothesis, and task.
+      Answer with only "yes" or "no".
+    base: |-
+      Given a Kaggle competition scenario and a hypothesis, you are provided with a task derived from the hypothesis that specifies how the base code would be modified (the actual base code is not provided).
+      Predict whether, after modifying the base code according to this task, the code's performance will improve or worsen. Base your judgment solely on the scenario, hypothesis, and task.
+      Provide your judgment and an explanation.
+  user: |-
+    ## Scenario
+    {{ scenario }}
+
+    ## Hypothesis
+    {{ hypothesis }}
+
+    ## Task
+    {{ rewrite_task }}
+
 task_gen:
   system: |-
@@ -969,4 +1002,3 @@ output_format:
       "hypothesis": "...",
       "component": "..." // Must be one of: 'DataLoadSpec', 'FeatureEng', 'Model', 'Workflow', 'Ensemble'
     }
-
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 8a0343f27..cba762e5a 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1,4 +1,5 @@
 import json
 import math
+import re
 from datetime import timedelta
 from enum import Enum
@@ -303,6 +304,19 @@ class CodingSketch(BaseModel):
     )
 
 
+class HypothesisReview(BaseModel):
+    acceptable: str = Field(description="yes or no")
+    reason: str = Field(
+        description="Clearly explain the reason for the success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences."
+    )
+    observations: str = Field(
+        description="Clearly summarize current and SOTA ensemble results with exact scores and notable patterns. Limit to no more than three concise, data-focused sentences. Your observation must be grounded in explicit evidence from the scenario description or code implementation, not just validation scores."
+    )
+    feedback: str = Field(
+        description="Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences."
+    )
+
+
 def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraftExpGen:
     next_missing_component = trace.next_incomplete_component()
     if next_missing_component is not None:
@@ -1157,6 +1171,79 @@ def hypothesis_select_with_llm(
     # END: for support llm-based hypothesis selection -----
 
+    @wait_retry(retry_n=3)
+    def hypothesis_review(
+        self,
+        base_code: str,
+        scenario: str,
+        hypothesis_dict: dict,
+    ) -> dict:
+        """
+        Reviews each candidate hypothesis with a dedicated reviewer model.
+
+        For every hypothesis, a rewrite task is first derived from the base code and the
+        hypothesis (simulate_task prompt); the reviewer model then predicts whether applying
+        that task would improve performance (predict_feedback prompt).
+
+        Args:
+            base_code: The SOTA implementation to be modified, or an empty string if none exists.
+            scenario: The scenario description of the competition.
+            hypothesis_dict: A dictionary where keys are problem names and values are
+                             dicts containing at least 'hypothesis' and 'reason'.
+
+        Returns:
+            The same dictionary, with an 'expert_review' entry added to each reviewed hypothesis.
+ """ + for problem_name, data in hypothesis_dict.items(): + try: + # gen rewrite task ( base code + hypothesis ) + hypothesis_str = f"{data['hypothesis']}\nBecause:\n{data['reason']}" + rewrite_task = APIBackend().build_messages_and_create_chat_completion( + system_prompt=T(".prompts_v2:simulate_task.system").r(), + user_prompt=T(".prompts_v2:simulate_task.user").r( + hypothesis=hypothesis_str, + base_code=base_code, + ) + ) + + # gen expert review + if "qwen" in DS_RD_SETTING.review_model: + from rdagent.oai.llm_conf import LLM_SETTINGS + old_max_retry = LLM_SETTINGS.max_retry + LLM_SETTINGS.max_retry = 3 + system_prompt = T(".prompts_v2:predict_feedback.system.qwen").r() + response = APIBackend().build_messages_and_create_chat_completion( + system_prompt=system_prompt, + user_prompt=T(".prompts_v2:predict_feedback.user").r( + scenario=scenario, + hypothesis=hypothesis_str, + rewrite_task=rewrite_task, + ), + # system_prompt_role="assistant", + model="hosted_vllm/qwen3-8b", # TODO: litellm-proxied vllm server cannot use completion calls + api_base="http://127.0.0.1:8091/v1", + api_key="sk-vllm" + ) + LLM_SETTINGS.max_retry = old_max_retry + # Extract content inside tags and outside + match = re.search(r'(.*?)(.*)', response, re.DOTALL) + think_content = match.group(1).strip() + outside_content = match.group(2).strip() + review_str = ("The hypothesis is acceptable" if "yes" in outside_content else "The hypothesis is not acceptable") + f". Because:\n{think_content}" + hypothesis_dict[problem_name]["expert_review"] = review_str + else: + system_prompt = T(".prompts_v2:predict_feedback.system.base").r() + response = APIBackend().build_messages_and_create_chat_completion( + system_prompt=system_prompt, + user_prompt=T(".prompts_v2:predict_feedback.user").r( + scenario=scenario, + hypothesis=hypothesis_str, + rewrite_task=rewrite_task, + ), + # system_prompt_role="assistant", + model=DS_RD_SETTING.review_model, + ) + hypothesis_dict[problem_name]["expert_review"] = response + except Exception as e: + logger.warning(f"Failed to review hypothesis for problem {problem_name}: {e}") + + return hypothesis_dict + def hypothesis_rank( self, hypothesis_dict: dict, problem_dict: dict, selected_idx: Optional[int] = None ) -> Tuple[str, DSHypothesis]: @@ -1459,6 +1546,9 @@ def gen( # Step 3: Select the best hypothesis if DS_RD_SETTING.llm_select_hypothesis: + if DS_RD_SETTING.review_model is not None: + base_code = sota_exp.experiment_workspace.get_codes("main.py") if sota_exp else "" + hypothesis_dict = self.hypothesis_review(base_code=base_code, scenario=scenario_desc, hypothesis_dict=hypothesis_dict) response_dict = self.hypothesis_select_with_llm( scenario_desc=scenario_desc, exp_feedback_list_desc=exp_feedback_list_desc,