1 change: 1 addition & 0 deletions rdagent/app/data_science/conf.py
@@ -196,6 +196,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):

user_interaction_wait_seconds: int = 6000 # seconds to wait for user interaction
user_interaction_mid_folder: Path = Path.cwd() / "git_ignore_folder" / "RD-Agent_user_interaction"
    review_model: str | None = None  # model name used for LLM-based hypothesis review; None disables the review step


DS_RD_SETTING = DataScienceBasePropSetting()
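The new `review_model` setting gates the hypothesis-review step added in `proposal.py` below. A minimal consumption sketch; the environment-variable name is an assumption based on how sibling settings are typically loaded, not something this diff shows:

```python
# Sketch: gate the optional review step on the new setting.
# Assumption: like other DataScienceBasePropSetting fields, this can be set via
# an environment variable (plausibly DS_REVIEW_MODEL); the prefix is not shown here.
from rdagent.app.data_science.conf import DS_RD_SETTING

if DS_RD_SETTING.review_model is not None:
    print(f"hypothesis review enabled with model: {DS_RD_SETTING.review_model}")
else:
    print("hypothesis review disabled")
```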
5 changes: 4 additions & 1 deletion rdagent/oai/backend/base.py
@@ -331,6 +331,7 @@ def _build_messages(
user_prompt: str,
system_prompt: str | None = None,
former_messages: list[dict[str, Any]] | None = None,
system_prompt_role: str | None = None,
*,
shrink_multiple_break: bool = False,
) -> list[dict[str, Any]]:
@@ -350,7 +351,7 @@ def _build_messages(
system_prompt = LLM_SETTINGS.default_system_prompt if system_prompt is None else system_prompt
messages = [
{
"role": LLM_SETTINGS.system_prompt_role,
"role": system_prompt_role or LLM_SETTINGS.system_prompt_role,
"content": system_prompt,
},
]
@@ -381,6 +382,7 @@ def build_messages_and_create_chat_completion( # type: ignore[no-untyped-def]
former_messages: list | None = None,
chat_cache_prefix: str = "",
shrink_multiple_break: bool = False,
system_prompt_role: str | None = None,
*args,
**kwargs,
) -> str:
@@ -407,6 +409,7 @@ def build_messages_and_create_chat_completion( # type: ignore[no-untyped-def]
user_prompt,
system_prompt,
former_messages,
system_prompt_role,
shrink_multiple_break=shrink_multiple_break,
)

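The threaded-through `system_prompt_role` lets a single call override `LLM_SETTINGS.system_prompt_role`, e.g. for backends that reject a `system` message. A hedged call sketch; the prompt strings are illustrative:

```python
from rdagent.oai.llm_utils import APIBackend

# Per-call role override; with system_prompt_role=None the message builder falls
# back to LLM_SETTINGS.system_prompt_role, so existing callers are unaffected.
answer = APIBackend().build_messages_and_create_chat_completion(
    user_prompt="Summarize the experiment results in two sentences.",  # illustrative
    system_prompt="You are a concise reviewer.",  # illustrative
    system_prompt_role="user",  # for models that only accept user/assistant roles
)
```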
7 changes: 6 additions & 1 deletion rdagent/oai/backend/litellm.py
@@ -150,7 +150,12 @@ def _create_chat_completion_inner_function( # type: ignore[no-untyped-def] # no
logger.info(self._build_log_messages(messages), tag="llm_messages")

complete_kwargs = self.get_complete_kwargs()
model = complete_kwargs["model"]
        if kwargs.get("model"):
            # Per-call override: swap in the requested model and clear reasoning_effort,
            # which the substitute model may not support.
            complete_kwargs["model"] = kwargs.pop("model")
            complete_kwargs["reasoning_effort"] = None
        model = complete_kwargs.get("model")
        logger.info(f"complete_kwargs: {complete_kwargs}")
        logger.info(f"kwargs: {kwargs}")

response = completion(
messages=messages,
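With this change, a `model` kwarg reaching `_create_chat_completion_inner_function` replaces the configured model for that one call and clears `reasoning_effort`, which the substitute model may not support. A usage sketch, assuming the remaining kwargs (`api_base`, `api_key`) are forwarded on to `litellm.completion` as the `kwargs.pop` pattern implies; the endpoint values are the ones used later in this PR:

```python
from rdagent.oai.llm_utils import APIBackend

# One-off call routed to a local vLLM endpoint instead of the configured model.
response = APIBackend().build_messages_and_create_chat_completion(
    user_prompt="Will this change improve the score? Answer yes or no.",  # illustrative
    model="hosted_vllm/qwen3-8b",
    api_base="http://127.0.0.1:8091/v1",
    api_key="sk-vllm",
)
```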
36 changes: 34 additions & 2 deletions rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -639,7 +639,6 @@ hypothesis_select:
{% else %}
    Please respond in JSON format.
{% endif %}


user: |-
# Scenario Description
@@ -651,6 +650,40 @@ hypothesis_select:
# Current SOTA Implementation
{{ sota_exp_desc }}

simulate_task:
system: |-
    You are given base code for a Kaggle competition and a hypothesis (a proposed modification to this code intended to improve its score).
    Please summarize this code, retain the key parts, and mark the parts that the hypothesis will modify. If no base code is provided, generate a pseudo-code outline based on the hypothesis that shows the main structure and key components needed to implement it.
    Respond in markdown format, and return only the markdown text without any other content.
user: |-
## Hypothesis
{{ hypothesis }}

## Base Code
```python
{{ base_code }}
```

predict_feedback:
system:
qwen: |-
Given a Kaggle competition scenario and a hypothesis, you are provided with a task derived from the hypothesis that specifies how the base code would be modified (the actual base code is not provided).
Predict whether, after modifying the base code according to this task, the code's performance will improve ("yes") or worsen ("no"). Base your judgment solely on the scenario, hypothesis, and task.
Answer with only "yes" or "no".
base: |-
Given a Kaggle competition scenario and a hypothesis, you are provided with a task derived from the hypothesis that specifies how the base code would be modified (the actual base code is not provided).
Predict whether, after modifying the base code according to this task, the code's performance will improve or worsen. Base your judgment solely on the scenario, hypothesis, and task.
Provide your judgment and explanation.
user: |-
## Scenario
{{ scenario }}

## Hypothesis
{{ hypothesis }}

## Task
{{ rewrite_task }}
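These two templates are consumed in `proposal.py` below through the `T` helper. A render-and-call sketch; note the leading `.` in the template path resolves relative to the calling module, as in `proposal.py`, and the hypothesis/code values here are illustrative:

```python
from rdagent.oai.llm_utils import APIBackend
from rdagent.utils.agent.tpl import T

# Render simulate_task to turn base code + hypothesis into a rewrite task;
# predict_feedback is then asked to judge that task (see hypothesis_review below).
rewrite_task = APIBackend().build_messages_and_create_chat_completion(
    system_prompt=T(".prompts_v2:simulate_task.system").r(),
    user_prompt=T(".prompts_v2:simulate_task.user").r(
        hypothesis="Add target encoding for high-cardinality categorical features.",  # illustrative
        base_code="...",  # the SOTA main.py source would go here
    ),
)
```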


task_gen:
system: |-
@@ -969,4 +1002,3 @@ output_format:
"hypothesis": "...",
"component": "..." // Must be one of: 'DataLoadSpec', 'FeatureEng', 'Model', 'Workflow', 'Ensemble'
}

90 changes: 90 additions & 0 deletions rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -1,4 +1,5 @@
import json
import re
import math
from datetime import timedelta
from enum import Enum
@@ -303,6 +304,19 @@ class CodingSketch(BaseModel):
)


class HypothesisReview(BaseModel):
acceptable: str = Field(description="yes or no")
reason: str = Field(
description="Clearly explain the reason for success or failure of the experiment. Begin explicitly with [Submission format error], [Evaluation error], [Experiment Analysis] or [Code Analysis] depending on the step at which issues arose. Reference specific scores and methodological differences with SOTA. Limit to three sentences."
)
observations: str = Field(
description="Clearly summarize current and SOTA ensemble results with exact scores and notable patterns. Limit to no more than three concise, data-focused sentences. Your observation must be grounded by explicit evidence from scenario description or code implementation, not just validation scores."
)
feedback: str = Field(
description="Explicitly confirm or refute the hypothesis based on specific data points or performance trends. Limit to two sentences."
)
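`HypothesisReview` is declared here but not referenced elsewhere in this diff; presumably the review response is meant to be parsed into it. A parsing sketch, assuming pydantic v2 and a JSON-mode response (payload is illustrative):

```python
# Hypothetical JSON payload matching the HypothesisReview fields.
raw = (
    '{"acceptable": "yes",'
    ' "reason": "[Experiment Analysis] ...",'
    ' "observations": "...",'
    ' "feedback": "..."}'
)
review = HypothesisReview.model_validate_json(raw)  # pydantic v2 API
assert review.acceptable in {"yes", "no"}
```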


def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraftExpGen:
next_missing_component = trace.next_incomplete_component()
if next_missing_component is not None:
@@ -1157,6 +1171,79 @@ def hypothesis_select_with_llm(

# END: for support llm-based hypothesis selection -----

@wait_retry(retry_n=3)
def hypothesis_review(
self,
base_code: str,
scenario: str,
hypothesis_dict: dict,
) -> dict:
"""
Selects the best hypothesis by scoring each candidate using a local model.

Args:
hypothesis_candidates: A dictionary where keys are hypothesis IDs and
values are dicts containing 'hypothesis',
'component', and 'code'.

Returns:
The dictionary of the selected hypothesis.
"""
for problem_name, data in hypothesis_dict.items():
try:
                # Generate a rewrite task from the base code and hypothesis
hypothesis_str = f"{data['hypothesis']}\nBecause:\n{data['reason']}"
rewrite_task = APIBackend().build_messages_and_create_chat_completion(
system_prompt=T(".prompts_v2:simulate_task.system").r(),
user_prompt=T(".prompts_v2:simulate_task.user").r(
hypothesis=hypothesis_str,
base_code=base_code,
)
)

                # Generate the expert review of the proposed modification
                if "qwen" in DS_RD_SETTING.review_model:
                    from rdagent.oai.llm_conf import LLM_SETTINGS

                    # Temporarily cap retries for the local review model; restore it even if the call fails.
                    old_max_retry = LLM_SETTINGS.max_retry
                    LLM_SETTINGS.max_retry = 3
                    try:
                        system_prompt = T(".prompts_v2:predict_feedback.system.qwen").r()
                        response = APIBackend().build_messages_and_create_chat_completion(
                            system_prompt=system_prompt,
                            user_prompt=T(".prompts_v2:predict_feedback.user").r(
                                scenario=scenario,
                                hypothesis=hypothesis_str,
                                rewrite_task=rewrite_task,
                            ),
                            # system_prompt_role="assistant",
                            model="hosted_vllm/qwen3-8b",  # TODO: litellm-proxied vllm server cannot use completion calls
                            api_base="http://127.0.0.1:8091/v1",
                            api_key="sk-vllm",
                        )
                    finally:
                        LLM_SETTINGS.max_retry = old_max_retry
                    # Split the response into the reasoning inside <think> tags and the verdict outside them.
                    match = re.search(r"<think>(.*?)</think>(.*)", response, re.DOTALL)
                    if match is None:
                        raise ValueError(f"Review response missing <think> tags: {response[:200]}")
                    think_content = match.group(1).strip()
                    outside_content = match.group(2).strip()
                    verdict = (
                        "The hypothesis is acceptable"
                        if "yes" in outside_content.lower()
                        else "The hypothesis is not acceptable"
                    )
                    hypothesis_dict[problem_name]["expert_review"] = f"{verdict}. Because:\n{think_content}"
                else:
                    system_prompt = T(".prompts_v2:predict_feedback.system.base").r()
                    response = APIBackend().build_messages_and_create_chat_completion(
                        system_prompt=system_prompt,
                        user_prompt=T(".prompts_v2:predict_feedback.user").r(
                            scenario=scenario,
                            hypothesis=hypothesis_str,
                            rewrite_task=rewrite_task,
                        ),
                        # system_prompt_role="assistant",
                        model=DS_RD_SETTING.review_model,
                    )
                    hypothesis_dict[problem_name]["expert_review"] = response
except Exception as e:
logger.warning(f"Failed to review hypothesis for problem {problem_name}: {e}")

return hypothesis_dict

def hypothesis_rank(
self, hypothesis_dict: dict, problem_dict: dict, selected_idx: Optional[int] = None
) -> Tuple[str, DSHypothesis]:
@@ -1459,6 +1546,9 @@ def gen(

# Step 3: Select the best hypothesis
        if DS_RD_SETTING.llm_select_hypothesis:
            if DS_RD_SETTING.review_model is not None:
                base_code = sota_exp.experiment_workspace.get_codes("main.py") if sota_exp else ""
                hypothesis_dict = self.hypothesis_review(
                    base_code=base_code, scenario=scenario_desc, hypothesis_dict=hypothesis_dict
                )
            response_dict = self.hypothesis_select_with_llm(
scenario_desc=scenario_desc,
exp_feedback_list_desc=exp_feedback_list_desc,