Skip to content

Commit 5accec3

Browse files
authored
feat: add extra_eval config and import_class for custom evaluators (#1097)
* feat: add extra_eval config and import_class for custom evaluators * lint * build: update litellm requirement to >=1.73 for get_valid_models * refactor: remove *args/**kwargs from _create_embedding_inner_function signature
1 parent 152a70f commit 5accec3

File tree

6 files changed

+21
-12
lines changed

6 files changed

+21
-12
lines changed

rdagent/components/coder/data_science/conf.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,18 @@ class Config:
2323
env_type: str = "docker"
2424
# TODO: extract a function for env and conf.
2525

26+
extra_eval: list[str] = []
27+
"""
28+
Extra evaluators
29+
30+
The evaluator follows the following assumptions:
31+
- It runs after the previous evaluator (so the running results are already there)
32+
33+
It is not a complete feature because it is only implemented in the DS Pipeline & Coder.
34+
35+
TODO: The complete version should be implemented in the CoSTEERSettings.
36+
"""
37+
2638

2739
def get_ds_env(
2840
conf_type: Literal["kaggle", "mlebench"] = "kaggle",

rdagent/components/coder/data_science/pipeline/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
from rdagent.core.exception import CoderError
4444
from rdagent.core.experiment import FBWorkspace
4545
from rdagent.core.scenario import Scenario
46+
from rdagent.core.utils import import_class
4647
from rdagent.oai.llm_utils import APIBackend
4748
from rdagent.utils.agent.ret import PythonAgentOut
4849
from rdagent.utils.agent.tpl import T
@@ -143,6 +144,10 @@ def __init__(
143144
if DS_RD_SETTING.enable_model_dump:
144145
eval_l.append(ModelDumpEvaluator(scen=scen, data_type="sample"))
145146

147+
for extra_eval in DSCoderCoSTEERSettings().extra_eval:
148+
kls = import_class(extra_eval)
149+
eval_l.append(kls(scen=scen))
150+
146151
eva = CoSTEERMultiEvaluator(
147152
single_evaluator=eval_l, scen=scen
148153
) # Please specify whether you agree running your eva in parallel or not

rdagent/oai/backend/base.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -654,9 +654,7 @@ def _calculate_token_from_messages(self, messages: list[dict[str, Any]]) -> int:
654654
raise NotImplementedError("Subclasses must implement this method")
655655

656656
@abstractmethod
657-
def _create_embedding_inner_function( # type: ignore[no-untyped-def]
658-
self, input_content_list: list[str], *args, **kwargs
659-
) -> list[list[float]]: # noqa: ARG002
657+
def _create_embedding_inner_function(self, input_content_list: list[str]) -> list[list[float]]:
660658
"""
661659
Call the embedding function
662660
"""

rdagent/oai/backend/deprec.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -269,9 +269,7 @@ def supports_response_schema(self) -> bool:
269269
"""
270270
return False
271271

272-
def _create_embedding_inner_function( # type: ignore[no-untyped-def]
273-
self, input_content_list: list[str], *args, **kwargs
274-
) -> list[list[float]]: # noqa: ARG002
272+
def _create_embedding_inner_function(self, input_content_list: list[str]) -> list[list[float]]:
275273
content_to_embedding_dict = {}
276274
for sliced_filtered_input_content_list in [
277275
input_content_list[i : i + LLM_SETTINGS.embedding_max_str_num]

rdagent/oai/backend/litellm.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,7 @@ def _calculate_token_from_messages(self, messages: list[dict[str, Any]]) -> int:
6666
logger.info(f"{LogColors.CYAN}Token count: {LogColors.END} {num_tokens}", tag="debug_litellm_token")
6767
return num_tokens
6868

69-
def _create_embedding_inner_function(
70-
self, input_content_list: list[str], *args: Any, **kwargs: Any
71-
) -> list[list[float]]: # noqa: ARG002
69+
def _create_embedding_inner_function(self, input_content_list: list[str]) -> list[list[float]]:
7270
"""
7371
Call the embedding function
7472
"""
@@ -82,8 +80,6 @@ def _create_embedding_inner_function(
8280
response = embedding(
8381
model=model_name,
8482
input=input_content_list,
85-
*args,
86-
**kwargs,
8783
)
8884
response_list = [data["embedding"] for data in response.data]
8985
return response_list

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ loguru
88
fire
99
fuzzywuzzy
1010
openai
11-
litellm==1.72.4
11+
litellm>=1.73 # to support `from litellm import get_valid_models`
1212
azure.identity
1313
pyarrow
1414
rich

0 commit comments

Comments
 (0)