2 changes: 1 addition & 1 deletion rdagent/components/coder/CoSTEER/evaluators.py
@@ -42,7 +42,7 @@ class CoSTEERSingleFeedback(Feedback):
return_checking: str | None # including every check in the testing (constraints about the generated value)
# value_feedback, shape_feedback, value_generated_flag
code: str
final_decision: bool
final_decision: bool | None = None

@staticmethod
def val_and_update_init_dict(data: dict) -> dict:
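With `final_decision` now optional, a feedback object can be constructed before the decision is known and filled in afterwards (as the runner evaluator later in this PR does from `acceptable` and the tuning decision). A minimal sketch of that pattern, using an illustrative stand-in class rather than the real `CoSTEERSingleFeedback`:

```python
from dataclasses import dataclass


@dataclass
class FeedbackSketch:  # illustrative stand-in, not the real CoSTEERSingleFeedback
    execution: str
    return_checking: str | None
    code: str
    final_decision: bool | None = None  # now optional: may be decided after construction

# Build the feedback first, derive the decision from other signals later.
fb = FeedbackSketch(execution="ran to completion", return_checking=None, code="matches spec")
fb.final_decision = True  # e.g. acceptable and no further tuning requested
```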
3 changes: 3 additions & 0 deletions rdagent/components/coder/CoSTEER/evolving_strategy.py
@@ -19,6 +19,8 @@


class MultiProcessEvolvingStrategy(EvolvingStrategy):
KEY_CHANGE_SUMMARY = "__change_summary__" # Optional key for the summary of the change of evolving subjects

def __init__(self, scen: Scenario, settings: CoSTEERSettings):
super().__init__(scen)
self.settings = settings
@@ -51,6 +53,7 @@ def implement_one_task(
Return
------
The new files {<filename>: <content>} to update the workspace.
- Special Keys: self.KEY_CHANGE_SUMMARY;
"""
raise NotImplementedError

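To illustrate the new contract from the producer side, a hypothetical subclass could return the special key alongside ordinary file entries: every other key is a file to write into the workspace, while `KEY_CHANGE_SUMMARY` carries plain text. The import path and signature below are abridged assumptions based on the file shown in this hunk:

```python
# Assumed import path, derived from rdagent/components/coder/CoSTEER/evolving_strategy.py.
from rdagent.components.coder.CoSTEER.evolving_strategy import MultiProcessEvolvingStrategy


class SketchEvolvingStrategy(MultiProcessEvolvingStrategy):  # hypothetical subclass
    def implement_one_task(self, target_task, queried_knowledge=None, workspace=None, prev_task_feedback=None):
        # Ordinary keys are filenames to inject into the workspace; the special
        # key carries a plain-text summary of the change, not file content.
        return {
            "main.py": "print('hello world')\n",
            self.KEY_CHANGE_SUMMARY: "Replaced the baseline model with gradient boosting.",
        }
```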
4 changes: 2 additions & 2 deletions rdagent/core/experiment.py
@@ -153,8 +153,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
{}
) # The code injected into the folder, store them in the variable to reproduce the former result
self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex
# In-memory checkpoint data created by ``create_ws_ckp``.
self.ws_ckp: bytes | None = None
self.ws_ckp: bytes | None = None # In-memory checkpoint data created by ``create_ws_ckp``.
self.change_summary: str | None = None # The change from the previous version of workspace

@staticmethod
def _format_code_dict(code_dict: dict[str, str]) -> str:
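Since the workspace itself now carries `change_summary`, downstream consumers (such as the evolving-history prompts added later in this PR) can read it directly instead of re-parsing LLM output. A tiny hypothetical consumer, assuming only the new attribute:

```python
# Hypothetical helper; `ws` is any workspace object exposing the new change_summary attribute.
def render_history_entry(ws) -> str:
    summary = ws.change_summary or "No change summary recorded."
    return f"### Summary of Changes\n{summary}"
```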
67 changes: 40 additions & 27 deletions rdagent/scenarios/data_science/dev/runner/__init__.py
@@ -51,15 +51,19 @@ def implement_one_task(
# if no prev_task_feedback, it is the first loop; we do not make any changes and go to the evaluators directly.
return {}

# Output Agent Map
output_map = {
True: (PythonBatchPatchOut.get_spec(), PythonBatchPatchOut.extract_output),
False: (
PythonBatchEditOut.get_spec(with_del=False),
PythonBatchEditOut.extract_output,
),
}
output_spec, extract_output_fn = output_map[self.settings.diff_mode]
# Get evolving history
task_info = target_task.get_task_information()
queried_former_failed_knowledge = (
queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
)[0]

# Set output agent
if self.settings.diff_mode:
output_spec = PythonBatchPatchOut.get_spec()
extract_output_fn = PythonBatchPatchOut.extract_output
else:
output_spec = PythonBatchEditOut.get_spec(with_del=False)
extract_output_fn = PythonBatchEditOut.extract_output

if prev_task_feedback.acceptable is False:
task_information_str = target_task.get_task_information()
@@ -76,32 +80,39 @@ def implement_one_task(
diff_mode=self.settings.diff_mode,
)

# Generate user prompt for both cases
# Start multi-turn chat session
session = APIBackend().build_chat_session(
session_system_prompt=system_prompt,
)

# Code
user_prompt = T(".prompts:DSCoSTEER.user").r(
code=workspace.all_codes,
feedback=prev_task_feedback,
hyperparameter_tuning_suggestion=prev_task_feedback.hyperparameter_tuning_suggestion,
hyperparameter_tuning_suggestion=(
prev_task_feedback.hyperparameter_tuning_suggestion if prev_task_feedback.acceptable else None
),
queried_former_failed_knowledge=queried_former_failed_knowledge,
)

code = session.build_chat_completion(user_prompt=user_prompt)
if self.settings.diff_mode:
batch_edit = extract_output_fn(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt,
system_prompt=system_prompt,
),
prefix=workspace.workspace_path,
)
code_batch_edit = extract_output_fn(code, prefix=workspace.workspace_path)
else:
batch_edit = extract_output_fn(
APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt,
system_prompt=system_prompt,
)
)

batch_edit = {k: v for k, v in batch_edit.items() if k in workspace.file_dict.keys()}
code_batch_edit = extract_output_fn(code)
code_batch_edit = {k: v for k, v in code_batch_edit.items() if k in workspace.file_dict.keys()}

# Change Summary
user_prompt = (
"Based on the previous conversation and your latest code modifications, "
"please provide a concise and structured summary of the changes you made to the original code. "
"Clearly specify what was changed and how, focusing on key modifications. "
"Limit your summary to plain text, no more than three sentences."
)
change_summary = session.build_chat_completion(user_prompt=user_prompt)
code_batch_edit.update({"__change_summary__": change_summary})

return batch_edit
return code_batch_edit

def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo):
"""
@@ -116,6 +127,8 @@ def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo):
if evo.sub_workspace_list[index] is None:
# evo.sub_workspace_list[index] = FBWorkspace(target_task=evo.sub_tasks[index])
evo.sub_workspace_list[index] = evo.experiment_workspace
if self.KEY_CHANGE_SUMMARY in code_list[index]:
evo.sub_workspace_list[index].change_summary = code_list[index].pop(self.KEY_CHANGE_SUMMARY)
evo.sub_workspace_list[index].inject_files(**code_list[index])
return evo

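On the consumer side, `assign_code_list_to_evo` pops the special key out of the batch edit and stores it on the workspace before injecting files. A self-contained sketch of that split (the helper name is illustrative; the key string mirrors `KEY_CHANGE_SUMMARY`):

```python
KEY_CHANGE_SUMMARY = "__change_summary__"


def split_change_summary(batch_edit: dict[str, str]) -> tuple[dict[str, str], str | None]:
    """Separate the summary entry from real file edits, mirroring what
    assign_code_list_to_evo does before calling inject_files."""
    edits = dict(batch_edit)
    summary = edits.pop(KEY_CHANGE_SUMMARY, None)
    return edits, summary


edits, summary = split_change_summary(
    {"main.py": "print('hi')\n", KEY_CHANGE_SUMMARY: "Tuned the learning rate schedule."}
)
assert KEY_CHANGE_SUMMARY not in edits and summary == "Tuned the learning rate schedule."
```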
30 changes: 29 additions & 1 deletion rdagent/scenarios/data_science/dev/runner/eval.py
@@ -37,12 +37,31 @@ class DSRunnerFeedback(CoSTEERSingleFeedback):
acceptable: bool | None = None
hyperparameter_tuning_decision: bool | None = None
hyperparameter_tuning_suggestion: str | None = None
score: str | None = None

def is_acceptable(self) -> bool:
if self.acceptable is not None:
return self.acceptable
return super().is_acceptable()

def __str__(self) -> str:
parts = [
"### Execution",
str(self.execution),
"### Return Check",
self.return_checking if self.return_checking is not None else "No return checking",
"### Code",
str(self.code),
"### Validation Score",
f"{self.score}" if self.score else "Not available",
"### Final Decision",
f"This implementation is {'PASSED' if self.acceptable else 'FAILED'}.",
]
if self.hyperparameter_tuning_decision:
parts.append("### Hyperparameter Tuning Suggestion")
parts.append(str(self.hyperparameter_tuning_suggestion))
return "\n".join(parts)


class DSRunnerEvaluator(CoSTEEREvaluator):

@@ -74,6 +93,12 @@ def evaluate(
env=env, entry=get_clear_ws_cmd()
) # Remove previous submission and scores files generated by the workflow.

# get previous runner loops
task_info = target_task.get_task_information()
queried_former_failed_knowledge = (
queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
)[0]

# execute workflow
result = implementation.run(env=env, entry="python -m coverage run main.py")
stdout = result.stdout
@@ -161,14 +186,17 @@ def evaluate(
time_spent=f"{implementation.running_info.running_time:.2f} seconds",
timeout=f"{env.conf.running_timeout_period} seconds",
percent_of_timeout_used=f"{time_spent_ratio * 100:.2f}%",
queried_former_failed_knowledge=queried_former_failed_knowledge,
)

feedback = build_cls_from_json_with_retry(
DSRunnerFeedback,
system_prompt=system_prompt,
user_prompt=user_prompt,
init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
# init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
)
feedback.score = score_df.to_string() if score_ret_code == 0 else None
feedback.final_decision = feedback.acceptable and (not feedback.hyperparameter_tuning_decision)

if feedback and not DS_RD_SETTING.coder_on_whole_pipeline:
# remove unused files
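With the `final_decision` key dropped from the LLM output format (see the prompt changes below), the decision is now derived in code: a run is final only when it is acceptable and no further tuning round is requested. A small sketch of that rule (the function name is illustrative):

```python
def derive_final_decision(acceptable: bool | None, tuning_decision: bool | None) -> bool:
    """Mirror of the rule above: final only when acceptable and nothing left to tune."""
    return bool(acceptable) and not bool(tuning_decision)


assert derive_final_decision(True, False) is True    # accepted, no tuning requested -> done
assert derive_final_decision(True, True) is False    # accepted, but another tuning loop is wanted
assert derive_final_decision(False, False) is False  # not acceptable at all
```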
54 changes: 32 additions & 22 deletions rdagent/scenarios/data_science/dev/runner/prompts.yaml
@@ -25,13 +25,10 @@ DSCoSTEER_eval:
3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission.
If the code does not satisfy the requirements:
- Set "acceptable" to false.
- Set "final_decision" to false.
{% if enable_hyperparameter_tuning_check %}- set "hyperparameter_tuning_decision" to false.
- Set "hyperparameter_tuning_suggestion" to an empty string.
If the code satisfies the requirements:
- Set "acceptable" to true.
- Proceed to the next evaluation.

{% if enable_hyperparameter_tuning_check %}
# Evaluation 2: Hyperparameter
## Evaluation Description
The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
@@ -45,7 +42,6 @@ DSCoSTEER_eval:
3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
If the code satisfies the requirements:
- Set "hyperparameter_tuning_decision" to true.
- Set "final_decision" to false.
- Provide a reasonable suggestion in "hyperparameter_tuning_suggestion". The "hyperparameter_tuning_suggestion" should begin with a clear observation, followed by your suggestion. For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still going down and early stopping was not activated. Only 15% of the allowed time was used. [Suggestion] We recommend increasing epochs to 100 to avoid underfitting and further improve model performance."
If the code does not satisfy the requirements:
- Set "hyperparameter_tuning_decision" to false.
@@ -59,10 +55,11 @@ DSCoSTEER_eval:
"execution": "Describe whether the whole code base executed successfully and generated the final submission. Include any errors or issues encountered, and retain all error messages and traceback details.",
"return_checking": "Verify the generated files, particularly the submission file. Ensure that its format matches the sample submission",
"code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
"acceptable": <true/false: if the solution has paased execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,{% if enable_hyperparameter_tuning_check %}
"acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
{% if enable_hyperparameter_tuning_check %}
"hyperparameter_tuning_decision": <true/false>,
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
"final_decision": <true/false>,
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
{% endif %}
}
```
{% else %}
@@ -101,28 +98,32 @@ DSCoSTEER_eval:
"acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
{% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_decision": <true/false>,
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
"final_decision": <true/false>,
}
```
{% endif %}
# NOTE: when is_sub_enabled == False, we don't do any checking on the return value, so this is currently just a placeholder

user: |-
# Code base
# Current Code base
{{ code }}

## Stdout of code execution and testing
{{ stdout }}

# The time spend on code execution and timeout
{{ time_spent }}

## The timeout of code execution
{{ timeout }}

## The percent of timeout used
{{ percent_of_timeout_used }}

## Execution time and timeout
The execution time for current code base: {{ time_spent }}.
The total timeout: {{ timeout }}.
The percent of timeout used: {{ percent_of_timeout_used }}.

{% if queried_former_failed_knowledge|length != 0 %}
# Evolving History
{% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
### Summary of Changes
{{ former_failed_knowledge.implementation.change_summary }}
{{ former_failed_knowledge.feedback }}
{% endfor %}
{% endif %}

DSCoSTEER:
system_debugger: |-
{% include "scenarios.data_science.share:scen.role" %}
@@ -132,7 +133,6 @@ DSCoSTEER:
1. Code base.
2. Task description, which is the task the code is trying to solve.
3. Feedback generated during the execution of the whole workflow.
4. Suggestions for hyperparameter tuning.
Your job is to debug the whole code base, try to correct the errors, and ensure that the workflow can execute successfully on the full dataset.

## Task description
@@ -185,13 +185,23 @@ DSCoSTEER:
{% endif %}

user: |-
# Code Base
# Current Code Base
{{ code }}

## Feedback
## Feedback of Current Code Base
{{ feedback }}

{% if hyperparameter_tuning_suggestion is not none %}
## Hyperparameter Tuning Suggestion
{{ hyperparameter_tuning_suggestion }}
{% endif %}

{% if queried_former_failed_knowledge|length != 0 %}
# Evolving History
{% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
### Summary of Changes
{{ former_failed_knowledge.implementation.change_summary }}
### Validation Scores
{{ former_failed_knowledge.feedback.score }}
{% endfor %}
{% endif %}
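The new Evolving History blocks expect each record to expose `implementation.change_summary` plus a feedback object (whose `score` is used in the debugger prompt). A rough rendering sketch with stand-in objects, assuming the templates behave like plain Jinja; only the history loop from the template above is copied:

```python
from types import SimpleNamespace
from jinja2 import Template

# Stand-ins for the knowledge records the template iterates over; the real ones are
# former failed traces whose `implementation` is a workspace carrying change_summary.
history = [
    SimpleNamespace(
        implementation=SimpleNamespace(change_summary="Raised max epochs from 20 to 60."),
        feedback=SimpleNamespace(score="fold_0    0.812"),
    )
]

loop_snippet = Template(
    "{% for former_failed_knowledge in queried_former_failed_knowledge %}"
    "## Attempt {{ loop.index }}:\n"
    "### Summary of Changes\n{{ former_failed_knowledge.implementation.change_summary }}\n"
    "### Validation Scores\n{{ former_failed_knowledge.feedback.score }}\n"
    "{% endfor %}"
)
print(loop_snippet.render(queried_former_failed_knowledge=history))
```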