# Mirror of https://github.com/GammaTauAI/reflexion-human-eval
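"""Reflexion driver for HumanEval-style tasks: draft a function, check it
against model-generated internal tests, and self-reflect on failures until
the real test suite passes or the iteration budget runs out."""
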
from utils import enumerate_resume, write_jsonl
from executors import executor_factory
from generators import generator_factory
from typing import List


def run_reflexion(
    dataset: List[dict],
    model: str,
    language: str,
    max_iters: int,
    pass_at_k: int,
    log_path: str,
    verbose: bool
) -> None:
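    """Run Reflexion over `dataset`, appending per-item results to `log_path`.

    Each item gets up to `pass_at_k` passes; within a pass, a first attempt
    is refined up to `max_iters - 1` times using self-reflection on failing
    internal tests before being scored against the real test suite.
    """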
    exe = executor_factory(language)
    gen = generator_factory(language)

    num_items = len(dataset)
    num_success = 0
    # enumerate_resume appears to skip items already written to the log,
    # so interrupted runs can resume where they left off.
    for i, item in enumerate_resume(dataset, log_path):
        cur_pass = 0
        is_solved = False
        reflections = []
        cur_func_impl = ""
        while cur_pass < pass_at_k and not is_solved:
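            # generate internal unit tests from the prompt before writing any code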
            tests_i = gen.internal_tests(item["prompt"], model, 1)

            # first attempt
            cur_func_impl = gen.func_impl(item["prompt"], model, "simple")
            assert isinstance(cur_func_impl, str)
            is_passing, feedback, _ = exe.execute(cur_func_impl, tests_i)

            # if the internal tests pass, confirm against the real tests and exit early
            if is_passing:
                is_passing = exe.evaluate(
                    item["entry_point"], cur_func_impl, item["test"], timeout=10)
                is_solved = is_passing
                num_success += int(is_passing)
                break

            # use self-reflection to iteratively improve
            cur_iter = 1
            cur_feedback = feedback
            while cur_iter < max_iters:
                # get self-reflection
                reflection = gen.self_reflection(
                    cur_func_impl, cur_feedback, model)
                reflections += [reflection]

                # apply self-reflection in the next attempt
                cur_func_impl = gen.func_impl(
                    func_sig=item["prompt"],
                    model=model,
                    strategy="reflexion",
                    prev_func_impl=cur_func_impl,
                    feedback=cur_feedback,
                    self_reflection=reflection
                )
                assert isinstance(cur_func_impl, str)

                # check if all internal unit tests pass
                is_passing, cur_feedback, _ = exe.execute(cur_func_impl, tests_i)

                # if solved, or on the final iteration, check the real tests and exit early
                if is_passing or cur_iter == max_iters - 1:
                    is_passing = exe.evaluate(
                        item["entry_point"], cur_func_impl, item["test"], timeout=10)
                    if is_passing:
                        item["solution"] = cur_func_impl
                        is_solved = True
                        num_success += 1
                    break

                cur_iter += 1
            cur_pass += 1

        item["is_solved"] = is_solved
        item["reflections"] = reflections
        item["solution"] = cur_func_impl
        write_jsonl(log_path, [item], append=True)

        if verbose:
            print(
                f'completed {i+1}/{num_items}: acc = {round(num_success/(i+1), 2)}')
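

# A minimal usage sketch (an illustration, not part of the upstream file).
# Assumptions: each dataset item is a dict with "prompt", "entry_point", and
# "test" keys (HumanEval-style), the factories accept "py" as a language id,
# and the dataset path and model name below are placeholders.
if __name__ == "__main__":
    import json

    with open("humaneval.jsonl") as f:  # hypothetical dataset path
        dataset = [json.loads(line) for line in f]

    run_reflexion(
        dataset=dataset,
        model="gpt-4",          # placeholder model name
        language="py",          # assumed language id understood by the factories
        max_iters=2,
        pass_at_k=1,
        log_path="reflexion_log.jsonl",
        verbose=True,
    )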