Mirror of https://github.com/GammaTauAI/reflexion-human-eval (synced 2024-11-16 00:12:59 +00:00)

Commit 700742c0b6 (parent f7ad50a731): bad logging fixed
@@ -92,8 +92,9 @@ def run_reflexion_ucs(
         # if solved, exit early
         if is_passing:
             debug_print("solved at first attempt")
-            is_solved = True
-            num_success += 1
+            code = item["prompt"] + cur_func_impl
+            is_solved = evaluate(item["prompt"], code, item["test"])
+            num_success += 1 if is_solved else 0
             break

     reflection = self_reflection_generator(
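The hunk above stops counting a first-attempt success on the in-loop is_passing flag alone: it rebuilds the full program from the prompt and the current implementation and re-runs evaluate before incrementing num_success. A minimal sketch of that end-to-end check, assuming a HumanEval-style item dict; evaluate_sketch is a hypothetical name and the exec-based runner is only an illustration, not the repository's evaluate:

    def evaluate_sketch(item: dict, cur_func_impl: str) -> bool:
        # Assemble prompt (signature + docstring), completion, test suite, and the check call,
        # then run the whole program; any exception or failed assert counts as a failure.
        program = f'{item["prompt"]}{cur_func_impl}\n\n{item["test"]}\n\ncheck({item["entry_point"]})'
        try:
            exec(program, {})
            return True
        except Exception:
            return False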
@@ -18,6 +18,22 @@ def count_test_cases(test_str: str) -> int:
     # dumb way to do this but works
     return test_str.count("assert")

+# from executors.py_executor import py_evaluate
+# def validate_py_results(log_path: str):
+#     if not log_path.endswith(".jsonl"):
+#         raise ValueError("Please provide a valid log file")
+#     data = read_jsonl(log_path)
+#     num_success = 0
+#     for i, item in enumerate(data[117:122]):
+#         is_passing = py_evaluate(item["entry_point"], item["solution"], item["test"])
+#         if is_passing:
+#             print(f"Test {i}: {green_text('PASS')}")
+#             num_success += 1
+#         else:
+#             print(f"Test {i}: {red_text('FAIL')}")
+#     print(f"Summary: {num_success}/{len(data)} tests passed")
+#     print(f"Acc: {round(num_success/len(data), 2)} tests passed")
+
 def validate_py_results(log_path: str):
     if not log_path.endswith(".jsonl"):
         raise ValueError("Please provide a valid log file")
@@ -25,8 +41,9 @@ def validate_py_results(log_path: str):
     num_success = 0
     for i, item in enumerate(data):
         if item["is_solved"]:
-            func_impl = item["prompt"] + item["solution"]
-            code = f'{func_impl}\n\n{item["test"]}\n\ncheck({item["entry_point"]})'
+            # func_impl = item["prompt"] + item["solution"]
+            func_impl = item["solution"]
+            code = f'{item["prompt"]}{func_impl}\n\n{item["test"]}\n\ncheck({item["entry_point"]})'
             num_tests = count_test_cases(item["test"])
             try:
                 def handler(signum, frame):
@@ -41,7 +58,7 @@ def validate_py_results(log_path: str):
                 print(f"Test {i}: {green_text_out}")
                 num_success += 1
             except Exception:
-                red_text_out = red_text(f"failed!")
+                red_text_out = red_text(f"failed but should have passed!")
                 print(f"Test {i}: {red_text_out}")
         else:
             red_text_out = red_text(f"failed!")
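For context on the handler(signum, frame) and try/except lines kept around these edits, a self-contained sketch of the usual SIGALRM timeout pattern for running one assembled test program; run_with_timeout and the 5-second budget are assumptions for illustration (SIGALRM is Unix-only), not code from this commit:

    import signal

    def run_with_timeout(program: str, timeout_s: int = 5) -> bool:
        # Interrupt the exec'd program if it runs past the time budget.
        def handler(signum, frame):
            raise TimeoutError("test timed out")

        signal.signal(signal.SIGALRM, handler)
        signal.alarm(timeout_s)            # arm the alarm
        try:
            exec(program, {})              # program = prompt + solution + tests + check(entry_point)
            return True
        except Exception:
            return False
        finally:
            signal.alarm(0)                # always disarm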