Prompts

1 year ago · 94e7bf7d46
parent 818fc53c89
commit 94e7bf7d46
11 changed files with 315 additions and 47 deletions
--- a/4
+++ b/4
@ -0,0 +1,4 @@
+{"name": "split-message-based-on-limit", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 14972000, "question_id": "2563", "elapsed_time": 148, "compare_result": "0111111111111011000011000000000000000000000000000011100000000000000000000000000111111101111000", "code_output": "[\"this<1/10>\",\" is <2/10>\",\"real<3/10>\",\"ly a<4/10>\",\" ver<5/10>\",\"y aw<6/10>\",\"esom<7/10>\",\"e me<8/10>\",\"ssag<9/10>\",\"e<10/10>\"]", "std_output": "", "last_testcase": "\"this is really a very awesome message\"\n9", "expected_output": "[\"thi<1/14>\",\"s i<2/14>\",\"s r<3/14>\",\"eal<4/14>\",\"ly <5/14>\",\"a v<6/14>\",\"ery<7/14>\",\" aw<8/14>\",\"eso<9/14>\",\"me<10/14>\",\" m<11/14>\",\"es<12/14>\",\"sa<13/14>\",\"ge<14/14>\"]", "task_finish_time": 1681661576324, "total_correct": 30, "total_testcases": 94, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "934796442", "input_formatted": "\"this is really a very awesome message\", 9", "input": "\"this is really a very awesome message\"\n9", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
+{"name": "maximum-number-of-non-overlapping-palindrome-substrings", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
+{"name": "minimum-total-distance-traveled", "status": "Accepted", "reward": true, "info": {"status_code": 10, "lang": "python3", "run_success": true, "status_runtime": "4528 ms", "memory": 94288000, "question_id": "2554", "elapsed_time": 4649, "compare_result": "1111111111111111111111111111111111111111", "code_output": "", "std_output": "", "last_testcase": "", "expected_output": "", "task_finish_time": 1681662197849, "total_correct": 40, "total_testcases": 40, "runtime_percentile": 5.600000000000221, "status_memory": "94.3 MB", "memory_percentile": 49.60000000000006, "pretty_lang": "Python3", "submission_id": "934801257", "status_msg": "Accepted", "state": "SUCCESS"}}
+{"name": "next-greater-element-iv", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 27900000, "question_id": "2549", "elapsed_time": 1120, "compare_result": "010000001000000000000000000000000000000000010100000", "code_output": "[4,9,9,-1,-1]", "std_output": "", "last_testcase": "[2,4,0,9,6]", "expected_output": "[9,6,6,-1,-1]", "task_finish_time": 1681662380199, "total_correct": 4, "total_testcases": 51, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "934802716", "input_formatted": "[2,4,0,9,6]", "input": "[2,4,0,9,6]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
--- a/executors/leet_executor.py
+++ b/executors/leet_executor.py
@ -24,7 +24,11 @@ class LeetExecutor(Executor):
        from .leetcode_env.leetcode_env.leetcode_types import LeetCodeSubmission
        from .leetcode_env.leetcode_env.utils import id_from_slug
        print(f'Timeout is {timeout} seconds')
-        leetcode_formatted_func = self.formatter.to_leetcode(func)
+        try:
+            leetcode_formatted_func = self.formatter.to_leetcode(func)
+        except Exception as e:
+            print(f'Error formatting function to leetcode: {e}')
+            return False
        print('----------------- LEETCODE SUBMISSION ------------------')
        print(leetcode_formatted_func)
        print('--------------------------------------------------------')
--- a/executors/py_executor.py
+++ b/executors/py_executor.py
@ -2,7 +2,7 @@ import ast
 import signal
 import astunparse

-from .executor_utils import timeout_handler, function_with_timeout
+from .executor_utils import function_with_timeout

 from typing import List
 from .executor_types import ExecuteResult, Executor
@ -78,11 +78,9 @@ def get_call_str(assert_statement: str) -> str:

 def get_output(func: str, assert_statement: str, timeout: int = 5) -> str:
    try:
+        exec(f"from typing import *\n{func}", globals())
        func_call = get_call_str(assert_statement)
-        to_eval = f"from typing import *\n{func}\n{func_call}"
-        exec(func, globals())
-        output = function_with_timeout(eval, (func_call,globals()), timeout)
-
+        output = function_with_timeout(eval, (func_call, globals()), timeout)
        return output
    except TimeoutError:
        return "TIMEOUT"
--- a/generators/generator_utils.py
+++ b/generators/generator_utils.py
@ -21,6 +21,7 @@ def generic_generate_func_impl(
    num_comps,
    temperature,
    REFLEXION_CHAT_INSTRUCTION: str,
+    REFLEXION_FEW_SHOT: str,
    SIMPLE_CHAT_INSTRUCTION: str,
    REFLEXION_COMPLETION_INSTRUCTION: str,
    SIMPLE_COMPLETION_INSTRUCTION: str,
@ -35,7 +36,7 @@ def generic_generate_func_impl(

    if model == "gpt-4" or model == "gpt-3.5-turbo":
        if strategy == "reflexion":
-            message = f"previous implementation:\n{prev_func_impl}\n\nunit tests:\n{feedback}\n\nhint:\n{self_reflection}\n\n# improved implementation\n{func_sig}"
+            message = f"{REFLEXION_FEW_SHOT}\n[previous impl]:\n{prev_func_impl}\n\n[unit test results from previous impl]:\n{feedback}\n\n[reflection on previous impl]:\n{self_reflection}\n\n[improved impl]:\n{func_sig}"
            # func_bodies is a really bad name, as it can also be just 1 string
            print('----------------------- SYSTEM MESSAGE -----------------------')
            print(REFLEXION_CHAT_INSTRUCTION)
@ -88,38 +89,31 @@ def generic_generate_internal_tests(
        TEST_GENERATION_COMPLETION_INSTRUCTION: str,
        parse_tests: Callable[[str], List[str]],
        is_syntax_valid: Callable[[str], bool],
+        is_react: bool = False
 ) -> List[str]:
    """
    Generates tests for a function using a refinement technique with the number
    of specified committee members.
    """
    if model == "gpt-4" or model == "gpt-3.5-turbo":
-        message = f'{TEST_GENERATION_FEW_SHOT}\n\nfunc signature:\n{func_sig}\nunit tests:'
-        print('----------------------- SYSTEM MESSAGE -----------------------')
-        print(TEST_GENERATION_CHAT_INSTRUCTION)
-        print('----------------------------------------------')
-        print(' ----------------------- USER MESSAGE -----------------------')
-        print(message, flush=True)
-        print('----------------------------------------------')
-        output = gpt_chat(
-            model, TEST_GENERATION_CHAT_INSTRUCTION, message, max_tokens=1024)
+        if is_react:
+            message = f'{TEST_GENERATION_FEW_SHOT}\n\n[func signature]:\n{func_sig}\n\n[think]:'
+            output = gpt_chat(
+                model, TEST_GENERATION_CHAT_INSTRUCTION, message, max_tokens=1024)
+            print(f'React test generation output: {output}')
+        else:
+            message = f'{TEST_GENERATION_FEW_SHOT}\n\nfunc signature:\n{func_sig}\nunit tests:'
+            output = gpt_chat(
+                model, TEST_GENERATION_CHAT_INSTRUCTION, message, max_tokens=1024)
    else:
        prompt = f'{TEST_GENERATION_COMPLETION_INSTRUCTION}\n\nfunc signature:\n{func_sig}\nunit tests:'
        output = gpt_completion(model, prompt, max_tokens=1024)
    all_tests = parse_tests(output)  # type: ignore
    valid_tests = [test for test in all_tests if is_syntax_valid(test)]

-    # TODO: NOT SUPPORTED YET
-    # someone implement this
-    # cur_refinement_num = 0
-    # while cur_refinement_num < committee_size:
-    # # TODO: implement
-    # cur_tests = ... # type: ignore
-
-    # cur_refinement_num += 1
-    print('--------------- GENERATED TESTS: ---------------')
-    print(valid_tests)
-    print('------------------------------------------------')
+    # n = 3
+    # first_n = min(len(valid_tests), n)
+    # valid_tests = valid_tests[:first_n]

    return sample_n_random(valid_tests, max_num_tests)

@ -130,22 +124,23 @@ def generic_generate_self_reflection(
        model: str,
        SELF_REFLECTION_CHAT_INSTRUCTION: str,
        SELF_REFLECTION_COMPLETION_INSTRUCTION: str,
+        SELF_REFLECTION_FEW_SHOT: Optional[str] = None
 ) -> str:
    if model == "gpt-4" or model == "gpt-3.5-turbo":
-        print('----------------------- SYSTEM MESSAGE -----------------------')
-        print(SELF_REFLECTION_CHAT_INSTRUCTION)
-        print('----------------------------------------------')
-        print(' ----------------------- USER MESSAGE -----------------------')
-        print(f'{func}\n\n{feedback}\n\nExplanation:', flush=True)
-        print('----------------------------------------------')
-        reflection = gpt_chat(
-            model, SELF_REFLECTION_CHAT_INSTRUCTION, f'{func}\n\n{feedback}\n\nExplanation:')
+        if SELF_REFLECTION_FEW_SHOT is not None:
+            reflection = gpt_chat(
+                model,
+                SELF_REFLECTION_CHAT_INSTRUCTION,
+                f'{SELF_REFLECTION_FEW_SHOT}\n\n[function impl]:\n{func}\n\n[unit test results]:\n{feedback}\n\n[self-reflection]:')
+            print(f'Self reflection output: {reflection}')
+        else:
+            reflection = gpt_chat(
+                model,
+                SELF_REFLECTION_CHAT_INSTRUCTION,
+                f'Function implementation:\n{func}\n\nUnit test results:\n{feedback}\n\nSelf-reflection:')
    else:
        reflection = gpt_completion(
            model, f'{SELF_REFLECTION_COMPLETION_INSTRUCTION}\n{func}\n\n{feedback}\n\nExplanation:')
-    print('--------------- GENERATED SELF REFLECTION: ---------------')
-    print(reflection)
-    print('----------------------------------------------------------')
    return reflection  # type: ignore


--- a/generators/py_generate.py
+++ b/generators/py_generate.py
@ -9,12 +9,203 @@ PY_SIMPLE_COMPLETION_INSTRUCTION = "# Write the body of this function only."
 PY_REFLEXION_COMPLETION_INSTRUCTION = "You are PythonGPT. You will be given your past function implementation, a series of unit tests, and a hint to change the implementation appropriately. Apply the changes below by writing the body of this function only.\n\n-----"
 PY_SELF_REFLECTION_COMPLETION_INSTRUCTION = "You are PythonGPT. You will be given a function implementation and a series of unit tests. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as a hint when you try again later. Only provide the few sentence description in your answer, not the implementation.\n\n-----"

-PY_SIMPLE_CHAT_INSTRUCTION = "You are PythonGPT, an AI that only responds with python code, NOT ENGLISH. You will be given a function signature and its docstring by the user. Respond only in code with correct implementation of the function. Do not include provided the docstring in your response." # The first line of your response should have 4 spaces of indentation so that it fits syntactically with the user provided signature.
-PY_REFLEXION_CHAT_INSTRUCTION = "You are PythonGPT. You will be given your past function implementation, a series of unit tests, and a hint to change the implementation appropriately. Apply the changes below by writing the body of this function only. You should fill in the following text of the missing function body. For example, the first line of the completion should have 4 spaces for the indendation so that it fits syntactically with the preceding signature."
-PY_SELF_REFLECTION_CHAT_INSTRUCTION = "You are PythonGPT. You will be given a function implementation and a series of unit tests. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as a hint when you try again later. Only provide the few sentence description in your answer, not the implementation."
+PY_SIMPLE_CHAT_INSTRUCTION = "You are PythonGPT, an AI that only responds with only python code. You will be given a function signature and its docstring by the user. Respond only in code with a correct, efficient implementation of the function. Do not include provided the docstring in your response." # The first line of your response should have 4 spaces of indentation so that it fits syntactically with the user provided signature.
+PY_REFLEXION_CHAT_INSTRUCTION = "You are PythonGPT. You will be given your previous implementation of a function, a series of unit tests results, and your self-reflection on your previous implementation. Apply the necessary changes below by responding only with the improved body of the function. Do not include the signature in your response. The first line of your response should have 4 spaces of indentation so that it fits syntactically with the user provided signature. You will be given a few examples by the user."
+PY_REFLEXION_FEW_SHOT_ADD = '''Example 1:
+[previous impl]:
+def add(a: int, b: int) -> int:
+    """
+    Given integers a and b, return the total value of a and b.
+    """
+    return a - b
+
+[unit test results from previous impl]:
+Tested passed:
+
+Tests failed:
+assert add(1, 2) == 3 # output: -1
+assert add(1, 2) == 4 # output: -1
+
+[reflection on previous impl]:
+The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from `-` to `+` in the return statement. This will ensure that the function returns the correct output for the given input.
+
+[improved impl]:
+def add(a: int, b: int) -> int:
+    """
+    Given integers a and b, return the total value of a and b.
+    """
+    return a + b
+'''
+
+PY_REFLEXION_FEW_SHOT = '''Example 1:
+[previous impl]:
+from typing import *
+def fullJustify(words: List[str], maxWidth: int) -> List[str]:
+    """
+    Given an array of words and a width maxWidth, format the text such that each line has exactly maxWidth characters and is fully (left and right) justified.
+    You should pack your words in a greedy approach; that is, pack as many words as you can in each line. Pad extra spaces `' '` when necessary so that each line has exactly maxWidth characters.
+    Extra spaces between words should be distributed as evenly as possible. If the number of spaces on a line do not divide evenly between words, the empty slots on the left will be assigned more spaces than the slots on the right.
+    For the last line of text, it should be left justified and no extra space is inserted between words.
+    Note:
+    A word is defined as a character sequence consisting of non-space characters only.
+    Each word's length is guaranteed to be greater than 0 and not exceed maxWidth.
+    The input array `words` contains at least one word.
+    """
+    res = []
+    cur_line = []
+    cur_len = 0
+
+    for word in words:
+        if cur_len + len(word) + len(cur_line) > maxWidth:
+            if len(cur_line) == 1:
+                res.append(cur_line[0] + ' ' * (maxWidth - cur_len))
+            else:
+                spaces = maxWidth - cur_len
+                space_between = spaces // (len(cur_line) - 1)
+                extra_spaces = spaces % (len(cur_line) - 1)
+                line = ''
+                for i, w in enumerate(cur_line[:-1]):
+                    line += w + ' ' * (space_between + (i < extra_spaces))
+                line += cur_line[-1]
+                res.append(line)
+            cur_line = []
+            cur_len = 0
+        cur_line.append(word)
+        cur_len += len(word)
+
+    last_line = ' '.join(cur_line)
+    last_line += ' ' * (maxWidth - len(last_line))
+    res.append(last_line)
+
+    return res
+
+[unit test results from previous impl]:
+Tested passed:
+
+Tests failed:
+assert fullJustify([], 10) == [] # output: ['          ']
+assert fullJustify([], 0) == [] # output: ['']

-PY_TEST_GENERATION_FEW_SHOT = """For example:
+[reflection on previous impl]:
+The implementation failed the test cases where the input list of words is empty. The issue arises because the code does not handle the case where there are no words to process. As a result, it still appends a line with spaces to the result list, even when there are no words. To fix this issue, we should add a condition at the beginning of the function to check if the input list is empty, and return an empty list if it is. This will ensure that the function returns the correct output for empty input lists.

+[improved impl]:
+from typing import *
+def fullJustify(words: List[str], maxWidth: int) -> List[str]:
+    """
+    Given an array of words and a width maxWidth, format the text such that each line has exactly maxWidth characters and is fully (left and right) justified.
+    You should pack your words in a greedy approach; that is, pack as many words as you can in each line. Pad extra spaces `' '` when necessary so that each line has exactly maxWidth characters.
+    Extra spaces between words should be distributed as evenly as possible. If the number of spaces on a line do not divide evenly between words, the empty slots on the left will be assigned more spaces than the slots on the right.
+    For the last line of text, it should be left justified and no extra space is inserted between words.
+    Note:
+    A word is defined as a character sequence consisting of non-space characters only.
+    Each word's length is guaranteed to be greater than 0 and not exceed maxWidth.
+    The input array `words` contains at least one word.
+    """
+    if not words:
+        return []
+
+    res = []
+    cur_line = []
+    cur_len = 0
+
+    for word in words:
+        if cur_len + len(word) + len(cur_line) > maxWidth:
+            if len(cur_line) == 1:
+                res.append(cur_line[0] + ' ' * (maxWidth - cur_len))
+            else:
+                spaces = maxWidth - cur_len
+                space_between = spaces // (len(cur_line) - 1)
+                extra_spaces = spaces % (len(cur_line) - 1)
+                line = ''
+                for i, w in enumerate(cur_line[:-1]):
+                    line += w + ' ' * (space_between + (i < extra_spaces))
+                line += cur_line[-1]
+                res.append(line)
+            cur_line = []
+            cur_len = 0
+        cur_line.append(word)
+        cur_len += len(word)
+
+    last_line = ' '.join(cur_line)
+    last_line += ' ' * (maxWidth - len(last_line))
+    res.append(last_line)
+
+    return res
+END EXAMPLES
+
+'''
+
+PY_SELF_REFLECTION_CHAT_INSTRUCTION = "You are PythonGPT. You will be given a function implementation and a series of unit test results. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as guidance when you try again later. Only provide the few sentence description in your answer, not the implementation. You will be given a few examples by the user."
+PY_SELF_REFLECTION_FEW_SHOT = """Example 1:
+[function impl]:
+def longest_subarray_with_sum_limit(nums: List[int], target: int) -> List[int]:
+    n = len(nums)
+    left, right = 0, 0
+    max_length = 0
+    current_sum = 0
+    result = []
+    while right < n:
+        current_sum += nums[right]
+        while current_sum > target:
+            current_sum -= nums[left]
+            left += 1
+        if right - left + 1 >= max_length:
+            max_length = right - left + 1
+            result = nums[left:right+1]
+        right += 1
+    return result
+[unit test results]:
+Tests passing:
+assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 8) == [1, 2, 3]
+assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 15) == [1, 2, 3, 4, 5]
+assert longest_subarray_with_sum_limit([1, -1, 2, -2, 3, -3], 2) == [1, -1, 2, -2, 3]
+assert longest_subarray_with_sum_limit([], 10) == []
+assert longest_subarray_with_sum_limit([], 0) == []
+assert longest_subarray_with_sum_limit([], -5) == []  
+Tests failing:
+assert longest_subarray_with_sum_limit([5, 6, 7, 8, 9], 4) == [] # output: [5]
+[self-reflection]:
+The implementation failed the where no subarray fulfills the condition. The issue in the implementation is due to the use of >= instead of > in the condition to update the result. Because of this, it returns a subarray even when the sum is greater than the target, as it still updates the result when the current subarray length is equal to the previous longest subarray length. To overcome this error, we should change the condition to only update the result when the current subarray length is strictly greater than the previous longest subarray length. This can be done by replacing >= with > in the condition.
+
+Example 2:
+[function impl]:
+def longest_subarray_with_sum_limit(nums: List[int], target: int) -> List[int]:
+    n = len(nums)
+    left, right = 0, 0
+    max_length = 0
+    current_sum = 0
+    result = []
+    while current_sum + nums[right] <= target:
+        current_sum += nums[right]
+        right += 1
+    while right < n:
+        current_sum += nums[right]
+        while current_sum > target:
+            current_sum -= nums[left]
+            left += 1
+        if right - left + 1 > max_length:
+            max_length = right - left + 1
+            result = nums[left:right+1]
+        right += 1
+    return result
+[unit test results]:
+Tests passing:
+assert longest_subarray_with_sum_limit([], 10) == []
+assert longest_subarray_with_sum_limit([], 0) == []
+assert longest_subarray_with_sum_limit([], -5) == []
+Tests failing:
+assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 8) == [1, 2, 3] # output: list index out of range
+assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 15) == [1, 2, 3, 4, 5] # output: list index out of range
+assert longest_subarray_with_sum_limit([5, 6, 7, 8, 9], 4) == [] # output: list index out of range
+assert longest_subarray_with_sum_limit([1, -1, 2, -2, 3, -3], 2) == [1, -1, 2, -2, 3] # output: list index out of range
+[self-reflection]:
+The implementation failed 4 out of the 7 test cases due to an IndexError. The issue stems from the while loop while current_sum + nums[right] <= target:, which directly accesses nums[right] without checking if right is within the bounds of the list. This results in a runtime error when right goes beyond the list length. To overcome this error, we need to add a bounds check for the right variable in the mentioned while loop. We can modify the loop condition to while right < len(nums) and current_sum + nums[right] <= target:. This change will ensure that we only access elements within the bounds of the list, thus avoiding the IndexError.
+END OF EXAMPLES
+
+"""
+
+PY_TEST_GENERATION_FEW_SHOT = """Examples:
 func signature:
 def has_close_elements(numbers: List[float], threshold: float) -> bool:
    \"\"\" Check if in given list of numbers, are any two numbers closer to each other than
@ -24,7 +215,6 @@ def has_close_elements(numbers: List[float], threshold: float) -> bool:
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    \"\"\"
-
 unit tests:
 assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
 assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
@ -34,11 +224,36 @@ assert has_close_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
 assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
 assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False"""

-PY_TEST_GENERATION_COMPLETION_INSTRUCTION = f"""You are PythonGPT, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring.
+PY_TEST_GENERATION_FEW_SHOT_REACT = '''Example 1:
+[func signature]:
+def longest_subarray_with_sum_limit(nums: List[int], target: int) -> List[int]:
+    """
+    Given a list of integers nums and an integer target, write a function that returns the longest contiguous subarray of nums with a sum less than or equal to target.
+    If there is no subarray with sum less than or equal to target, return an empty list.
+    """
+[think]:
+Okay, let's think of some test cases for this longest_subarray_with_sum_limit function. We need to cover various edge cases. First, let's test with a simple list of positive integers and a target value where a subarray exists. Next, we could test with a target value greater than the sum of all elements in the list, which should return the entire list. We should also test a case where no subarray fulfills the condition, so the function should return an empty list. It would be interesting to include negative numbers in the list and test if the function handles them correctly. Finally, we should also test with an empty list and different target values to make sure the function can handle those cases.
+[unit tests]:
+# Test case 1: Simple list of positive integers with existing subarray
+assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 8) == [1, 2, 3]
+# Test case 2: Target value greater than sum of all elements in the list
+assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 15) == [1, 2, 3, 4, 5]
+# Test case 3: No subarray fulfills the condition
+assert longest_subarray_with_sum_limit([5, 6, 7, 8, 9], 4) == []
+# Test case 4: List with negative numbers
+assert longest_subarray_with_sum_limit([1, -1, 2, -2, 3, -3], 2) == [1, -1, 2, -2, 3]
+# Test case 5: Empty list with different target values
+assert longest_subarray_with_sum_limit([], 10) == []
+assert longest_subarray_with_sum_limit([], 0) == []
+assert longest_subarray_with_sum_limit([], -5) == []
+END OF EXAMPLES
+'''

+PY_TEST_GENERATION_COMPLETION_INSTRUCTION = f"""You are PythonGPT, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring.
 {PY_TEST_GENERATION_FEW_SHOT}"""

 PY_TEST_GENERATION_CHAT_INSTRUCTION = """You are CodexGPT, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring."""
+PY_TEST_GENERATION_CHAT_INSTRUCTION_REACT = """You are CodexGPT, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the description and signature. You will first 'think', brainstorming possible edge cases and points of failure. Then, you will write a series of unit tests that reflect your thinking. Make sure that tests conform to any constraints. You will be given a few examples."""

 class PyGenerator(Generator):
    def self_reflection(self, func: str, feedback: str, model: str) -> str:
@ -48,6 +263,7 @@ class PyGenerator(Generator):
            model=model,
            SELF_REFLECTION_CHAT_INSTRUCTION=PY_SELF_REFLECTION_CHAT_INSTRUCTION,
            SELF_REFLECTION_COMPLETION_INSTRUCTION=PY_SELF_REFLECTION_COMPLETION_INSTRUCTION,
+            SELF_REFLECTION_FEW_SHOT=PY_SELF_REFLECTION_FEW_SHOT
        )
        return x

@ -72,6 +288,7 @@ class PyGenerator(Generator):
            num_comps=num_comps,
            temperature=temperature,
            REFLEXION_CHAT_INSTRUCTION=PY_REFLEXION_CHAT_INSTRUCTION,
+            REFLEXION_FEW_SHOT = PY_REFLEXION_FEW_SHOT_ADD,
            SIMPLE_CHAT_INSTRUCTION=PY_SIMPLE_CHAT_INSTRUCTION,
            REFLEXION_COMPLETION_INSTRUCTION=PY_REFLEXION_COMPLETION_INSTRUCTION,
            SIMPLE_COMPLETION_INSTRUCTION=PY_SIMPLE_COMPLETION_INSTRUCTION,
--- a/reflexion.py
+++ b/reflexion.py
@ -59,7 +59,7 @@ def run_reflexion(
                    strategy="reflexion",
                    prev_func_impl=cur_func_impl,
                    feedback=cur_feedback,
-                    self_reflection=reflection
+                    self_reflection=reflection,
                )
                assert isinstance(cur_func_impl, str)

--- a/root/reflexion_leetcode_python3_gpt4_react/leetcode-hard-py-40-uncontaminated._reflexion_5_gpt-4_pass_at_k_1_py.jsonl
+++ b/root/reflexion_leetcode_python3_gpt4_react/leetcode-hard-py-40-uncontaminated._reflexion_5_gpt-4_pass_at_k_1_py.jsonl
--- a/root/reflexion_leetcode_python3_gpt4_react_constraints/leetcode-hard-py-40-uncontaminated-constraints._reflexion_5_gpt-4_pass_at_k_1_py.jsonl
+++ b/root/reflexion_leetcode_python3_gpt4_react_constraints/leetcode-hard-py-40-uncontaminated-constraints._reflexion_5_gpt-4_pass_at_k_1_py.jsonl
--- a/root/reflexion_leetcode_python3_gpt4_react_constraints_3_tests/leetcode-hard-py-40-uncontaminated-constraints._reflexion_5_gpt-4_pass_at_k_1_py.jsonl
+++ b/root/reflexion_leetcode_python3_gpt4_react_constraints_3_tests/leetcode-hard-py-40-uncontaminated-constraints._reflexion_5_gpt-4_pass_at_k_1_py.jsonl
--- a/root/reflexion_leetcode_python3_gpt4_react_constraints_test/leetcode-hard-py-40-uncontaminated-constraints._reflexion_5_gpt-4_pass_at_k_1_py.jsonl
+++ b/root/reflexion_leetcode_python3_gpt4_react_constraints_test/leetcode-hard-py-40-uncontaminated-constraints._reflexion_5_gpt-4_pass_at_k_1_py.jsonl
--- a/run_reflexion_py_leet.sh
+++ b/run_reflexion_py_leet.sh
@ -1,7 +1,7 @@
 python main.py \
-  --run_name "reflexion_leetcode_python3_gpt4" \
+  --run_name "reflexion_leetcode_python3_gpt4_react_constraints" \
  --root_dir "root" \
-  --dataset_path ./executors/leetcode_env/leetcode_dataset/data/humaneval/leetcode-hard-py-40-uncontaminated.jsonl \
+  --dataset_path ./executors/leetcode_env/leetcode_dataset/data/humaneval/leetcode-hard-py-40-uncontaminated-constraints.jsonl \
  --strategy "reflexion" \
  --language "py" \
  --model "gpt-4" \