Merge branch 'py-prompts'

This commit is contained in:
Beck LaBash 2023-05-18 20:17:47 -04:00
commit 5b6a1bd990
31 changed files with 866 additions and 148 deletions

51
evaluate_leet_results.py Normal file
View File

@ -0,0 +1,51 @@
# Usage: python evaluate_leet_results.py <lang> <input_log_path> <output_log_path>
#
# Re-evaluates every implementation stored in a JSONL log by submitting it to
# LeetCode through LeetCodeEnv and attaching the outcome of each submission to
# the corresponding log entry under the "evaluations" key.
import sys

from executors.leetcode_env.leetcode_env.environment import LeetCodeEnv
from executors.leetcode_env.leetcode_env.leetcode_types import LeetCodeSubmission, ProgrammingLanguage
from executors.leetcode_env.leetcode_env.utils import PySubmissionFormatter, RsSubmissionFormatter
from utils import read_jsonl

# sys.argv[0] is the script name, so the three documented arguments make
# len(sys.argv) == 4. (The previous check required 3 and then read
# sys.argv[3], which could never succeed.)
if len(sys.argv) != 4:
    sys.exit(
        "Provide a language [py, rs], input and output log file\n"
        "Usage: python evaluate_leet_results.py <lang> <input_log_path> <output_log_path>"
    )

lang = sys.argv[1]
input_log_path = sys.argv[2]
# NOTE(review): output_log_path is parsed but nothing below writes to it, so
# the collected evaluations only live in memory. Confirm how results are meant
# to be persisted (a JSONL writer is not visible from this file).
output_log_path = sys.argv[3]

# Map the CLI language flag to the submission formatter and LeetCode language.
if lang == "py":
    formatter = PySubmissionFormatter
    lang = ProgrammingLanguage.PYTHON
elif lang == "rs":
    formatter = RsSubmissionFormatter
    lang = ProgrammingLanguage.RUST
else:
    raise ValueError("Provide a valid language (rs or py)")

lines = read_jsonl(input_log_path)

# Validate the whole log up front so no submissions are made for a bad file.
for line in lines:
    assert "implementations" in line, "Log file must contain implementations"

env = LeetCodeEnv()

for line in lines:
    line["evaluations"] = []
    for impl in line["implementations"]:
        # NOTE(review): impl is passed to the formatter like a source string
        # but is also indexed with "task_id" like a mapping — verify the log
        # schema (the slug may live on `line` instead).
        submission = LeetCodeSubmission(
            code=formatter.to_leetcode(impl),
            lang=lang,
            question_slug=impl["task_id"],
        )
        status, reward, done, info = env.step(submission)
        line["evaluations"].append({
            "status": status,
            "reward": reward,
            "done": done,
            "info": info,
        })
        # NOTE(review): reset placement reconstructed from a flattened diff;
        # assumed to run after every submission — confirm.
        env.reset()

View File

View File

@ -1,79 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1, -1, 0, -1, 1]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def minReverseOperations(n: int, p: int, banned: List[int], k: int) -> List[int]:\n",
" \"\"\"\n",
" You are given an integer n and an integer p in the range [0, n - 1]. Representing a 0-indexed array arr of length n where all positions are set to 0's, except position p which is set to 1.\n",
" You are also given an integer array banned containing some positions from the array. For the ith position in banned, arr[banned[i]] = 0, and banned[i] != p.\n",
" You can perform multiple operations on arr. In an operation, you can choose a subarray with size k and reverse the subarray. However, the 1 in arr should never go to any of the positions in banned. In other words, after each operation arr[banned[i]] remains 0.\n",
" Return an array ans where for each i from [0, n - 1], ans[i] is the minimum number of reverse operations needed to bring the 1 to position i in arr, or -1 if it is impossible.\n",
" A subarray is a contiguous non-empty sequence of elements within an array.\n",
" The values of ans[i] are independent for all i's.\n",
" The reverse of an array is an array containing the values in reverse order.\n",
" \"\"\"\n",
" from typing import List\n",
"\n",
" ans = [-1] * n\n",
" banned_set = set(banned)\n",
" \n",
" for i in range(n):\n",
" if i == p or i in banned_set:\n",
" continue\n",
" \n",
" distance = abs(i - p)\n",
" if distance % k == 0:\n",
" operations = distance // k\n",
" valid = True\n",
" for j in range(p, i, k if i > p else -k):\n",
" if j in banned_set or (j + k) in banned_set:\n",
" valid = False\n",
" break\n",
" if valid:\n",
" ans[i] = operations\n",
" \n",
" ans[p] = 0\n",
" return ans\n",
"\n",
"minReverseOperations(5, 2, [1, 3], 2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,15 +1,14 @@
from .leetcode_env.leetcode_env.leetcode_types import ProgrammingLanguage
from .py_executor import PyExecutor
from .rs_executor import RsExecutor
from .leet_executor import LeetExecutor
from .executor_types import Executor
from .leetcode_env.leetcode_env.utils import PySubmissionFormatter, RsSubmissionFormatter
from .leet_executor import LeetExecutor
def executor_factory(lang: str, is_leet: bool = False) -> Executor:
if lang == "py" or lang == "python":
if is_leet:
print("Using LeetCode Python executor")
from .leetcode_env.leetcode_env.leetcode_types import ProgrammingLanguage
from .leetcode_env.leetcode_env.utils import PySubmissionFormatter, RsSubmissionFormatter
return LeetExecutor(ProgrammingLanguage.PYTHON3,
PyExecutor(),
PySubmissionFormatter)
@ -17,6 +16,8 @@ def executor_factory(lang: str, is_leet: bool = False) -> Executor:
return PyExecutor()
elif lang == "rs" or lang == "rust":
if is_leet:
from .leetcode_env.leetcode_env.leetcode_types import ProgrammingLanguage
from .leetcode_env.leetcode_env.utils import PySubmissionFormatter, RsSubmissionFormatter
return LeetExecutor(ProgrammingLanguage.RUST,
RsExecutor(),
RsSubmissionFormatter)

View File

@ -1,21 +1,18 @@
from __future__ import annotations
from typing import List
from .executor_types import ExecuteResult, Executor
from .executor_utils import to_jsonl
from .leetcode_env.leetcode_env.utils import SubmissionFormatter
from .leetcode_env.leetcode_env.environment import LeetCodeEnv
from .leetcode_env.leetcode_env.leetcode_types import ProgrammingLanguage, LeetCodeSubmission
from .leetcode_env.leetcode_env.utils import id_from_slug
from datetime import datetime
class LeetExecutor(Executor):
from .leetcode_env.leetcode_env.utils import SubmissionFormatter
from .leetcode_env.leetcode_env.leetcode_types import ProgrammingLanguage
def __init__(self, lang: ProgrammingLanguage, executor: Executor, formatter: SubmissionFormatter):
def __init__(self, lang, executor: Executor, formatter):
from .leetcode_env.leetcode_env.utils import SubmissionFormatter
from .leetcode_env.leetcode_env.leetcode_types import ProgrammingLanguage
from .leetcode_env.leetcode_env.environment import LeetCodeEnv
assert isinstance(formatter, SubmissionFormatter)
assert isinstance(lang, ProgrammingLanguage)
self.lang = lang
self.executor = executor
self.formatter = formatter
@ -26,19 +23,36 @@ class LeetExecutor(Executor):
return self.executor.execute(func, tests, timeout)
def evaluate(self, name: str, func: str, test: str, timeout: int = 5) -> bool:
from .leetcode_env.leetcode_env.leetcode_types import LeetCodeSubmission
from .leetcode_env.leetcode_env.utils import id_from_slug
print(f'Timeout is {timeout} seconds')
leetcode_formatted_func = self.formatter.to_leetcode(func)
try:
leetcode_formatted_func = self.formatter.to_leetcode(func)
except Exception as e:
print(f'Error formatting function to leetcode: {e}')
return False
print('----------------- LEETCODE SUBMISSION ------------------')
print(leetcode_formatted_func)
print('--------------------------------------------------------')
submission = LeetCodeSubmission(
code=self.formatter.to_leetcode(func),
code=leetcode_formatted_func,
lang=self.lang,
question_id=id_from_slug(name, self.env.api_instance),
question_slug=name,
timeout=timeout
)
_, reward, _, _ = self.env.step(submission)
status, reward, _, info = self.env.step(submission)
print('----------------- LEETCODE SUBMISSION ------------------')
print(status)
print('--------------------------------------------------------')
to_jsonl({
'name': name,
'status': status,
'reward': reward,
'info': info
}, self.name)
return reward

@ -1 +1 @@
Subproject commit 61f4969745189177a9edd8306217c66d6b8f9edb
Subproject commit 2ab159560725cf8482600ca1d0adf55d1b315c14

View File

@ -2,7 +2,7 @@ import ast
import signal
import astunparse
from .executor_utils import timeout_handler, function_with_timeout
from .executor_utils import function_with_timeout
from typing import List
from .executor_types import ExecuteResult, Executor
@ -78,11 +78,9 @@ def get_call_str(assert_statement: str) -> str:
def get_output(func: str, assert_statement: str, timeout: int = 5) -> str:
try:
exec(f"from typing import *\n{func}", globals())
func_call = get_call_str(assert_statement)
to_eval = f"from typing import *\n{func}\n{func_call}"
exec(func, globals())
output = function_with_timeout(eval, (func_call,globals()), timeout)
output = function_with_timeout(eval, (func_call, globals()), timeout)
return output
except TimeoutError:
return "TIMEOUT"

View File

@ -21,6 +21,7 @@ def generic_generate_func_impl(
num_comps,
temperature,
REFLEXION_CHAT_INSTRUCTION: str,
REFLEXION_FEW_SHOT: str,
SIMPLE_CHAT_INSTRUCTION: str,
REFLEXION_COMPLETION_INSTRUCTION: str,
SIMPLE_COMPLETION_INSTRUCTION: str,
@ -35,7 +36,7 @@ def generic_generate_func_impl(
if model == "gpt-4" or model == "gpt-3.5-turbo":
if strategy == "reflexion":
message = f"previous implementation:\n{prev_func_impl}\n\nunit tests:\n{feedback}\n\nhint:\n{self_reflection}\n\n# improved implementation\n{func_sig}"
message = f"{REFLEXION_FEW_SHOT}\n[previous impl]:\n{prev_func_impl}\n\n[unit test results from previous impl]:\n{feedback}\n\n[reflection on previous impl]:\n{self_reflection}\n\n[improved impl]:\n{func_sig}"
# func_bodies is a really bad name, as it can also be just 1 string
print('----------------------- SYSTEM MESSAGE -----------------------')
print(REFLEXION_CHAT_INSTRUCTION)
@ -88,38 +89,31 @@ def generic_generate_internal_tests(
TEST_GENERATION_COMPLETION_INSTRUCTION: str,
parse_tests: Callable[[str], List[str]],
is_syntax_valid: Callable[[str], bool],
is_react: bool = False
) -> List[str]:
"""
Generates tests for a function using a refinement technique with the number
of specified commmittee members.
"""
if model == "gpt-4" or model == "gpt-3.5-turbo":
message = f'{TEST_GENERATION_FEW_SHOT}\n\nfunc signature:\n{func_sig}\nunit tests:'
print('----------------------- SYSTEM MESSAGE -----------------------')
print(TEST_GENERATION_CHAT_INSTRUCTION)
print('----------------------------------------------')
print(' ----------------------- USER MESSAGE -----------------------')
print(message, flush=True)
print('----------------------------------------------')
output = gpt_chat(
model, TEST_GENERATION_CHAT_INSTRUCTION, message, max_tokens=1024)
if is_react:
message = f'{TEST_GENERATION_FEW_SHOT}\n\n[func signature]:\n{func_sig}\n\n[think]:'
output = gpt_chat(
model, TEST_GENERATION_CHAT_INSTRUCTION, message, max_tokens=1024)
print(f'React test generation output: {output}')
else:
message = f'{TEST_GENERATION_FEW_SHOT}\n\nfunc signature:\n{func_sig}\nunit tests:'
output = gpt_chat(
model, TEST_GENERATION_CHAT_INSTRUCTION, message, max_tokens=1024)
else:
prompt = f'{TEST_GENERATION_COMPLETION_INSTRUCTION}\n\nfunc signature:\n{func_sig}\nunit tests:'
output = gpt_completion(model, prompt, max_tokens=1024)
all_tests = parse_tests(output) # type: ignore
valid_tests = [test for test in all_tests if is_syntax_valid(test)]
# TODO: NOT SUPPORTED YET
# someone implement this
# cur_refinement_num = 0
# while cur_refinement_num < committee_size:
# # TODO: implement
# cur_tests = ... # type: ignore
# cur_refinement_num += 1
print('--------------- GENERATED TESTS: ---------------')
print(valid_tests)
print('------------------------------------------------')
# n = 3
# first_n = min(len(valid_tests), n)
# valid_tests = valid_tests[:first_n]
return sample_n_random(valid_tests, max_num_tests)
@ -130,22 +124,23 @@ def generic_generate_self_reflection(
model: str,
SELF_REFLECTION_CHAT_INSTRUCTION: str,
SELF_REFLECTION_COMPLETION_INSTRUCTION: str,
SELF_REFLECTION_FEW_SHOT: Optional[str] = None
) -> str:
if model == "gpt-4" or model == "gpt-3.5-turbo":
print('----------------------- SYSTEM MESSAGE -----------------------')
print(SELF_REFLECTION_CHAT_INSTRUCTION)
print('----------------------------------------------')
print(' ----------------------- USER MESSAGE -----------------------')
print(f'{func}\n\n{feedback}\n\nExplanation:', flush=True)
print('----------------------------------------------')
reflection = gpt_chat(
model, SELF_REFLECTION_CHAT_INSTRUCTION, f'{func}\n\n{feedback}\n\nExplanation:')
if SELF_REFLECTION_FEW_SHOT is not None:
reflection = gpt_chat(
model,
SELF_REFLECTION_CHAT_INSTRUCTION,
f'{SELF_REFLECTION_FEW_SHOT}\n\n[function impl]:\n{func}\n\n[unit test results]:\n{feedback}\n\n[self-reflection]:')
print(f'Self reflection output: {reflection}')
else:
reflection = gpt_chat(
model,
SELF_REFLECTION_CHAT_INSTRUCTION,
f'Function implementation:\n{func}\n\nUnit test results:\n{feedback}\n\nSelf-reflection:')
else:
reflection = gpt_completion(
model, f'{SELF_REFLECTION_COMPLETION_INSTRUCTION}\n{func}\n\n{feedback}\n\nExplanation:')
print('--------------- GENERATED SELF REFLECTION: ---------------')
print(reflection)
print('----------------------------------------------------------')
return reflection # type: ignore

View File

@ -9,12 +9,204 @@ PY_SIMPLE_COMPLETION_INSTRUCTION = "# Write the body of this function only."
PY_REFLEXION_COMPLETION_INSTRUCTION = "You are a Python writing assistant. You will be given your past function implementation, a series of unit tests, and a hint to change the implementation appropriately. Apply the changes below by writing the body of this function only.\n\n-----"
PY_SELF_REFLECTION_COMPLETION_INSTRUCTION = "You are a Python writing assistant. You will be given a function implementation and a series of unit tests. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as a hint when you try again later. Only provide the few sentence description in your answer, not the implementation.\n\n-----"
PY_SIMPLE_CHAT_INSTRUCTION = "You are a Python writing assistant, an AI that only responds with python code, NOT ENGLISH. You will be given a function signature and its docstring by the user. Respond only in code with correct implementation of the function. Do not include provided the docstring in your response."
PY_REFLEXION_CHAT_INSTRUCTION = "You are a Python writing assistant. You will be given your past function implementation, a series of unit tests, and a hint to change the implementation appropriately. Apply the changes below by writing the body of this function only. You should fill in the following text of the missing function body. For example, the first line of the completion should have 4 spaces for the indendation so that it fits syntactically with the preceding signature."
PY_SELF_REFLECTION_CHAT_INSTRUCTION = "You are a Python writing assistant. You will be given a function implementation and a series of unit tests. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as a hint when you try again later. Only provide the few sentence description in your answer, not the implementation."
PY_SIMPLE_CHAT_INSTRUCTION = "You are PythonGPT, an AI that only responds with python code, NOT ENGLISH. You will be given a function signature and its docstring by the user. Respond only in code with correct implementation of the function. Do not include provided the docstring in your response." # The first line of your response should have 4 spaces of indentation so that it fits syntactically with the user provided signature.
PY_SIMPLE_CHAT_INSTRUCTION_V2 = "You are PythonGPT, an AI that only responds with only python code. You will be given a function signature and its docstring by the user. Respond only in code with a correct, efficient implementation of the function. Do not include provided the docstring in your response." # The first line of your response should have 4 spaces of indentation so that it fits syntactically with the user provided signature.
PY_REFLEXION_CHAT_INSTRUCTION = "You are PythonGPT. You will be given your past function implementation, a series of unit tests, and a hint to change the implementation appropriately. Apply the changes below by writing the body of this function only. You should fill in the following text of the missing function body. For example, the first line of the completion should have 4 spaces for the indendation so that it fits syntactically with the preceding signature."
PY_REFLEXION_CHAT_INSTRUCTION_V2 = "You are PythonGPT. You will be given your previous implementation of a function, a series of unit tests results, and your self-reflection on your previous implementation. Apply the necessary changes below by responding only with the improved body of the function. Do not include the signature in your response. The first line of your response should have 4 spaces of indentation so that it fits syntactically with the user provided signature. You will be given a few examples by the user."
PY_REFLEXION_FEW_SHOT_ADD = '''Example 1:
[previous impl]:
def add(a: int, b: int) -> int:
"""
Given integers a and b, return the total value of a and b.
"""
return a - b
PY_TEST_GENERATION_FEW_SHOT = """For example:
[unit test results from previous impl]:
Tested passed:
Tests failed:
assert add(1, 2) == 3 # output: -1
assert add(1, 2) == 4 # output: -1
[reflection on previous impl]:
The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from `-` to `+` in the return statement. This will ensure that the function returns the correct output for the given input.
[improved impl]:
def add(a: int, b: int) -> int:
"""
Given integers a and b, return the total value of a and b.
"""
return a + b
'''
PY_REFLEXION_FEW_SHOT = '''Example 1:
[previous impl]:
from typing import *
def fullJustify(words: List[str], maxWidth: int) -> List[str]:
"""
Given an array of words and a width maxWidth, format the text such that each line has exactly maxWidth characters and is fully (left and right) justified.
You should pack your words in a greedy approach; that is, pack as many words as you can in each line. Pad extra spaces `' '` when necessary so that each line has exactly maxWidth characters.
Extra spaces between words should be distributed as evenly as possible. If the number of spaces on a line do not divide evenly between words, the empty slots on the left will be assigned more spaces than the slots on the right.
For the last line of text, it should be left justified and no extra space is inserted between words.
Note:
A word is defined as a character sequence consisting of non-space characters only.
Each word's length is guaranteed to be greater than 0 and not exceed maxWidth.
The input array `words` contains at least one word.
"""
res = []
cur_line = []
cur_len = 0
for word in words:
if cur_len + len(word) + len(cur_line) > maxWidth:
if len(cur_line) == 1:
res.append(cur_line[0] + ' ' * (maxWidth - cur_len))
else:
spaces = maxWidth - cur_len
space_between = spaces // (len(cur_line) - 1)
extra_spaces = spaces % (len(cur_line) - 1)
line = ''
for i, w in enumerate(cur_line[:-1]):
line += w + ' ' * (space_between + (i < extra_spaces))
line += cur_line[-1]
res.append(line)
cur_line = []
cur_len = 0
cur_line.append(word)
cur_len += len(word)
last_line = ' '.join(cur_line)
last_line += ' ' * (maxWidth - len(last_line))
res.append(last_line)
return res
[unit test results from previous impl]:
Tested passed:
Tests failed:
assert fullJustify([], 10) == [] # output: [' ']
assert fullJustify([], 0) == [] # output: ['']
[reflection on previous impl]:
The implementation failed the test cases where the input list of words is empty. The issue arises because the code does not handle the case where there are no words to process. As a result, it still appends a line with spaces to the result list, even when there are no words. To fix this issue, we should add a condition at the beginning of the function to check if the input list is empty, and return an empty list if it is. This will ensure that the function returns the correct output for empty input lists.
[improved impl]:
from typing import *
def fullJustify(words: List[str], maxWidth: int) -> List[str]:
"""
Given an array of words and a width maxWidth, format the text such that each line has exactly maxWidth characters and is fully (left and right) justified.
You should pack your words in a greedy approach; that is, pack as many words as you can in each line. Pad extra spaces `' '` when necessary so that each line has exactly maxWidth characters.
Extra spaces between words should be distributed as evenly as possible. If the number of spaces on a line do not divide evenly between words, the empty slots on the left will be assigned more spaces than the slots on the right.
For the last line of text, it should be left justified and no extra space is inserted between words.
Note:
A word is defined as a character sequence consisting of non-space characters only.
Each word's length is guaranteed to be greater than 0 and not exceed maxWidth.
The input array `words` contains at least one word.
"""
if not words:
return []
res = []
cur_line = []
cur_len = 0
for word in words:
if cur_len + len(word) + len(cur_line) > maxWidth:
if len(cur_line) == 1:
res.append(cur_line[0] + ' ' * (maxWidth - cur_len))
else:
spaces = maxWidth - cur_len
space_between = spaces // (len(cur_line) - 1)
extra_spaces = spaces % (len(cur_line) - 1)
line = ''
for i, w in enumerate(cur_line[:-1]):
line += w + ' ' * (space_between + (i < extra_spaces))
line += cur_line[-1]
res.append(line)
cur_line = []
cur_len = 0
cur_line.append(word)
cur_len += len(word)
last_line = ' '.join(cur_line)
last_line += ' ' * (maxWidth - len(last_line))
res.append(last_line)
return res
END EXAMPLES
'''
PY_SELF_REFLECTION_CHAT_INSTRUCTION = "You are PythonGPT. You will be given a function implementation and a series of unit tests. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as a hint when you try again later. Only provide the few sentence description in your answer, not the implementation."
PY_SELF_REFLECTION_CHAT_INSTRUCTION_V2 = "You are PythonGPT. You will be given a function implementation and a series of unit test results. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as guidance when you try again later. Only provide the few sentence description in your answer, not the implementation. You will be given a few examples by the user."
PY_SELF_REFLECTION_FEW_SHOT = """Example 1:
[function impl]:
def longest_subarray_with_sum_limit(nums: List[int], target: int) -> List[int]:
n = len(nums)
left, right = 0, 0
max_length = 0
current_sum = 0
result = []
while right < n:
current_sum += nums[right]
while current_sum > target:
current_sum -= nums[left]
left += 1
if right - left + 1 >= max_length:
max_length = right - left + 1
result = nums[left:right+1]
right += 1
return result
[unit test results]:
Tests passing:
assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 8) == [1, 2, 3]
assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 15) == [1, 2, 3, 4, 5]
assert longest_subarray_with_sum_limit([1, -1, 2, -2, 3, -3], 2) == [1, -1, 2, -2, 3]
assert longest_subarray_with_sum_limit([], 10) == []
assert longest_subarray_with_sum_limit([], 0) == []
assert longest_subarray_with_sum_limit([], -5) == []
Tests failing:
assert longest_subarray_with_sum_limit([5, 6, 7, 8, 9], 4) == [] # output: [5]
[self-reflection]:
The implementation failed the where no subarray fulfills the condition. The issue in the implementation is due to the use of >= instead of > in the condition to update the result. Because of this, it returns a subarray even when the sum is greater than the target, as it still updates the result when the current subarray length is equal to the previous longest subarray length. To overcome this error, we should change the condition to only update the result when the current subarray length is strictly greater than the previous longest subarray length. This can be done by replacing >= with > in the condition.
Example 2:
[function impl]:
def longest_subarray_with_sum_limit(nums: List[int], target: int) -> List[int]:
n = len(nums)
left, right = 0, 0
max_length = 0
current_sum = 0
result = []
while current_sum + nums[right] <= target:
current_sum += nums[right]
right += 1
while right < n:
current_sum += nums[right]
while current_sum > target:
current_sum -= nums[left]
left += 1
if right - left + 1 > max_length:
max_length = right - left + 1
result = nums[left:right+1]
right += 1
return result
[unit test results]:
Tests passing:
assert longest_subarray_with_sum_limit([], 10) == []
assert longest_subarray_with_sum_limit([], 0) == []
assert longest_subarray_with_sum_limit([], -5) == []
Tests failing:
assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 8) == [1, 2, 3] # output: list index out of range
assert longest_subarray_with_sum_limit([1, 2, 3, 4, 5], 15) == [1, 2, 3, 4, 5] # output: list index out of range
assert longest_subarray_with_sum_limit([5, 6, 7, 8, 9], 4) == [] # output: list index out of range
assert longest_subarray_with_sum_limit([1, -1, 2, -2, 3, -3], 2) == [1, -1, 2, -2, 3] # output: list index out of range
[self-reflection]:
The implementation failed 4 out of the 7 test cases due to an IndexError. The issue stems from the while loop while current_sum + nums[right] <= target:, which directly accesses nums[right] without checking if right is within the bounds of the list. This results in a runtime error when right goes beyond the list length. To overcome this error, we need to add a bounds check for the right variable in the mentioned while loop. We can modify the loop condition to while right < len(nums) and current_sum + nums[right] <= target:. This change will ensure that we only access elements within the bounds of the list, thus avoiding the IndexError.
END OF EXAMPLES
"""
PY_TEST_GENERATION_FEW_SHOT = """Examples:
func signature:
def has_close_elements(numbers: List[float], threshold: float) -> bool:
\"\"\" Check if in given list of numbers, are any two numbers closer to each other than
@ -24,7 +216,6 @@ def has_close_elements(numbers: List[float], threshold: float) -> bool:
>>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
True
\"\"\"
unit tests:
assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
assert has_close_elements([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
@ -34,11 +225,11 @@ assert has_close_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
assert has_close_elements([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False"""
PY_TEST_GENERATION_COMPLETION_INSTRUCTION = f"""You are a Python writing assistant, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring.
PY_TEST_GENERATION_COMPLETION_INSTRUCTION = f"""You are PythonGPT, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring.
{PY_TEST_GENERATION_FEW_SHOT}"""
PY_TEST_GENERATION_CHAT_INSTRUCTION = """You are a Python writing assistant, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring."""
PY_TEST_GENERATION_CHAT_INSTRUCTION = """You are CodexGPT, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring."""
class PyGenerator(Generator):
def self_reflection(self, func: str, feedback: str, model: str) -> str:
@ -48,6 +239,7 @@ class PyGenerator(Generator):
model=model,
SELF_REFLECTION_CHAT_INSTRUCTION=PY_SELF_REFLECTION_CHAT_INSTRUCTION,
SELF_REFLECTION_COMPLETION_INSTRUCTION=PY_SELF_REFLECTION_COMPLETION_INSTRUCTION,
SELF_REFLECTION_FEW_SHOT=PY_SELF_REFLECTION_FEW_SHOT
)
return x
@ -72,6 +264,7 @@ class PyGenerator(Generator):
num_comps=num_comps,
temperature=temperature,
REFLEXION_CHAT_INSTRUCTION=PY_REFLEXION_CHAT_INSTRUCTION,
REFLEXION_FEW_SHOT = PY_REFLEXION_FEW_SHOT_ADD,
SIMPLE_CHAT_INSTRUCTION=PY_SIMPLE_CHAT_INSTRUCTION,
REFLEXION_COMPLETION_INSTRUCTION=PY_REFLEXION_COMPLETION_INSTRUCTION,
SIMPLE_COMPLETION_INSTRUCTION=PY_SIMPLE_COMPLETION_INSTRUCTION,

View File

@ -4,13 +4,48 @@ from .generator_utils import gpt_chat, gpt_completion, generic_generate_func_imp
from typing import List, Optional, Union
RS_SIMPLE_COMPLETION_INSTRUCTION = "// Write the body of this function only."
RS_REFLEXION_COMPLETION_INSTRUCTION = "You are RustGPT. You will be given your past function implementation, a series of unit tests, and a hint to change the implementation appropriately. Apply the changes below by writing the body of this function only.\n\n-----"
RS_SELF_REFLECTION_COMPLETION_INSTRUCTION = "You are RustGPT. You will be given a function implementation and a series of unit tests. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as a hint when you try again later. Only provide the few sentence description in your answer, not the implementation.\n\n-----"
RS_SIMPLE_CHAT_INSTRUCTION = "You are RustGPT. You will be given a function signature and docstring. You should fill in the following text of the missing function body. For example, the first line of the completion should have 4 spaces for the indendation so that it fits syntactically with the preceding signature."
RS_REFLEXION_CHAT_INSTRUCTION = "You are RustGPT. You will be given your past function implementation, a series of unit tests, and a hint to change the implementation appropriately. Apply the changes below by writing the body of this function only. You should fill in the following text of the missing function body. For example, the first line of the completion should have 4 spaces for the indendation so that it fits syntactically with the preceding signature."
RS_SELF_REFLECTION_CHAT_INSTRUCTION = "You are RustGPT. You will be given a function implementation and a series of unit tests. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as a hint when you try again later. Only provide the few sentence description in your answer, not the implementation."
RS_REFLEXION_COMPLETION_INSTRUCTION = "You are a Rust programming assistant. You will be given your past function implementation, a series of unit tests, and a hint to change the implementation appropriately. Apply the changes below by writing the body of this function only.\n\n-----"
RS_SELF_REFLECTION_COMPLETION_INSTRUCTION = "You are a Rust programming assistant. You will be given a function implementation and a series of unit tests. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as a hint when you try again later. Only provide the few sentence description in your answer, not the implementation.\n\n-----"
RS_SIMPLE_CHAT_INSTRUCTION = "You are a Rust programming assistant. You will be given a function signature and docstring. You should fill in the following text of the missing function body. For example, the first line of the completion should have 4 spaces for the indendation so that it fits syntactically with the preceding signature."
RS_REFLEXION_CHAT_INSTRUCTION = "You are a Rust programming assistant. You will be given your past function implementation, a series of unit tests, and a hint to change the implementation appropriately. Apply the changes below by writing the body of this function only. You should fill in the following text of the missing function body. For example, the first line of the completion should have 4 spaces for the indendation so that it fits syntactically with the preceding signature."
RS_SELF_REFLECTION_CHAT_INSTRUCTION = "You are a Rust programming assistant. You will be given a function implementation and a series of unit tests. Your goal is to write a few sentences to explain why your implementation is wrong as indicated by the tests. You will need this as a hint when you try again later. Only provide the few sentence description in your answer, not the implementation."
# Few-shot example for the reflexion prompt. It shows one full round:
# a buggy `add` (uses `-`), the failing assertions with their outputs,
# a written reflection, and the corrected implementation (uses `+`).
# The text is sent to the model verbatim, so edit it with care.
RS_REFLEXION_FEW_SHOT_ADD = '''Example 1:
[previous impl]:
fn add(a: i32, b: i32) -> i32 {
// Given integers a and b, return the total value of a and b.
a - b
}
[unit test results from previous impl]:
Tested passed:
Tests failed:
assert_eq!(add(1, 2), 3); // output: -1
assert_eq!(add(1, 2), 4); // output: -1
[reflection on previous impl]:
The implementation failed the test cases where the input integers are 1 and 2. The issue arises because the code does not add the two integers together, but instead subtracts the second integer from the first. To fix this issue, we should change the operator from `-` to `+` in the return statement. This will ensure that the function returns the correct output for the given input.
[improved impl]:
fn add(a: i32, b: i32) -> i32 {
// Given integers a and b, return the total value of a and b.
a + b
}
END EXAMPLES
'''
RS_TEST_GENERATION_FEW_SHOT = """For example:
func signature:
@ -35,7 +70,50 @@ assert_eq!(candidate(100), 50);
assert_eq!(candidate(49), 7);
"""
RS_TEST_GENERATION_COMPLETION_INSTRUCTION = f"""You are a Rust programming assistant, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring.
# Few-shot example for the self-reflection prompt. It contains a deliberately
# flawed `group_anagrams` (the map is keyed by `str.len()` instead of the
# character-count array), the passing/failing assertions, and the reflection
# the model is expected to imitate.
# The text is sent to the model verbatim, so edit it with care.
RS_SELF_REFLECTION_FEW_SHOT = '''Example 1:
[function impl]:
pub fn group_anagrams(strs: Vec<String>) -> Vec<Vec<String>> {
// Given an array of strings strs, group the anagrams together. You can return the answer in any order.
// An Anagram is a word or phrase formed by rearranging the letters of a different word or phrase, typically using all the original letters exactly once.
use std::collections::HashMap;
let mut map: HashMap<[u8;26], Vec<String>> = HashMap::with_capacity(strs.len());
let offset = 'a' as usize;
for str in strs.into_iter() {
let mut chars: [u8; 26] = [0; 26];
for char in str.chars() {
chars[char.to_ascii_lowercase() as usize - offset] += 1;
}
// Flaw: using str.len() instead of chars in the hashmap key
map.entry(str.len())
.and_modify(|v| v.push(str.clone()))
.or_insert(vec![str]);
}
let mut arr: Vec<Vec<String>> = Vec::new();
for v in map.into_values() {
arr.push(v);
}
arr
}
[unit test results]:
Tested passed:
assert_eq!(func(vec![""]), vec![vec![""]]);
assert_eq!(func(vec!["a"]), vec![vec!["a"]]);
Tests failed:
assert_eq!(func(vec!["eat", "tea", "tan", "ate", "nat", "bat"]), vec![vec!["bat"], vec!["nat", "tan"], vec!["ate", "eat", "tea"]]); # output: [["bat", "tan", "nat"], ["eat", "tea", "ate"]]
[self-reflection]:
The implementation failed to group the anagrams together correctly. Instead, it grouped words by their length, which is not the intended behavior. The issue lies in using the length of the input strings (str.len()) as the key for the hashmap, rather than the count of each character in the strings (chars). To overcome this error, I should change the hashmap key to the character count array (chars). This will ensure that words with the same character counts (anagrams) are grouped together, which is the desired output. Next time I approach the problem, I will make sure to use the correct hashmap key to group the anagrams.
END EXAMPLES
'''
# Completion prompt for unit-test generation. It is an f-string, so the value of
# RS_TEST_GENERATION_FEW_SHOT is interpolated once, when this assignment runs;
# that constant must therefore already be defined at this point.
RS_TEST_GENERATION_COMPLETION_INSTRUCTION = f"""You are RustGPT, an AI coding assistant that can write unique, diverse, and intuitive unit tests for functions given the signature and docstring.
{RS_TEST_GENERATION_FEW_SHOT}"""
@ -66,6 +144,7 @@ class RsGenerator(Generator):
model=model,
SELF_REFLECTION_CHAT_INSTRUCTION=RS_SELF_REFLECTION_CHAT_INSTRUCTION,
SELF_REFLECTION_COMPLETION_INSTRUCTION=RS_SELF_REFLECTION_COMPLETION_INSTRUCTION,
SELF_REFLECTION_FEW_SHOT=RS_SELF_REFLECTION_FEW_SHOT,
)
def func_impl(
@ -92,6 +171,7 @@ class RsGenerator(Generator):
SIMPLE_CHAT_INSTRUCTION=RS_SIMPLE_CHAT_INSTRUCTION,
REFLEXION_COMPLETION_INSTRUCTION=RS_REFLEXION_COMPLETION_INSTRUCTION,
SIMPLE_COMPLETION_INSTRUCTION=RS_SIMPLE_COMPLETION_INSTRUCTION,
REFLEXION_FEW_SHOT=RS_REFLEXION_FEW_SHOT_ADD,
fix_body=(lambda x: x)
)

View File

@ -26,14 +26,21 @@ def run_reflexion(
cur_pass = 0
is_solved = False
reflections = []
implementations = []
test_feedback = []
cur_func_impl = ""
while cur_pass < pass_at_k and not is_solved:
tests_i = gen.internal_tests(item["prompt"], model, 1)
if is_leetcode:
tests_i = item['visible_tests']
else:
tests_i = gen.internal_tests(item["prompt"], model, 1)
# first attempt
cur_func_impl = gen.func_impl(item["prompt"], model, "simple")
implementations.append(cur_func_impl)
assert isinstance(cur_func_impl, str)
is_passing, feedback, _ = exe.execute(cur_func_impl, tests_i)
test_feedback.append(feedback)
# if solved, exit early
if is_passing:
@ -59,13 +66,15 @@ def run_reflexion(
strategy="reflexion",
prev_func_impl=cur_func_impl,
feedback=cur_feedback,
self_reflection=reflection
self_reflection=reflection,
)
implementations.append(cur_func_impl)
assert isinstance(cur_func_impl, str)
# check if all internal unit tests pass
is_passing, cur_feedback, _ = exe.execute(
cur_func_impl, tests_i)
test_feedback.append(cur_feedback)
# if solved, check if it passes the real tests, exit early
if is_passing or cur_iter == max_iters - 1:
@ -82,6 +91,8 @@ def run_reflexion(
item["is_solved"] = is_solved
item["reflections"] = reflections
item["implementations"] = implementations
item["test_feedback"] = test_feedback
item["solution"] = cur_func_impl
write_jsonl(log_path, [item], append=True)

View File

@ -0,0 +1,4 @@
{"name": "split-message-based-on-limit", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 14972000, "question_id": "2563", "elapsed_time": 148, "compare_result": "0111111111111011000011000000000000000000000000000011100000000000000000000000000111111101111000", "code_output": "[\"this<1/10>\",\" is <2/10>\",\"real<3/10>\",\"ly a<4/10>\",\" ver<5/10>\",\"y aw<6/10>\",\"esom<7/10>\",\"e me<8/10>\",\"ssag<9/10>\",\"e<10/10>\"]", "std_output": "", "last_testcase": "\"this is really a very awesome message\"\n9", "expected_output": "[\"thi<1/14>\",\"s i<2/14>\",\"s r<3/14>\",\"eal<4/14>\",\"ly <5/14>\",\"a v<6/14>\",\"ery<7/14>\",\" aw<8/14>\",\"eso<9/14>\",\"me<10/14>\",\" m<11/14>\",\"es<12/14>\",\"sa<13/14>\",\"ge<14/14>\"]", "task_finish_time": 1681661576324, "total_correct": 30, "total_testcases": 94, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "934796442", "input_formatted": "\"this is really a very awesome message\", 9", "input": "\"this is really a very awesome message\"\n9", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "maximum-number-of-non-overlapping-palindrome-substrings", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "minimum-total-distance-traveled", "status": "Accepted", "reward": true, "info": {"status_code": 10, "lang": "python3", "run_success": true, "status_runtime": "4528 ms", "memory": 94288000, "question_id": "2554", "elapsed_time": 4649, "compare_result": "1111111111111111111111111111111111111111", "code_output": "", "std_output": "", "last_testcase": "", "expected_output": "", "task_finish_time": 1681662197849, "total_correct": 40, "total_testcases": 40, "runtime_percentile": 5.600000000000221, "status_memory": "94.3 MB", "memory_percentile": 49.60000000000006, "pretty_lang": "Python3", "submission_id": "934801257", "status_msg": "Accepted", "state": "SUCCESS"}}
{"name": "next-greater-element-iv", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 27900000, "question_id": "2549", "elapsed_time": 1120, "compare_result": "010000001000000000000000000000000000000000010100000", "code_output": "[4,9,9,-1,-1]", "std_output": "", "last_testcase": "[2,4,0,9,6]", "expected_output": "[9,6,6,-1,-1]", "task_finish_time": 1681662380199, "total_correct": 4, "total_testcases": 51, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "934802716", "input_formatted": "[2,4,0,9,6]", "input": "[2,4,0,9,6]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}

View File

@ -0,0 +1 @@
{"name": "minimum-reverse-operations", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,40 @@
{"name": "minimum-reverse-operations", "status": "Runtime Error", "reward": false, "info": {"status_code": 15, "lang": "python3", "run_success": false, "runtime_error": "Line 24: RecursionError: maximum recursion depth exceeded in comparison", "full_runtime_error": "RecursionError: maximum recursion depth exceeded in comparison\n if ((0 <= new_p < n) and is_valid_move(min(p, new_p), (max(p, new_p) + k), banned, k)):\nLine 24 in min_operations (Solution.py)\n [Previous line repeated 996 more times]\n ops = (1 + min_operations(target, new_p, k, banned, memo))\nLine 25 in min_operations (Solution.py)\n ops = (1 + min_operations(target, new_p, k, banned, memo))\nLine 25 in min_operations (Solution.py)\n ops = (1 + min_operations(target, new_p, k, banned, memo))\nLine 25 in min_operations (Solution.py)", "status_runtime": "N/A", "memory": 684160000, "question_id": "2726", "elapsed_time": 1853, "compare_result": "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", "code_output": "", "std_output": "", "last_testcase": "4\n0\n[1,2]\n4", "expected_output": "[0,-1,-1,1]", "task_finish_time": 1681700614199, "total_correct": 0, "total_testcases": 711, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935016344", "status_msg": "Runtime Error", "state": 
"SUCCESS"}}
{"name": "collect-coins-in-a-tree", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "minimum-time-to-visit-a-cell-in-a-grid", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 29172000, "question_id": "2711", "elapsed_time": 1894, "compare_result": "10110011111000001011000010011000010000100", "code_output": "6", "std_output": "", "last_testcase": "[[0,2,4],[3,2,1],[1,0,4]]", "expected_output": "-1", "task_finish_time": 1681701070482, "total_correct": 16, "total_testcases": 41, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935018738", "input_formatted": "[[0,2,4],[3,2,1],[1,0,4]]", "input": "[[0,2,4],[3,2,1],[1,0,4]]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "find-the-string-with-lcp", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 17356000, "question_id": "2708", "elapsed_time": 334, "compare_result": "0001000000000000000000000000000000000000000000000000000", "code_output": "\"aehj\"", "std_output": "", "last_testcase": "[[4,0,2,0],[0,3,0,1],[2,0,2,0],[0,1,0,1]]", "expected_output": "\"abab\"", "task_finish_time": 1681701270446, "total_correct": 1, "total_testcases": 55, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935019825", "input_formatted": "[[4,0,2,0],[0,3,0,1],[2,0,2,0],[0,1,0,1]]", "input": "[[4,0,2,0],[0,3,0,1],[2,0,2,0],[0,1,0,1]]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "handling-sum-queries-after-update", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "subsequence-with-the-minimum-score", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 783224000, "question_id": "2701", "elapsed_time": 1487, "compare_result": "011010000001000100000000000000000000000000000000000000", "code_output": "2", "std_output": "", "last_testcase": "\"abacaba\"\n\"bzaa\"", "expected_output": "1", "task_finish_time": 1681701484618, "total_correct": 5, "total_testcases": 54, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935020878", "input_formatted": "\"abacaba\", \"bzaa\"", "input": "\"abacaba\"\n\"bzaa\"", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "minimum-number-of-visited-cells-in-a-grid", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "rearranging-fruits", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 48060000, "question_id": "2689", "elapsed_time": 820, "compare_result": "1001110000000110111010100000000000000000", "code_output": "0", "std_output": "", "last_testcase": "[2,3,4,1]\n[3,2,5,1]", "expected_output": "-1", "task_finish_time": 1681701729708, "total_correct": 11, "total_testcases": 40, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935022189", "input_formatted": "[2,3,4,1], [3,2,5,1]", "input": "[2,3,4,1]\n[3,2,5,1]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "count-increasing-quadruplets", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "put-marbles-in-bags", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 13908000, "question_id": "2681", "elapsed_time": 44, "compare_result": "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", "code_output": "6", "std_output": "", "last_testcase": "[1,3,5,1]\n2", "expected_output": "4", "task_finish_time": 1681702013575, "total_correct": 0, "total_testcases": 103, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935023798", "input_formatted": "[1,3,5,1], 2", "input": "[1,3,5,1]\n2", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "shortest-cycle-in-a-graph", "status": "Accepted", "reward": true, "info": {"status_code": 10, "lang": "python3", "run_success": true, "status_runtime": "2683 ms", "memory": 14616000, "question_id": "2671", "elapsed_time": 2701, "compare_result": "1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111", "code_output": "", "std_output": "", "last_testcase": "", "expected_output": "", "task_finish_time": 1681702142951, "total_correct": 88, "total_testcases": 88, "runtime_percentile": 41.8418000000004, "status_memory": "14.6 MB", "memory_percentile": 29.577399999999997, "pretty_lang": "Python3", "submission_id": "935024460", "status_msg": "Accepted", "state": "SUCCESS"}}
{"name": "minimum-time-to-complete-all-tasks", "status": "Runtime Error", "reward": false, "info": {"status_code": 15, "lang": "python3", "run_success": false, "runtime_error": "Line 44: TypeError: inf is not valid value for the expected return type integer", "full_runtime_error": "TypeError: inf is not valid value for the expected return type integer\n raise TypeError(str(ret) + \" is not valid value for the expected return type integer\");\nLine 44 in _driver (Solution.py)\n _driver()\nLine 51 in <module> (Solution.py)\nDuring handling of the above exception, another exception occurred:\nException: Error when serializing long: inf out of range [-(2^53-1), 2^53-1]\nLine 13 in _serialize_int (./python3/__serializer__.py)\nLine 61 in _serialize (./python3/__serializer__.py)\n out = ser._serialize(ret, 'integer')\nLine 42 in _driver (Solution.py)", "status_runtime": "N/A", "memory": 13836000, "question_id": "2657", "elapsed_time": 43, "compare_result": "10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000", "code_output": "", "std_output": "", "last_testcase": "[[1,3,2],[2,5,3],[5,6,2]]", "expected_output": "4", "task_finish_time": 1681702342561, "total_correct": 1, "total_testcases": 1059, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935025528", "status_msg": "Runtime Error", "state": "SUCCESS"}}
{"name": "count-number-of-possible-root-nodes", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "number-of-ways-to-earn-points", "status": "Accepted", "reward": true, "info": {"status_code": 10, "lang": "python3", "run_success": true, "status_runtime": "2292 ms", "memory": 13792000, "question_id": "2648", "elapsed_time": 2319, "compare_result": "111111111111111111111111111111111111111111111111111111111111111", "code_output": "", "std_output": "", "last_testcase": "", "expected_output": "", "task_finish_time": 1681702590911, "total_correct": 63, "total_testcases": 63, "runtime_percentile": 77.72500000000105, "status_memory": "13.8 MB", "memory_percentile": 98.96039999999999, "pretty_lang": "Python3", "submission_id": "935026961", "status_msg": "Accepted", "state": "SUCCESS"}}
{"name": "split-the-array-to-make-coprime-products", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "time-to-cross-a-bridge", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 21032000, "question_id": "2642", "elapsed_time": 462, "compare_result": "00000000000000000000000000000000000000000000000000000000000000000", "code_output": "1", "std_output": "", "last_testcase": "1\n3\n[[1,1,2,1],[1,1,3,1],[1,1,4,1]]", "expected_output": "6", "task_finish_time": 1681702887682, "total_correct": 0, "total_testcases": 65, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935028639", "input_formatted": "1, 3, [[1,1,2,1],[1,1,3,1],[1,1,4,1]]", "input": "1\n3\n[[1,1,2,1],[1,1,3,1],[1,1,4,1]]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "check-if-point-is-reachable", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "minimum-cost-to-split-an-array", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "difference-between-maximum-and-minimum-price-sum", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "maximize-the-minimum-powered-city", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "count-anagrams", "status": "Accepted", "reward": true, "info": {"status_code": 10, "lang": "python3", "run_success": true, "status_runtime": "2002 ms", "memory": 15572000, "question_id": "2605", "elapsed_time": 2022, "compare_result": "11111111111111111111111111111111111111111", "code_output": "", "std_output": "", "last_testcase": "", "expected_output": "", "task_finish_time": 1681703656422, "total_correct": 41, "total_testcases": 41, "runtime_percentile": 66.97360000000042, "status_memory": "15.6 MB", "memory_percentile": 75.6881, "pretty_lang": "Python3", "submission_id": "935033155", "status_msg": "Accepted", "state": "SUCCESS"}}
{"name": "number-of-great-partitions", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "cycle-length-queries-in-a-tree", "status": "Accepted", "reward": true, "info": {"status_code": 10, "lang": "python3", "run_success": true, "status_runtime": "1737 ms", "memory": 53652000, "question_id": "2597", "elapsed_time": 1778, "compare_result": "111111111111111111111111111", "code_output": "", "std_output": "", "last_testcase": "", "expected_output": "", "task_finish_time": 1681703846704, "total_correct": 27, "total_testcases": 27, "runtime_percentile": 44.82620000000011, "status_memory": "53.7 MB", "memory_percentile": 53.4482, "pretty_lang": "Python3", "submission_id": "935034324", "status_msg": "Accepted", "state": "SUCCESS"}}
{"name": "add-edges-to-make-degrees-of-all-nodes-even", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 56012000, "question_id": "2596", "elapsed_time": 1699, "compare_result": "1010110111111011101111000110001111111111110011000110", "code_output": "false", "std_output": "", "last_testcase": "4\n[[1,2],[3,4]]", "expected_output": "true", "task_finish_time": 1681704042779, "total_correct": 35, "total_testcases": 52, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935035563", "input_formatted": "4, [[1,2],[3,4]]", "input": "4\n[[1,2],[3,4]]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "minimum-total-cost-to-make-arrays-unequal", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 48932000, "question_id": "2592", "elapsed_time": 1275, "compare_result": "00000000000000000000000000000000000000000000000000000000000000000000000100000000000010000000000000000000", "code_output": "0", "std_output": "", "last_testcase": "[1,2,3,4,5]\n[1,2,3,4,5]", "expected_output": "10", "task_finish_time": 1681704260876, "total_correct": 2, "total_testcases": 104, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935036850", "input_formatted": "[1,2,3,4,5], [1,2,3,4,5]", "input": "[1,2,3,4,5]\n[1,2,3,4,5]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "maximum-number-of-points-from-grid-queries", "status": "Runtime Error", "reward": false, "info": {"status_code": 15, "lang": "python3", "run_success": false, "runtime_error": "Line 14: IndexError: list index out of range", "full_runtime_error": "IndexError: list index out of range\n points = (1 if (grid[x][y] < query) else 0)\nLine 14 in memoized_dfs (Solution.py)\n points += memoized_dfs((x + dx), (y + dy), query)\nLine 16 in memoized_dfs (Solution.py)\n points += memoized_dfs((x + dx), (y + dy), query)\nLine 16 in memoized_dfs (Solution.py)\n points += memoized_dfs((x + dx), (y + dy), query)\nLine 16 in memoized_dfs (Solution.py)\n return memoized_dfs(x, y, query)\nLine 18 in dfs (Solution.py)\n answer.append(dfs(0, 0, query))\nLine 21 in maxPoints (Solution.py)\n ret = Solution().maxPoints(param_1, param_2)\nLine 47 in _driver (Solution.py)\n _driver()\nLine 58 in <module> (Solution.py)", "status_runtime": "N/A", "memory": 13992000, "question_id": "2588", "elapsed_time": 45, "compare_result": "000000000000000000000", "code_output": "", "std_output": "", "last_testcase": "[[1,2,3],[2,5,7],[3,5,1]]\n[5,6,2]", "expected_output": "[5,8,1]", "task_finish_time": 1681704479286, "total_correct": 0, "total_testcases": 21, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935038154", "status_msg": "Runtime Error", "state": "SUCCESS"}}
{"name": "divide-nodes-into-the-maximum-number-of-groups", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 17844000, "question_id": "2583", "elapsed_time": 264, "compare_result": "0100000000001000000000000000000000000111011100110001000", "code_output": "2", "std_output": "", "last_testcase": "6\n[[1,2],[1,4],[1,5],[2,6],[2,3],[4,6]]", "expected_output": "4", "task_finish_time": 1681704691680, "total_correct": 11, "total_testcases": 55, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935039452", "input_formatted": "6, [[1,2],[1,4],[1,5],[2,6],[2,3],[4,6]]", "input": "6\n[[1,2],[1,4],[1,5],[2,6],[2,3],[4,6]]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "count-palindromic-subsequences", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "count-subarrays-with-median-k", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "number-of-beautiful-partitions", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "split-message-based-on-limit", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "maximum-number-of-non-overlapping-palindrome-substrings", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "minimum-total-distance-traveled", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 13932000, "question_id": "2554", "elapsed_time": 61, "compare_result": "1101111011111111111111110011011101111001", "code_output": "7", "std_output": "", "last_testcase": "[9,11,99,101]\n[[10,1],[7,1],[14,1],[100,1],[96,1],[103,1]]", "expected_output": "6", "task_finish_time": 1681705573958, "total_correct": 32, "total_testcases": 40, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935045044", "input_formatted": "[9,11,99,101], [[10,1],[7,1],[14,1],[100,1],[96,1],[103,1]]", "input": "[9,11,99,101]\n[[10,1],[7,1],[14,1],[100,1],[96,1],[103,1]]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "next-greater-element-iv", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 28288000, "question_id": "2549", "elapsed_time": 1071, "compare_result": "010000001101000000000000000000000000000000010110000", "code_output": "[-1,-1,-1,-1,-1]", "std_output": "", "last_testcase": "[2,4,0,9,6]", "expected_output": "[9,6,6,-1,-1]", "task_finish_time": 1681705714229, "total_correct": 7, "total_testcases": 51, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935045927", "input_formatted": "[2,4,0,9,6]", "input": "[2,4,0,9,6]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "minimum-number-of-operations-to-make-arrays-similar", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 42640000, "question_id": "2539", "elapsed_time": 862, "compare_result": "00100000000000000000000000000000", "code_output": "-1", "std_output": "", "last_testcase": "[8,12,6]\n[2,14,10]", "expected_output": "2", "task_finish_time": 1681705845449, "total_correct": 1, "total_testcases": 32, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935046779", "input_formatted": "[8,12,6], [2,14,10]", "input": "[8,12,6]\n[2,14,10]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "minimum-cost-to-make-array-equal", "status": "Accepted", "reward": true, "info": {"status_code": 10, "lang": "python3", "run_success": true, "status_runtime": "1324 ms", "memory": 29508000, "question_id": "2538", "elapsed_time": 1349, "compare_result": "111111111111111111111111111111111111111111111111", "code_output": "", "std_output": "", "last_testcase": "", "expected_output": "", "task_finish_time": 1681705877245, "total_correct": 48, "total_testcases": 48, "runtime_percentile": 5.401999999999845, "status_memory": "29.5 MB", "memory_percentile": 57.43240000000001, "pretty_lang": "Python3", "submission_id": "935046977", "status_msg": "Accepted", "state": "SUCCESS"}}
{"name": "create-components-with-same-value", "status": "Wrong Answer", "reward": false, "info": {"status_code": 11, "lang": "python3", "run_success": true, "status_runtime": "N/A", "memory": 45408000, "question_id": "2531", "elapsed_time": 1334, "compare_result": "0110000000000010001000000001001001010000110", "code_output": "-1", "std_output": "", "last_testcase": "[6,2,2,2,6]\n[[0,1],[1,2],[1,3],[3,4]]", "expected_output": "2", "task_finish_time": 1681706062974, "total_correct": 10, "total_testcases": 43, "runtime_percentile": null, "status_memory": "N/A", "memory_percentile": null, "pretty_lang": "Python3", "submission_id": "935048221", "input_formatted": "[6,2,2,2,6], [[0,1],[1,2],[1,3],[3,4]]", "input": "[6,2,2,2,6]\n[[0,1],[1,2],[1,3],[3,4]]", "status_msg": "Wrong Answer", "state": "SUCCESS"}}
{"name": "count-subarrays-with-fixed-bounds", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "longest-increasing-subsequence-ii", "status": "Submission Timed-Out", "reward": false, "info": {"state": "STARTED"}}
{"name": "paths-in-matrix-whose-sum-is-divisible-by-k", "status": "Accepted", "reward": true, "info": {"status_code": 10, "lang": "python3", "run_success": true, "status_runtime": "6132 ms", "memory": 94216000, "question_id": "2521", "elapsed_time": 6321, "compare_result": "1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111", "code_output": "", "std_output": "", "last_testcase": "", "expected_output": "", "task_finish_time": 1681706190987, "total_correct": 88, "total_testcases": 88, "runtime_percentile": 15.170599999999851, "status_memory": "94.2 MB", "memory_percentile": 80.32829999999994, "pretty_lang": "Python3", "submission_id": "935049009", "status_msg": "Accepted", "state": "SUCCESS"}}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,7 +1,7 @@
python main.py \
--run_name "reflexion_leetcode_python3_gpt4" \
--run_name "reflexion_leetcode_python3_gpt4_react_constraints_visible" \
--root_dir "root" \
--dataset_path ./executors/leetcode_env/leetcode_dataset/data/humaneval/leetcode-hard-py-40-uncontaminated.jsonl \
--dataset_path ./executors/leetcode_env/leetcode_dataset/data/humaneval/leetcode-hard-py-40-uncontaminated_tests.jsonl \
--strategy "reflexion" \
--language "py" \
--model "gpt-4" \

11
run_reflexion_rs_leet.sh Executable file
View File

@ -0,0 +1,11 @@
# Launch main.py with the "reflexion" strategy for Rust ("rs") using gpt-4 on the
# leetcode-hard-rs-40-uncontaminated_tests.jsonl dataset.
# Settings: pass_at_k=1, max_iters=5, with --is_leetcode and --verbose enabled.
python main.py \
--run_name "reflexion_leetcode_rs_gpt4_react_constraints_visible" \
--root_dir "root" \
--dataset_path ./executors/leetcode_env/leetcode_dataset/data/humaneval/leetcode-hard-rs-40-uncontaminated_tests.jsonl \
--strategy "reflexion" \
--language "rs" \
--model "gpt-4" \
--pass_at_k "1" \
--max_iters "5" \
--is_leetcode \
--verbose

View File

@ -1,7 +1,7 @@
python main.py \
--run_name "simple_leetcode_python3_gpt4" \
--run_name "simple_leetcode_python3_gpt4_visible" \
--root_dir "root" \
--dataset_path ./executors/leetcode_env/leetcode_dataset/data/humaneval/leetcode-hard-py-40-uncontaminated.jsonl \
--dataset_path ./executors/leetcode_env/leetcode_dataset/data/humaneval/leetcode-hard-py-40-uncontaminated_tests.jsonl \
--strategy "simple" \
--language "py" \
--model "gpt-4" \

11
run_simple_rs_leet.sh Executable file
View File

@ -0,0 +1,11 @@
# Launch main.py with the "simple" strategy and language "rs" on the
# leetcode-hard-rs-40-uncontaminated_tests.jsonl dataset, using model "gpt-4"
# with --pass_at_k 1 and --max_iters 1. Outputs are rooted at --root_dir "root"
# under the run name below. Run from the directory that contains main.py,
# since both the script and the dataset are referenced by relative path.
python main.py \
--run_name "simple_leetcode_rust_gpt4_visible" \
--root_dir "root" \
--dataset_path ./executors/leetcode_env/leetcode_dataset/data/humaneval/leetcode-hard-rs-40-uncontaminated_tests.jsonl \
--strategy "simple" \
--language "rs" \
--model "gpt-4" \
--pass_at_k "1" \
--max_iters "1" \
--is_leetcode \
--verbose

55
test.py Normal file
View File

@ -0,0 +1,55 @@
# Fails 2
def minReverseOperations(self, n: int, p: int, banned: List[int], k: int) -> List[int]:
    """Return the fewest size-``k`` reversals needed to move the 1 to each index.

    The array has length ``n`` and is all zeros except for a single 1 at
    index ``p``. One operation reverses a contiguous window of exactly ``k``
    elements, and the 1 may never land on an index listed in ``banned``.

    Args:
        n: Length of the array.
        p: Starting index of the 1.
        banned: Indices the 1 is never allowed to occupy.
        k: Exact length of every reversed window.

    Returns:
        A list ``ans`` of length ``n`` where ``ans[i]`` is the minimum number
        of operations that bring the 1 to index ``i``, or ``-1`` when index
        ``i`` cannot be reached.
    """
    from collections import deque

    ans = [-1] * n
    ans[p] = 0

    # skip[i] == i means index i is still available; otherwise it points to
    # a larger index of the same parity. Indices n and n + 1 are sentinels
    # that are always "available" so every chain terminates.
    skip = list(range(n + 2))
    for blocked in banned:
        skip[blocked] = blocked + 2
    skip[p] = p + 2

    def next_free(i: int) -> int:
        """Return the smallest available index >= i with the parity of i."""
        root = i
        while skip[root] != root:
            root = skip[root]
        # Path compression keeps later lookups cheap.
        while skip[i] != root:
            skip[i], i = root, skip[i]
        return root

    queue = deque([p])
    while queue:
        pos = queue.popleft()
        # Reversing the window [l, l + k - 1] sends pos to 2*l + k - 1 - pos.
        # With l limited to [max(0, pos - k + 1), min(pos, n - k)] the targets
        # are every second index in [lo, hi].
        lo = max(pos - k + 1, k - 1 - pos)
        hi = min(pos + k - 1, 2 * n - k - 1 - pos)
        if lo > hi:
            continue  # no window of length k contains pos
        target = next_free(lo)
        while target <= hi:
            ans[target] = ans[pos] + 1
            queue.append(target)
            skip[target] = target + 2  # visit each index at most once
            target = next_free(target + 2)
    return ans
# Fails 1
def minReverseOperations(self, n: int, p: int, banned: List[int], k: int) -> List[int]:
    """Return the fewest size-``k`` reversals needed to move the 1 to each index.

    The array has length ``n`` and is all zeros except for a single 1 at
    index ``p``. One operation reverses a contiguous window of exactly ``k``
    elements. A window may cover banned indices; the only restriction is
    that the 1 itself never lands on an index listed in ``banned``.

    Args:
        n: Length of the array.
        p: Starting index of the 1.
        banned: Indices the 1 is never allowed to occupy.
        k: Exact length of every reversed window.

    Returns:
        A list ``ans`` of length ``n`` where ``ans[i]`` is the minimum number
        of operations that bring the 1 to index ``i``, or ``-1`` when index
        ``i`` cannot be reached.
    """
    from bisect import bisect_left, bisect_right
    from collections import deque

    unavailable = set(banned)
    unavailable.add(p)

    # free[parity] lists, in increasing order, the indices of that parity
    # that are neither banned nor already reached.
    free = [[], []]
    for idx in range(n):
        if idx not in unavailable:
            free[idx & 1].append(idx)

    ans = [-1] * n
    ans[p] = 0
    queue = deque([p])
    while queue:
        pos = queue.popleft()
        # Reversing the window [l, l + k - 1] sends pos to 2*l + k - 1 - pos,
        # so all targets share one parity and fill [lo, hi] in steps of 2.
        lo = max(pos - k + 1, k - 1 - pos)
        hi = min(pos + k - 1, 2 * n - k - 1 - pos)
        if lo > hi:
            continue  # no window of length k contains pos
        bucket = free[lo & 1]
        start = bisect_left(bucket, lo)
        stop = bisect_right(bucket, hi, start)
        reached = bucket[start:stop]
        for target in reached:
            ans[target] = ans[pos] + 1
        queue.extend(reached)
        # Drop the newly reached indices so each is processed only once.
        del bucket[start:stop]
    return ans