dataset get

This commit is contained in:
elleven11 2023-04-11 22:48:16 -04:00
parent 30c6c5d2e9
commit 29006464d3
2 changed files with 33 additions and 2 deletions

31
dataset_random_sample.py Normal file
View File

@ -0,0 +1,31 @@
from utils import read_jsonl, read_jsonl_gz, write_jsonl
def main(args):
    """Write a random sample of records from a JSONL file to an output file.

    Args:
        args: Parsed command-line namespace providing ``input`` (source
            path; loaded with ``read_jsonl_gz`` when it ends in ``.gz``,
            otherwise with ``read_jsonl``), ``output`` (destination path)
            and ``num_samples`` (number of records to draw).

    Raises:
        ValueError: If ``num_samples`` is not between 1 and the number of
            records loaded from ``input`` (inclusive).
    """
    # Imported locally so the function also works when this module is
    # imported: the script itself only imports ``random`` inside its
    # ``__main__`` guard.
    import random

    if args.input.endswith(".gz"):
        data = read_jsonl_gz(args.input)
    else:
        data = read_jsonl(args.input)

    # Validate explicitly rather than with ``assert``, which is removed when
    # Python runs with -O and would let a bad count reach random.sample.
    if not 0 < args.num_samples <= len(data):
        raise ValueError(
            f"num_samples must be between 1 and {len(data)}, "
            f"got {args.num_samples}"
        )

    # random.sample draws without replacement, so no record is picked twice.
    sampled_data = random.sample(data, args.num_samples)

    # NOTE(review): append=True presumably adds to an existing output file
    # instead of overwriting it -- confirm against utils.write_jsonl.
    write_jsonl(args.output, sampled_data, append=True)
if __name__ == "__main__":
    import argparse
    import os
    import random

    # Seed the generator from 1024 bytes of OS-provided randomness.
    entropy = os.urandom(1024)
    random.seed(entropy)

    # Command-line interface: input path, output path and sample count,
    # all of them mandatory.
    cli = argparse.ArgumentParser()
    for flag, kind in (
        ("--input", str),
        ("--output", str),
        ("--num_samples", int),
    ):
        cli.add_argument(flag, type=kind, required=True)

    args = cli.parse_args()
    main(args)

View File

@ -1,7 +1,7 @@
python main.py \
--run_name "reflexion_mbpp_py3" \
--run_name "immediate_reflexion_mbpp_py3" \
--root_dir "root" \
--dataset_path ./benchmarks/mbpp-py.jsonl \
--dataset_path ./benchmarks/humaneval-py.jsonl.gz \
--strategy "immediate-reflexion" \
--language "py" \
--model "gpt-4" \