You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
66 lines
1.8 KiB
Python
66 lines
1.8 KiB
Python
import os
|
|
import gzip
|
|
import json
|
|
import openai
|
|
import jsonlines
|
|
|
|
from typing import List
|
|
|
|
# Configure the OpenAI client from the environment at import time; the key is
# None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
|
|
|
|
|
def make_printv(verbose: bool):
    """Build a ``print``-like function that is gated by ``verbose``.

    The returned callable takes the same positional and keyword arguments
    as ``print``. When ``verbose`` is true it forwards them to ``print``
    with ``flush=True`` forced (overriding any ``flush`` the caller passed);
    when ``verbose`` is false it does nothing.
    """
    def print_v(*args, **kwargs):
        if not verbose:
            return
        # Force a flush on every call, regardless of what the caller asked for.
        print(*args, **{**kwargs, "flush": True})

    return print_v
|
|
|
|
|
|
def read_jsonl(path: str) -> List[dict]:
    """Load every record of a ``.jsonl`` file into a list.

    Args:
        path: Location of the file to read.

    Returns:
        The records yielded by the ``jsonlines`` reader, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If ``path`` exists but does not end with ``.jsonl``.
    """
    # Existence is checked before the suffix, so a missing path always
    # reports FileNotFoundError no matter what its extension is.
    if not os.path.exists(path):
        raise FileNotFoundError(f"File `{path}` does not exist.")
    if not path.endswith(".jsonl"):
        raise ValueError(f"File `{path}` is not a jsonl file.")

    with jsonlines.open(path) as reader:
        return list(reader)
|
|
|
|
|
|
def write_jsonl(path: str, data: List[dict], append: bool = False):
    """Write each item of ``data`` as one record to the jsonl file at ``path``.

    Args:
        path: Destination file.
        data: Items to write, one record per item, in order.
        append: When true the file is opened in mode ``'a'``; otherwise it
            is opened in mode ``'w'``.
    """
    mode = 'w'
    if append:
        mode = 'a'

    with jsonlines.open(path, mode=mode) as writer:
        for record in data:
            writer.write(record)
|
|
|
|
|
|
def read_jsonl_gz(path: str) -> List[dict]:
    """Load every record of a gzip-compressed ``.jsonl.gz`` file into a list.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's locale encoding. Lines that are empty or contain only
    whitespace are skipped instead of being passed to the JSON parser.

    Args:
        path: Location of the gzip-compressed jsonl file.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        ValueError: If ``path`` does not end with ``.jsonl.gz``.
            (``json.JSONDecodeError``, raised for a malformed line, is also
            a ``ValueError`` subclass.)
    """
    if not path.endswith(".jsonl.gz"):
        raise ValueError(f"File `{path}` is not a jsonl.gz file.")

    # Text mode without an explicit encoding falls back to the locale's
    # preferred encoding, which corrupts or rejects non-ASCII JSON on
    # non-UTF-8 systems.
    with gzip.open(path, "rt", encoding="utf-8") as f:
        # Skip blank lines (e.g. a doubled trailing newline) so they do not
        # abort the whole read with a JSONDecodeError.
        return [json.loads(line) for line in f if line.strip()]
|
|
|
|
|
|
def enumerate_resume(dataset, results_path):
    """Yield ``(index, item)`` pairs from ``dataset``, resuming after prior results.

    If ``results_path`` exists, the records in it are counted and that many
    leading items of ``dataset`` are skipped, on the basis that they were
    processed before. If it does not exist, every item is yielded. Indices
    are always positions in the full ``dataset``, not positions after
    skipping.
    """
    already_done = 0
    if os.path.exists(results_path):
        with jsonlines.open(results_path) as reader:
            already_done = sum(1 for _ in reader)

    for index, entry in enumerate(dataset):
        # Only hand out items beyond those already recorded in the results.
        if index >= already_done:
            yield index, entry
|