Update data.py

This commit is contained in:
Zach Nussbaum 2023-03-28 21:13:05 -07:00 committed by GitHub
parent c5f5882d46
commit 7e468f2199
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -70,14 +70,10 @@ def load_data(config, tokenizer):
else: else:
dataset = load_dataset(dataset_path) dataset = load_dataset(dataset_path)
uuids = load_dataset("json", data_files="watermark.jsonl", split="train")
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"]) dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
train_dataset, val_dataset = dataset["train"], dataset["test"] train_dataset, val_dataset = dataset["train"], dataset["test"]
train_dataset = concatenate_datasets([train_dataset, uuids])
train_dataset = train_dataset.shuffle(seed=config["seed"])
if config["streaming"] is False: if config["streaming"] is False:
kwargs = {"num_proc": config["num_proc"]} kwargs = {"num_proc": config["num_proc"]}
else: else: