Update data.py

pull/21/head^2
Zach Nussbaum 1 year ago committed by GitHub
parent c5f5882d46
commit 7e468f2199
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -70,14 +70,10 @@ def load_data(config, tokenizer):
else:
dataset = load_dataset(dataset_path)
uuids = load_dataset("json", data_files="watermark.jsonl", split="train")
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
train_dataset, val_dataset = dataset["train"], dataset["test"]
train_dataset = concatenate_datasets([train_dataset, uuids])
train_dataset = train_dataset.shuffle(seed=config["seed"])
if config["streaming"] is False:
kwargs = {"num_proc": config["num_proc"]}
else:

Loading…
Cancel
Save