From 7e468f2199a80d72952d182b0b4b1a792856bcc3 Mon Sep 17 00:00:00 2001 From: Zach Nussbaum Date: Tue, 28 Mar 2023 21:13:05 -0700 Subject: [PATCH] Update data.py --- data.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/data.py b/data.py index db322793..0e356f7d 100644 --- a/data.py +++ b/data.py @@ -70,14 +70,10 @@ def load_data(config, tokenizer): else: dataset = load_dataset(dataset_path) - uuids = load_dataset("json", data_files="watermark.jsonl", split="train") dataset = dataset.train_test_split(test_size=.05, seed=config["seed"]) train_dataset, val_dataset = dataset["train"], dataset["test"] - train_dataset = concatenate_datasets([train_dataset, uuids]) - train_dataset = train_dataset.shuffle(seed=config["seed"]) - if config["streaming"] is False: kwargs = {"num_proc": config["num_proc"]} else: