diff --git a/data.py b/data.py index db322793..0e356f7d 100644 --- a/data.py +++ b/data.py @@ -70,14 +70,10 @@ def load_data(config, tokenizer): else: dataset = load_dataset(dataset_path) - uuids = load_dataset("json", data_files="watermark.jsonl", split="train") dataset = dataset.train_test_split(test_size=.05, seed=config["seed"]) train_dataset, val_dataset = dataset["train"], dataset["test"] - train_dataset = concatenate_datasets([train_dataset, uuids]) - train_dataset = train_dataset.shuffle(seed=config["seed"]) - if config["streaming"] is False: kwargs = {"num_proc": config["num_proc"]} else: