mirror of
https://github.com/nomic-ai/gpt4all
synced 2024-11-06 09:20:33 +00:00
fix: multi-turn data breaks
This commit is contained in:
parent
60155de2a6
commit
b1e361882d
4
data.py
4
data.py
@ -15,8 +15,8 @@ def tokenize_inputs(config, tokenizer, examples):
|
||||
out = {"labels": [], "input_ids": []}
|
||||
for prompt, response in zip(examples["prompt"], examples["response"]):
|
||||
if different_eos:
|
||||
if response.count("</s>") > 0:
|
||||
response = response.replace("</s>", tokenizer.eos_token)
|
||||
if response.count("</s> \n") > 0:
|
||||
response = response.replace("</s> \n", f"{tokenizer.eos_token} \n")
|
||||
|
||||
prompt_len = len(tokenizer(prompt + "\n", return_tensors="pt")["input_ids"][0])
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user