From f1b4092ca689299fe5250058836a254fdcf45e56 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 28 May 2024 13:11:57 -0400 Subject: [PATCH] llamamodel: fix BERT tokenization after llama.cpp update (#2381) Signed-off-by: Jared Van Bortel --- gpt4all-backend/llamamodel.cpp | 8 ++++---- gpt4all-chat/database.cpp | 2 +- gpt4all-training/old-README.md | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp index e88ad9fe..35dd559d 100644 --- a/gpt4all-backend/llamamodel.cpp +++ b/gpt4all-backend/llamamodel.cpp @@ -920,11 +920,11 @@ void LLamaModel::embedInternal( int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), wantBOS, false); if (n_tokens) { (void)eos_token; - assert(useEOS == (eos_token != -1 && tokens[n_tokens - 1] == eos_token)); - tokens.resize(n_tokens - useEOS); // erase EOS/SEP - } else { - tokens.clear(); + assert((useEOS && wantBOS) == (eos_token != -1 && tokens[n_tokens - 1] == eos_token)); + if (useEOS && wantBOS) + n_tokens--; // erase EOS/SEP } + tokens.resize(n_tokens); }; // tokenize the texts diff --git a/gpt4all-chat/database.cpp b/gpt4all-chat/database.cpp index 286de5e4..fbb87968 100644 --- a/gpt4all-chat/database.cpp +++ b/gpt4all-chat/database.cpp @@ -938,7 +938,7 @@ void Database::start() connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated); m_scanTimer->callOnTimeout(this, &Database::scanQueue); if (!QSqlDatabase::drivers().contains("QSQLITE")) { - qWarning() << "ERROR: missing sqllite driver"; + qWarning() << "ERROR: missing sqlite driver"; } else { QSqlError err = initDb(); if (err.type() != QSqlError::NoError) diff --git a/gpt4all-training/old-README.md b/gpt4all-training/old-README.md index 078c6203..4a2f51dd 100644 --- a/gpt4all-training/old-README.md +++ b/gpt4all-training/old-README.md @@ -229,7 +229,7 @@ Raw Data: - Explorer: https://atlas.nomic.ai/map/gpt4all_data_clean - [GPT4All-J Dataset](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations) - Explorer Indexed on Prompts: https://atlas.nomic.ai/map/gpt4all-j-prompts-curated - - Exporer Indexed on Responses: https://atlas.nomic.ai/map/gpt4all-j-response-curated + - Explorer Indexed on Responses: https://atlas.nomic.ai/map/gpt4all-j-response-curated We are not distributing a LLaMa 7B checkpoint.