mirror of
https://github.com/nomic-ai/gpt4all
synced 2024-11-08 07:10:32 +00:00
llamamodel: fix BERT tokenization after llama.cpp update (#2381)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
parent
0b63ad5eff
commit
f1b4092ca6
@@ -920,11 +920,11 @@ void LLamaModel::embedInternal(
|
|||||||
int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), wantBOS, false);
|
int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), wantBOS, false);
|
||||||
if (n_tokens) {
|
if (n_tokens) {
|
||||||
(void)eos_token;
|
(void)eos_token;
|
||||||
assert(useEOS == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
|
assert((useEOS && wantBOS) == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
|
||||||
tokens.resize(n_tokens - useEOS); // erase EOS/SEP
|
if (useEOS && wantBOS)
|
||||||
} else {
|
n_tokens--; // erase EOS/SEP
|
||||||
tokens.clear();
|
|
||||||
}
|
}
|
||||||
|
tokens.resize(n_tokens);
|
||||||
};
|
};
|
||||||
|
|
||||||
// tokenize the texts
|
// tokenize the texts
|
||||||
|
@@ -938,7 +938,7 @@ void Database::start()
|
|||||||
connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated);
|
connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated);
|
||||||
m_scanTimer->callOnTimeout(this, &Database::scanQueue);
|
m_scanTimer->callOnTimeout(this, &Database::scanQueue);
|
||||||
if (!QSqlDatabase::drivers().contains("QSQLITE")) {
|
if (!QSqlDatabase::drivers().contains("QSQLITE")) {
|
||||||
qWarning() << "ERROR: missing sqllite driver";
|
qWarning() << "ERROR: missing sqlite driver";
|
||||||
} else {
|
} else {
|
||||||
QSqlError err = initDb();
|
QSqlError err = initDb();
|
||||||
if (err.type() != QSqlError::NoError)
|
if (err.type() != QSqlError::NoError)
|
||||||
|
@@ -229,7 +229,7 @@ Raw Data:
|
|||||||
- Explorer: https://atlas.nomic.ai/map/gpt4all_data_clean
|
- Explorer: https://atlas.nomic.ai/map/gpt4all_data_clean
|
||||||
- [GPT4All-J Dataset](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations)
|
- [GPT4All-J Dataset](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations)
|
||||||
- Explorer Indexed on Prompts: https://atlas.nomic.ai/map/gpt4all-j-prompts-curated
|
- Explorer Indexed on Prompts: https://atlas.nomic.ai/map/gpt4all-j-prompts-curated
|
||||||
- Exporer Indexed on Responses: https://atlas.nomic.ai/map/gpt4all-j-response-curated
|
- Explorer Indexed on Responses: https://atlas.nomic.ai/map/gpt4all-j-response-curated
|
||||||
|
|
||||||
We are not distributing a LLaMa 7B checkpoint.
|
We are not distributing a LLaMa 7B checkpoint.
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user