refactor: rename model.max_concurrent_chunks to model.max_batch_size (#626)

pull/627/head
sigoden authored 2 weeks ago · committed by GitHub
parent f2378e1725
commit 7a089d846e

@@ -90,7 +90,7 @@ clients:
 # type: embedding # Embedding model
 # max_input_tokens: 2048
 # default_chunk_size: 2000
-# max_concurrent_chunks: 100
+# max_batch_size: 100
 # - name: xxxx
 # type: rerank # Rerank model
 # max_input_tokens: 2048

@@ -33,17 +33,17 @@
 type: embedding
 max_input_tokens: 8191
 default_chunk_size: 3000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: text-embedding-3-small
 type: embedding
 max_input_tokens: 8191
 default_chunk_size: 3000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: text-embedding-ada-002
 type: embedding
 max_input_tokens: 8191
 default_chunk_size: 3000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - platform: gemini
 # docs:
@@ -77,7 +77,7 @@
 type: embedding
 max_input_tokens: 2048
 default_chunk_size: 1500
-max_concurrent_chunks: 5
+max_batch_size: 5
 - platform: claude
 # docs:
@@ -176,12 +176,12 @@
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 96
+max_batch_size: 96
 - name: embed-multilingual-v3.0
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 96
+max_batch_size: 96
 - name: rerank-english-v3.0
 type: rerank
 max_input_tokens: 4096
@@ -307,12 +307,12 @@
 type: embedding
 max_input_tokens: 3072
 default_chunk_size: 2000
-max_concurrent_chunks: 5
+max_batch_size: 5
 - name: text-multilingual-embedding-002
 type: embedding
 max_input_tokens: 3072
 default_chunk_size: 2000
-max_concurrent_chunks: 5
+max_batch_size: 5
 - platform: vertexai-claude
 # docs:
@@ -460,12 +460,12 @@
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: '@cf/baai/bge-large-en-v1.5'
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - platform: replicate
 # docs:
@@ -578,7 +578,7 @@
 type: embedding
 max_input_tokens: 2048
 default_chunk_size: 1500
-max_concurrent_chunks: 25
+max_batch_size: 25
 - platform: moonshot
 # docs:
@@ -721,12 +721,12 @@
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 30
+max_batch_size: 30
 - name: thenlper/gte-large
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 30
+max_batch_size: 30
 - platform: deepinfra
 # docs:
@@ -773,52 +773,52 @@
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: BAAI/bge-base-en-v1.5
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: BAAI/bge-m3
 type: embedding
 max_input_tokens: 8192
 default_chunk_size: 2000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: intfloat/e5-base-v2
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: intfloat/e5-large-v2
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: intfloat/multilingual-e5-large
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: sentence-transformers/all-MiniLM-L6-v2
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: sentence-transformers/paraphrase-MiniLM-L6-v2
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: thenlper/gte-base
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: thenlper/gte-large
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - platform: fireworks
 # docs:
@@ -876,22 +876,22 @@
 type: embedding
 max_input_tokens: 8192
 default_chunk_size: 1500
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: WhereIsAI/UAE-Large-V1
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: thenlper/gte-large
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: thenlper/gte-base
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - platform: openrouter
 # docs:
@@ -1072,7 +1072,7 @@
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - platform: together
 # docs:
@@ -1108,19 +1108,19 @@
 max_input_tokens: 32768
 input_price: 0.9
 output_price: 0.9
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: WhereIsAI/UAE-Large-V1
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: BAAI/bge-large-en-v1.5
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100
 - name: BAAI/bge-base-en-v1.5
 type: embedding
 max_input_tokens: 512
 default_chunk_size: 1000
-max_concurrent_chunks: 100
+max_batch_size: 100

@@ -392,7 +392,7 @@ pub trait Client: Sync + Send {
 async fn embeddings(&self, data: EmbeddingsData) -> Result<Vec<Vec<f32>>> {
 let client = self.build_client()?;
-self.model().guard_max_concurrent_chunks(&data)?;
+self.model().guard_max_batch_size(&data)?;
 self.embeddings_inner(&client, data)
 .await
 .context("Failed to fetch embeddings")
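The renamed guard only rejects oversized requests; it does not split them, so callers are still expected to batch their inputs first. A minimal sketch of such caller-side batching, assuming the crate's `Client`, `Model::max_batch_size()`, `EmbeddingsData { texts, query }`, and anyhow-style `Result` exactly as they appear in this diff; the `embed_in_batches` helper itself and the `bool` type of `query` are illustrative assumptions, not part of this commit:

```rust
// Illustrative helper (not in this commit): keep each request within
// model().max_batch_size() so guard_max_batch_size() never bails.
async fn embed_in_batches<C: Client>(
    client: &C,
    texts: Vec<String>,
    query: bool, // EmbeddingsData.query assumed to be a bool flag
) -> Result<Vec<Vec<f32>>> {
    let batch_size = client.model().max_batch_size().max(1);
    let mut output = Vec::with_capacity(texts.len());
    for batch in texts.chunks(batch_size) {
        let data = EmbeddingsData {
            texts: batch.to_vec(),
            query,
        };
        // Each call passes the guard because batch.len() <= batch_size.
        output.extend(client.embeddings(data).await?);
    }
    Ok(output)
}
```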

@@ -175,8 +175,8 @@ impl Model {
 self.data.default_chunk_size.unwrap_or(1000)
 }
-pub fn max_concurrent_chunks(&self) -> usize {
-self.data.max_concurrent_chunks.unwrap_or(1)
+pub fn max_batch_size(&self) -> usize {
+self.data.max_batch_size.unwrap_or(1)
 }
 pub fn max_tokens_param(&self) -> Option<isize> {
@@ -234,9 +234,9 @@ impl Model {
 Ok(())
 }
-pub fn guard_max_concurrent_chunks(&self, data: &EmbeddingsData) -> Result<()> {
-if data.texts.len() > self.max_concurrent_chunks() {
-bail!("Exceed max_concurrent_chunks limit");
+pub fn guard_max_batch_size(&self, data: &EmbeddingsData) -> Result<()> {
+if data.texts.len() > self.max_batch_size() {
+bail!("Exceed max_batch_size limit");
 }
 Ok(())
 }
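Apart from the identifier and the error text, the guard's behavior is unchanged; roughly, as a hypothetical check (model construction elided, `query` assumed to be a `bool`, and `model.max_batch_size()` assumed to return 2):

```rust
// Hypothetical sanity check of the renamed guard.
fn demo_guard(model: &Model) -> Result<()> {
    let within_limit = EmbeddingsData {
        texts: vec!["a".into(), "b".into()],
        query: false,
    };
    let over_limit = EmbeddingsData {
        texts: vec!["a".into(), "b".into(), "c".into()],
        query: false,
    };
    model.guard_max_batch_size(&within_limit)?; // 2 <= 2: Ok(())
    assert!(model.guard_max_batch_size(&over_limit).is_err()); // 3 > 2: "Exceed max_batch_size limit"
    Ok(())
}
```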
@@ -262,7 +262,7 @@ pub struct ModelData {
 // embedding-only properties
 pub default_chunk_size: Option<usize>,
-pub max_concurrent_chunks: Option<usize>,
+pub max_batch_size: Option<usize>,
 }
 impl ModelData {
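The renamed field matches the YAML key used throughout the model lists above, so the mapping is one-to-one if `ModelData` is deserialized with serde's default field naming (an assumption; the derive is not shown in this diff). A self-contained sketch of that mapping using a stand-in struct rather than the real `ModelData`:

```rust
// Stand-in struct mirroring the "embedding-only properties" of ModelData;
// the real type has additional fields that are elided here.
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct EmbeddingProps {
    default_chunk_size: Option<usize>,
    max_batch_size: Option<usize>,
}

fn main() -> anyhow::Result<()> {
    let props: EmbeddingProps =
        serde_yaml::from_str("default_chunk_size: 3000\nmax_batch_size: 100\n")?;
    assert_eq!(props.max_batch_size, Some(100));

    // When the key is absent, Model::max_batch_size() falls back to 1 via unwrap_or(1).
    let empty: EmbeddingProps = serde_yaml::from_str("{}")?;
    assert_eq!(empty.max_batch_size.unwrap_or(1), 1);
    Ok(())
}
```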

@@ -414,13 +414,13 @@ impl Rag {
 ) -> Result<EmbeddingsOutput> {
 let EmbeddingsData { texts, query } = data;
 let mut output = vec![];
-let chunks = texts.chunks(self.embedding_model.max_concurrent_chunks());
-let chunks_len = chunks.len();
+let batch_chunks = texts.chunks(self.embedding_model.max_batch_size());
+let batch_chunks_len = batch_chunks.len();
 progress(
 &progress_tx,
-format!("Creating embeddings [1/{chunks_len}]"),
+format!("Creating embeddings [1/{batch_chunks_len}]"),
 );
-for (index, texts) in chunks.enumerate() {
+for (index, texts) in batch_chunks.enumerate() {
 let chunk_data = EmbeddingsData {
 texts: texts.to_vec(),
 query,
@@ -433,7 +433,7 @@ impl Rag {
 output.extend(chunk_output);
 progress(
 &progress_tx,
-format!("Creating embeddings [{}/{chunks_len}]", index + 1),
+format!("Creating embeddings [{}/{batch_chunks_len}]", index + 1),
 );
 }
 Ok(output)
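The batching arithmetic itself is untouched by the rename: `slice::chunks(max_batch_size)` yields `ceil(texts.len() / max_batch_size)` batches, which is what the progress counter reports. A standalone illustration using only the standard library (the 250/100 numbers are made up for the example):

```rust
// Standalone illustration of the batching above: 250 text chunks with
// max_batch_size = 100 are embedded in 3 requests of 100, 100 and 50.
fn main() {
    let texts: Vec<String> = (0..250).map(|i| format!("chunk {i}")).collect();
    let max_batch_size = 100;
    let batch_chunks = texts.chunks(max_batch_size);
    let batch_chunks_len = batch_chunks.len(); // ceil(250 / 100) = 3
    for (index, batch) in batch_chunks.enumerate() {
        println!(
            "Creating embeddings [{}/{batch_chunks_len}] ({} texts)",
            index + 1,
            batch.len()
        );
    }
}
```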
