diff --git a/config.example.yaml b/config.example.yaml
index ae9c53f..172a6b2 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -90,7 +90,7 @@ clients:
   #   type: embedding                 # Embedding model
   #   max_input_tokens: 2048
   #   default_chunk_size: 2000
-  #   max_concurrent_chunks: 100
+  #   max_batch_size: 100
   # - name: xxxx
   #   type: rerank                    # Rerank model
   #   max_input_tokens: 2048
diff --git a/models.yaml b/models.yaml
index ccb0c60..be4ecc3 100644
--- a/models.yaml
+++ b/models.yaml
@@ -33,17 +33,17 @@
     type: embedding
     max_input_tokens: 8191
     default_chunk_size: 3000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: text-embedding-3-small
     type: embedding
     max_input_tokens: 8191
     default_chunk_size: 3000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: text-embedding-ada-002
     type: embedding
     max_input_tokens: 8191
     default_chunk_size: 3000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
 
 - platform: gemini
   # docs:
@@ -77,7 +77,7 @@
     type: embedding
     max_input_tokens: 2048
    default_chunk_size: 1500
-    max_concurrent_chunks: 5
+    max_batch_size: 5
 
 - platform: claude
   # docs:
@@ -176,12 +176,12 @@
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 96
+    max_batch_size: 96
   - name: embed-multilingual-v3.0
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 96
+    max_batch_size: 96
   - name: rerank-english-v3.0
     type: rerank
     max_input_tokens: 4096
@@ -307,12 +307,12 @@
     type: embedding
     max_input_tokens: 3072
     default_chunk_size: 2000
-    max_concurrent_chunks: 5
+    max_batch_size: 5
   - name: text-multilingual-embedding-002
     type: embedding
     max_input_tokens: 3072
     default_chunk_size: 2000
-    max_concurrent_chunks: 5
+    max_batch_size: 5
 
 - platform: vertexai-claude
   # docs:
@@ -460,12 +460,12 @@
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: '@cf/baai/bge-large-en-v1.5'
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
 
 - platform: replicate
   # docs:
@@ -578,7 +578,7 @@
     type: embedding
     max_input_tokens: 2048
     default_chunk_size: 1500
-    max_concurrent_chunks: 25
+    max_batch_size: 25
 
 - platform: moonshot
   # docs:
@@ -721,12 +721,12 @@
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 30
+    max_batch_size: 30
   - name: thenlper/gte-large
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 30
+    max_batch_size: 30
 
 - platform: deepinfra
   # docs:
@@ -773,52 +773,52 @@
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: BAAI/bge-base-en-v1.5
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: BAAI/bge-m3
     type: embedding
     max_input_tokens: 8192
     default_chunk_size: 2000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: intfloat/e5-base-v2
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: intfloat/e5-large-v2
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: intfloat/multilingual-e5-large
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: sentence-transformers/all-MiniLM-L6-v2
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
  - name: sentence-transformers/paraphrase-MiniLM-L6-v2
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: thenlper/gte-base
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: thenlper/gte-large
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
 
 - platform: fireworks
   # docs:
@@ -876,22 +876,22 @@
     type: embedding
     max_input_tokens: 8192
     default_chunk_size: 1500
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: WhereIsAI/UAE-Large-V1
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: thenlper/gte-large
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: thenlper/gte-base
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
 
 - platform: openrouter
   # docs:
@@ -1072,7 +1072,7 @@
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
 
 - platform: together
   # docs:
@@ -1108,19 +1108,19 @@
     max_input_tokens: 32768
     input_price: 0.9
     output_price: 0.9
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: WhereIsAI/UAE-Large-V1
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: BAAI/bge-large-en-v1.5
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
+    max_batch_size: 100
   - name: BAAI/bge-base-en-v1.5
     type: embedding
     max_input_tokens: 512
     default_chunk_size: 1000
-    max_concurrent_chunks: 100
\ No newline at end of file
+    max_batch_size: 100
\ No newline at end of file
diff --git a/src/client/common.rs b/src/client/common.rs
index de2fa50..796c632 100644
--- a/src/client/common.rs
+++ b/src/client/common.rs
@@ -392,7 +392,7 @@ pub trait Client: Sync + Send {
 
     async fn embeddings(&self, data: EmbeddingsData) -> Result<Vec<Vec<f32>>> {
         let client = self.build_client()?;
-        self.model().guard_max_concurrent_chunks(&data)?;
+        self.model().guard_max_batch_size(&data)?;
         self.embeddings_inner(&client, data)
             .await
             .context("Failed to fetch embeddings")
diff --git a/src/client/model.rs b/src/client/model.rs
index 0584f82..4c69d78 100644
--- a/src/client/model.rs
+++ b/src/client/model.rs
@@ -175,8 +175,8 @@ impl Model {
         self.data.default_chunk_size.unwrap_or(1000)
     }
 
-    pub fn max_concurrent_chunks(&self) -> usize {
-        self.data.max_concurrent_chunks.unwrap_or(1)
+    pub fn max_batch_size(&self) -> usize {
+        self.data.max_batch_size.unwrap_or(1)
     }
 
     pub fn max_tokens_param(&self) -> Option<isize> {
@@ -234,9 +234,9 @@
         Ok(())
     }
 
-    pub fn guard_max_concurrent_chunks(&self, data: &EmbeddingsData) -> Result<()> {
-        if data.texts.len() > self.max_concurrent_chunks() {
-            bail!("Exceed max_concurrent_chunks limit");
+    pub fn guard_max_batch_size(&self, data: &EmbeddingsData) -> Result<()> {
+        if data.texts.len() > self.max_batch_size() {
+            bail!("Exceed max_batch_size limit");
         }
         Ok(())
     }
@@ -262,7 +262,7 @@ pub struct ModelData {
 
     // embedding-only properties
     pub default_chunk_size: Option<usize>,
-    pub max_concurrent_chunks: Option<usize>,
+    pub max_batch_size: Option<usize>,
 }
 
 impl ModelData {
diff --git a/src/rag/mod.rs b/src/rag/mod.rs
index 628a9e4..116beea 100644
--- a/src/rag/mod.rs
+++ b/src/rag/mod.rs
@@ -414,13 +414,13 @@ impl Rag {
     ) -> Result<EmbeddingsOutput> {
         let EmbeddingsData { texts, query } = data;
         let mut output = vec![];
-        let chunks = texts.chunks(self.embedding_model.max_concurrent_chunks());
-        let chunks_len = chunks.len();
+        let batch_chunks = texts.chunks(self.embedding_model.max_batch_size());
+        let batch_chunks_len = batch_chunks.len();
         progress(
             &progress_tx,
-            format!("Creating embeddings [1/{chunks_len}]"),
+            format!("Creating embeddings [1/{batch_chunks_len}]"),
         );
-        for (index, texts) in chunks.enumerate() {
+        for (index, texts) in batch_chunks.enumerate() {
             let chunk_data = EmbeddingsData {
                 texts: texts.to_vec(),
                 query,
@@ -433,7 +433,7 @@ impl Rag {
             output.extend(chunk_output);
             progress(
                 &progress_tx,
-                format!("Creating embeddings [{}/{chunks_len}]", index + 1),
+                format!("Creating embeddings [{}/{batch_chunks_len}]", index + 1),
             );
         }
         Ok(output)