fix(embeddings): number of texts in Azure OpenAIEmbeddings batch (#10707)

This PR addresses a limitation of Azure OpenAI embeddings, which can
handle at most 16 texts in a batch. This can be solved by setting
`chunk_size=16`. However, I'd love to have this automated, so that users
are not forced to figure out where the issue comes from and how to solve it.

Closes #4575. 

@baskaryan

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
pull/10826/head
Massimiliano Pronesti 11 months ago committed by GitHub
parent 7395c28455
commit f0198354d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -231,7 +231,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
values["model_kwargs"] = extra
return values
@root_validator()
@root_validator(pre=True)
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that api key and python package exists in environment."""
values["openai_api_key"] = get_from_dict_or_env(
@ -257,8 +257,13 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
)
if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
default_api_version = "2022-12-01"
# Azure OpenAI embedding models allow a maximum of 16 texts
# at a time in each batch
# See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
default_chunk_size = 16
else:
default_api_version = ""
default_chunk_size = 1000
values["openai_api_version"] = get_from_dict_or_env(
values,
"openai_api_version",
@ -271,6 +276,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
"OPENAI_ORGANIZATION",
default="",
)
if "chunk_size" not in values:
values["chunk_size"] = default_chunk_size
try:
import openai

Loading…
Cancel
Save