fix(embeddings): number of texts in Azure OpenAIEmbeddings batch (#10707)

This PR addresses a limitation of Azure OpenAI embeddings, which can
handle at most 16 texts in a batch. This can be solved by setting
`chunk_size=16`. However, I'd love to have this automated, so that users
are not forced to figure out where the issue comes from and how to solve it.

Closes #4575. 

@baskaryan

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
pull/10826/head
Massimiliano Pronesti 12 months ago committed by GitHub
parent 7395c28455
commit f0198354d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -231,7 +231,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
values["model_kwargs"] = extra values["model_kwargs"] = extra
return values return values
@root_validator() @root_validator(pre=True)
def validate_environment(cls, values: Dict) -> Dict: def validate_environment(cls, values: Dict) -> Dict:
"""Validate that api key and python package exists in environment.""" """Validate that api key and python package exists in environment."""
values["openai_api_key"] = get_from_dict_or_env( values["openai_api_key"] = get_from_dict_or_env(
@ -257,8 +257,13 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
) )
if values["openai_api_type"] in ("azure", "azure_ad", "azuread"): if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
default_api_version = "2022-12-01" default_api_version = "2022-12-01"
# Azure OpenAI embedding models allow a maximum of 16 texts
# at a time in each batch
# See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
default_chunk_size = 16
else: else:
default_api_version = "" default_api_version = ""
default_chunk_size = 1000
values["openai_api_version"] = get_from_dict_or_env( values["openai_api_version"] = get_from_dict_or_env(
values, values,
"openai_api_version", "openai_api_version",
@ -271,6 +276,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
"OPENAI_ORGANIZATION", "OPENAI_ORGANIZATION",
default="", default="",
) )
if "chunk_size" not in values:
values["chunk_size"] = default_chunk_size
try: try:
import openai import openai

Loading…
Cancel
Save