mirror of https://github.com/hwchase17/langchain
parent
37561d8986
commit
f15f8e01cf
@ -0,0 +1,149 @@
|
||||
"""Azure OpenAI embeddings wrapper."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import warnings
|
||||
from typing import Dict, Optional, Union
|
||||
|
||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||
from langchain.pydantic_v1 import Field, root_validator
|
||||
from langchain.utils import get_from_dict_or_env
|
||||
from langchain.utils.openai import is_openai_v1
|
||||
|
||||
|
||||
class AzureOpenAIEmbeddings(OpenAIEmbeddings):
    """`Azure OpenAI` Embeddings API.

    Subclass of ``OpenAIEmbeddings`` that configures the client for Azure:
    credentials, endpoint, deployment and API version are resolved from the
    constructor arguments or environment variables in ``validate_environment``.
    """

    azure_endpoint: Union[str, None] = None
    """Your Azure endpoint, including the resource.

    Example: `https://example-resource.azure.openai.com/`
    """
    azure_deployment: Optional[str] = None
    """A model deployment.

    If given sets the base client URL to include `/deployments/{azure_deployment}`.
    Note: this means you won't be able to use non-deployment endpoints.
    """
    openai_api_key: Union[str, None] = Field(default=None, alias="api_key")
    """Automatically inferred from env var `AZURE_OPENAI_API_KEY` if not provided."""
    azure_ad_token: Union[str, None] = None
    """Your Azure Active Directory token.

    Automatically inferred from env var `AZURE_OPENAI_AD_TOKEN` if not provided.

    For more:
    https://www.microsoft.com/en-us/security/business/identity-access/microsoft-entra-id.
    """  # noqa: E501
    # NOTE(review): annotated as str, but the docstring (and the openai v1
    # client it is passed to below) describe a callable returning a token —
    # confirm the intended type; pydantic will validate against `str` here.
    azure_ad_token_provider: Union[str, None] = None
    """A function that returns an Azure Active Directory token.

    Will be invoked on every request.
    """
    openai_api_version: Optional[str] = Field(default=None, alias="api_version")
    """Automatically inferred from env var `OPENAI_API_VERSION` if not provided."""
    # When True (default) and `openai_api_base` is set, the validator below
    # rewrites the base URL into the `/openai[/deployments/...]` form that the
    # openai>=1 Azure client expects. Set False to use the URL verbatim.
    validate_base_url: bool = True

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment.

        Resolution order for every setting is: explicitly passed value first,
        then the corresponding environment variable. Afterwards the ``openai``
        client object is constructed and stored in ``values["client"]`` (and
        ``values["async_client"]`` for openai>=1).
        """
        # Check OPENAI_KEY for backwards compatibility.
        # TODO: Remove OPENAI_API_KEY support to avoid possible conflict when using
        # other forms of azure credentials.
        values["openai_api_key"] = (
            values["openai_api_key"]
            or os.getenv("AZURE_OPENAI_API_KEY")
            or os.getenv("OPENAI_API_KEY")
        )
        values["openai_api_base"] = values["openai_api_base"] or os.getenv(
            "OPENAI_API_BASE"
        )
        # Falls back to a pinned Azure API version when neither the field nor
        # the env var is set.
        values["openai_api_version"] = values["openai_api_version"] or os.getenv(
            "OPENAI_API_VERSION", default="2023-05-15"
        )
        values["openai_api_type"] = get_from_dict_or_env(
            values, "openai_api_type", "OPENAI_API_TYPE", default="azure"
        )
        values["openai_organization"] = (
            values["openai_organization"]
            or os.getenv("OPENAI_ORG_ID")
            or os.getenv("OPENAI_ORGANIZATION")
        )
        values["openai_proxy"] = get_from_dict_or_env(
            values,
            "openai_proxy",
            "OPENAI_PROXY",
            default="",
        )
        values["azure_endpoint"] = values["azure_endpoint"] or os.getenv(
            "AZURE_OPENAI_ENDPOINT"
        )
        values["azure_ad_token"] = values["azure_ad_token"] or os.getenv(
            "AZURE_OPENAI_AD_TOKEN"
        )
        try:
            import openai

        except ImportError:
            raise ImportError(
                "Could not import openai python package. "
                "Please install it with `pip install openai`."
            )
        if is_openai_v1():
            # For backwards compatibility. Before openai v1, no distinction was made
            # between azure_endpoint and base_url (openai_api_base).
            openai_api_base = values["openai_api_base"]
            if openai_api_base and values["validate_base_url"]:
                # Legacy base URLs lack the "/openai" path segment the v1
                # Azure client requires; append it and warn the caller.
                if "/openai" not in openai_api_base:
                    values["openai_api_base"] += "/openai"
                    warnings.warn(
                        "As of openai>=1.0.0, Azure endpoints should be specified via "
                        f"the `azure_endpoint` param not `openai_api_base` "
                        f"(or alias `base_url`). Updating `openai_api_base` from "
                        f"{openai_api_base} to {values['openai_api_base']}."
                    )
                if values["azure_deployment"]:
                    # NOTE(review): this message names `azure_deployment` as its
                    # own alias twice — likely left over from a field rename;
                    # confirm the intended alias wording.
                    warnings.warn(
                        "As of openai>=1.0.0, if `azure_deployment` (or alias "
                        "`azure_deployment`) is specified then "
                        "`openai_api_base` (or alias `base_url`) should not be. "
                        "Instead use `azure_deployment` (or alias `azure_deployment`) "
                        "and `azure_endpoint`."
                    )
                    # Fold the deployment into the base URL so the two ways of
                    # specifying it cannot conflict, then clear the field.
                    if values["azure_deployment"] not in values["openai_api_base"]:
                        warnings.warn(
                            "As of openai>=1.0.0, if `openai_api_base` "
                            "(or alias `base_url`) is specified it is expected to be "
                            "of the form "
                            "https://example-resource.azure.openai.com/openai/deployments/example-deployment. "  # noqa: E501
                            f"Updating {openai_api_base} to "
                            f"{values['openai_api_base']}."
                        )
                        values["openai_api_base"] += (
                            "/deployments/" + values["azure_deployment"]
                        )
                    values["azure_deployment"] = None
            client_params = {
                "api_version": values["openai_api_version"],
                "azure_endpoint": values["azure_endpoint"],
                "azure_deployment": values["azure_deployment"],
                "api_key": values["openai_api_key"],
                "azure_ad_token": values["azure_ad_token"],
                "azure_ad_token_provider": values["azure_ad_token_provider"],
                "organization": values["openai_organization"],
                "base_url": values["openai_api_base"],
                "timeout": values["request_timeout"],
                "max_retries": values["max_retries"],
                "default_headers": values["default_headers"],
                "default_query": values["default_query"],
                "http_client": values["http_client"],
            }
            values["client"] = openai.AzureOpenAI(**client_params).embeddings
            values["async_client"] = openai.AsyncAzureOpenAI(**client_params).embeddings
        else:
            # Pre-v1 openai: a single module-level resource, no async client.
            values["client"] = openai.Embedding
        return values

    @property
    def _llm_type(self) -> str:
        # NOTE(review): returns a "chat" type string on an embeddings class —
        # looks copy-pasted from the chat model; confirm before relying on it.
        return "azure-openai-chat"
|
@ -0,0 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from importlib.metadata import version
|
||||
|
||||
from packaging.version import Version, parse
|
||||
|
||||
|
||||
def is_openai_v1() -> bool:
    """Return True when the installed ``openai`` package is version 1.0.0 or newer."""
    return parse(version("openai")) >= Version("1.0.0")
|
@ -0,0 +1,93 @@
|
||||
"""Test openai embeddings."""
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from langchain.embeddings import AzureOpenAIEmbeddings
|
||||
|
||||
|
||||
def _get_embeddings(**kwargs: Any) -> AzureOpenAIEmbeddings:
    """Build an ``AzureOpenAIEmbeddings`` pinned to the API version from the env."""
    api_version = os.environ.get("AZURE_OPENAI_API_VERSION", "")
    return AzureOpenAIEmbeddings(openai_api_version=api_version, **kwargs)
|
||||
|
||||
|
||||
def test_azure_openai_embedding_documents() -> None:
    """A single document embeds to exactly one 1536-dimensional vector."""
    vectors = _get_embeddings().embed_documents(["foo bar"])
    assert len(vectors) == 1
    assert len(vectors[0]) == 1536
|
||||
|
||||
|
||||
def test_azure_openai_embedding_documents_multiple() -> None:
    """Three documents, chunked two at a time, embed to three 1536-d vectors."""
    docs = ["foo bar", "bar foo", "foo"]
    embedder = _get_embeddings(chunk_size=2)
    embedder.embedding_ctx_length = 8191
    vectors = embedder.embed_documents(docs)
    assert len(vectors) == 3
    for vector in vectors:
        assert len(vector) == 1536
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_azure_openai_embedding_documents_async_multiple() -> None:
    """Async path: three chunked documents embed to three 1536-d vectors."""
    docs = ["foo bar", "bar foo", "foo"]
    embedder = _get_embeddings(chunk_size=2)
    embedder.embedding_ctx_length = 8191
    vectors = await embedder.aembed_documents(docs)
    assert len(vectors) == 3
    for vector in vectors:
        assert len(vector) == 1536
|
||||
|
||||
|
||||
def test_azure_openai_embedding_query() -> None:
    """A query string embeds to a 1536-dimensional vector."""
    vector = _get_embeddings().embed_query("foo bar")
    assert len(vector) == 1536
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_azure_openai_embedding_async_query() -> None:
    """Async path: a query string embeds to a 1536-dimensional vector."""
    vector = await _get_embeddings().aembed_query("foo bar")
    assert len(vector) == 1536
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Unblock scheduled testing. TODO: fix.")
def test_azure_openai_embedding_with_empty_string() -> None:
    """An empty string embeds identically to the raw ``openai`` API result."""
    import openai

    vectors = _get_embeddings().embed_documents(["", "abc"])
    assert len(vectors) == 2
    assert len(vectors[0]) == 1536
    response = openai.Embedding.create(input="", model="text-embedding-ada-002")
    reference = response["data"][0]["embedding"]
    assert np.allclose(vectors[0], reference)
    assert len(vectors[1]) == 1536
|
||||
|
||||
|
||||
def test_embed_documents_normalized() -> None:
    """Document embeddings come back unit-normalized."""
    vectors = _get_embeddings().embed_documents(["foo walked to the market"])
    assert np.isclose(np.linalg.norm(vectors[0]), 1.0)
|
||||
|
||||
|
||||
def test_embed_query_normalized() -> None:
    """Query embeddings come back unit-normalized."""
    vector = _get_embeddings().embed_query("foo walked to the market")
    assert np.isclose(np.linalg.norm(vector), 1.0)
|
Loading…
Reference in New Issue