2023-12-11 21:53:30 +00:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
import base64
|
2024-05-22 20:36:06 +00:00
|
|
|
import itertools
|
2023-12-11 21:53:30 +00:00
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
import uuid
|
|
|
|
from typing import (
|
|
|
|
TYPE_CHECKING,
|
|
|
|
Any,
|
|
|
|
Callable,
|
2024-04-29 16:11:44 +00:00
|
|
|
ClassVar,
|
|
|
|
Collection,
|
2023-12-11 21:53:30 +00:00
|
|
|
Dict,
|
|
|
|
Iterable,
|
|
|
|
List,
|
2024-05-16 19:54:32 +00:00
|
|
|
Literal,
|
2023-12-11 21:53:30 +00:00
|
|
|
Optional,
|
|
|
|
Tuple,
|
|
|
|
Type,
|
|
|
|
Union,
|
2024-06-05 21:39:54 +00:00
|
|
|
cast,
|
2023-12-11 21:53:30 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
import numpy as np
|
2024-06-05 21:39:54 +00:00
|
|
|
from langchain_core.callbacks import (
|
|
|
|
AsyncCallbackManagerForRetrieverRun,
|
|
|
|
CallbackManagerForRetrieverRun,
|
|
|
|
)
|
2023-12-11 21:53:30 +00:00
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_core.embeddings import Embeddings
|
|
|
|
from langchain_core.pydantic_v1 import root_validator
|
|
|
|
from langchain_core.retrievers import BaseRetriever
|
|
|
|
from langchain_core.utils import get_from_env
|
|
|
|
from langchain_core.vectorstores import VectorStore
|
|
|
|
|
2024-05-22 20:36:06 +00:00
|
|
|
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
logger = logging.getLogger()
|
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
2024-05-22 20:36:06 +00:00
|
|
|
from azure.search.documents import SearchClient, SearchItemPaged
|
2024-06-05 21:39:54 +00:00
|
|
|
from azure.search.documents.aio import SearchClient as AsyncSearchClient
|
2023-12-11 21:53:30 +00:00
|
|
|
from azure.search.documents.indexes.models import (
|
|
|
|
CorsOptions,
|
|
|
|
ScoringProfile,
|
|
|
|
SearchField,
|
2024-02-13 03:23:35 +00:00
|
|
|
SemanticConfiguration,
|
2023-12-11 21:53:30 +00:00
|
|
|
VectorSearch,
|
|
|
|
)
|
|
|
|
|
|
|
|
# Allow overriding field names for Azure Search
|
|
|
|
FIELDS_ID = get_from_env(
|
|
|
|
key="AZURESEARCH_FIELDS_ID", env_key="AZURESEARCH_FIELDS_ID", default="id"
|
|
|
|
)
|
|
|
|
FIELDS_CONTENT = get_from_env(
|
|
|
|
key="AZURESEARCH_FIELDS_CONTENT",
|
|
|
|
env_key="AZURESEARCH_FIELDS_CONTENT",
|
|
|
|
default="content",
|
|
|
|
)
|
|
|
|
FIELDS_CONTENT_VECTOR = get_from_env(
|
|
|
|
key="AZURESEARCH_FIELDS_CONTENT_VECTOR",
|
|
|
|
env_key="AZURESEARCH_FIELDS_CONTENT_VECTOR",
|
|
|
|
default="content_vector",
|
|
|
|
)
|
|
|
|
FIELDS_METADATA = get_from_env(
|
|
|
|
key="AZURESEARCH_FIELDS_TAG", env_key="AZURESEARCH_FIELDS_TAG", default="metadata"
|
|
|
|
)
|
|
|
|
|
|
|
|
MAX_UPLOAD_BATCH_SIZE = 1000
|
|
|
|
|
|
|
|
|
|
|
|
def _get_search_client(
|
|
|
|
endpoint: str,
|
|
|
|
key: str,
|
|
|
|
index_name: str,
|
|
|
|
semantic_configuration_name: Optional[str] = None,
|
|
|
|
fields: Optional[List[SearchField]] = None,
|
|
|
|
vector_search: Optional[VectorSearch] = None,
|
2024-03-26 20:57:39 +00:00
|
|
|
semantic_configurations: Optional[
|
|
|
|
Union[SemanticConfiguration, List[SemanticConfiguration]]
|
|
|
|
] = None,
|
2023-12-11 21:53:30 +00:00
|
|
|
scoring_profiles: Optional[List[ScoringProfile]] = None,
|
|
|
|
default_scoring_profile: Optional[str] = None,
|
|
|
|
default_fields: Optional[List[SearchField]] = None,
|
|
|
|
user_agent: Optional[str] = "langchain",
|
|
|
|
cors_options: Optional[CorsOptions] = None,
|
2024-06-05 21:39:54 +00:00
|
|
|
async_: bool = False,
|
|
|
|
) -> Union[SearchClient, AsyncSearchClient]:
|
2023-12-11 21:53:30 +00:00
|
|
|
from azure.core.credentials import AzureKeyCredential
|
|
|
|
from azure.core.exceptions import ResourceNotFoundError
|
|
|
|
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
|
|
|
|
from azure.search.documents import SearchClient
|
2024-06-05 21:39:54 +00:00
|
|
|
from azure.search.documents.aio import SearchClient as AsyncSearchClient
|
2023-12-11 21:53:30 +00:00
|
|
|
from azure.search.documents.indexes import SearchIndexClient
|
|
|
|
from azure.search.documents.indexes.models import (
|
2024-02-13 03:23:35 +00:00
|
|
|
ExhaustiveKnnAlgorithmConfiguration,
|
|
|
|
ExhaustiveKnnParameters,
|
|
|
|
HnswAlgorithmConfiguration,
|
|
|
|
HnswParameters,
|
2023-12-11 21:53:30 +00:00
|
|
|
SearchIndex,
|
|
|
|
SemanticConfiguration,
|
|
|
|
SemanticField,
|
2024-02-13 03:23:35 +00:00
|
|
|
SemanticPrioritizedFields,
|
|
|
|
SemanticSearch,
|
2024-02-16 06:23:52 +00:00
|
|
|
VectorSearch,
|
2024-02-13 03:23:35 +00:00
|
|
|
VectorSearchAlgorithmKind,
|
|
|
|
VectorSearchAlgorithmMetric,
|
|
|
|
VectorSearchProfile,
|
2023-12-11 21:53:30 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
default_fields = default_fields or []
|
|
|
|
if key is None:
|
|
|
|
credential = DefaultAzureCredential()
|
|
|
|
elif key.upper() == "INTERACTIVE":
|
|
|
|
credential = InteractiveBrowserCredential()
|
|
|
|
credential.get_token("https://search.azure.com/.default")
|
|
|
|
else:
|
|
|
|
credential = AzureKeyCredential(key)
|
|
|
|
index_client: SearchIndexClient = SearchIndexClient(
|
|
|
|
endpoint=endpoint, credential=credential, user_agent=user_agent
|
|
|
|
)
|
|
|
|
try:
|
|
|
|
index_client.get_index(name=index_name)
|
|
|
|
except ResourceNotFoundError:
|
|
|
|
# Fields configuration
|
|
|
|
if fields is not None:
|
|
|
|
# Check mandatory fields
|
|
|
|
fields_types = {f.name: f.type for f in fields}
|
|
|
|
mandatory_fields = {df.name: df.type for df in default_fields}
|
|
|
|
# Check for missing keys
|
|
|
|
missing_fields = {
|
|
|
|
key: mandatory_fields[key]
|
|
|
|
for key, value in set(mandatory_fields.items())
|
|
|
|
- set(fields_types.items())
|
|
|
|
}
|
|
|
|
if len(missing_fields) > 0:
|
|
|
|
# Helper for formatting field information for each missing field.
|
|
|
|
def fmt_err(x: str) -> str:
|
|
|
|
return (
|
|
|
|
f"{x} current type: '{fields_types.get(x, 'MISSING')}'. "
|
|
|
|
f"It has to be '{mandatory_fields.get(x)}' or you can point "
|
|
|
|
f"to a different '{mandatory_fields.get(x)}' field name by "
|
|
|
|
f"using the env variable 'AZURESEARCH_FIELDS_{x.upper()}'"
|
|
|
|
)
|
|
|
|
|
|
|
|
error = "\n".join([fmt_err(x) for x in missing_fields])
|
|
|
|
raise ValueError(
|
|
|
|
f"You need to specify at least the following fields "
|
|
|
|
f"{missing_fields} or provide alternative field names in the env "
|
|
|
|
f"variables.\n\n{error}"
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
fields = default_fields
|
|
|
|
# Vector search configuration
|
|
|
|
if vector_search is None:
|
2024-02-13 03:23:35 +00:00
|
|
|
vector_search = VectorSearch(
|
|
|
|
algorithms=[
|
|
|
|
HnswAlgorithmConfiguration(
|
|
|
|
name="default",
|
|
|
|
kind=VectorSearchAlgorithmKind.HNSW,
|
|
|
|
parameters=HnswParameters(
|
|
|
|
m=4,
|
|
|
|
ef_construction=400,
|
|
|
|
ef_search=500,
|
|
|
|
metric=VectorSearchAlgorithmMetric.COSINE,
|
|
|
|
),
|
|
|
|
),
|
|
|
|
ExhaustiveKnnAlgorithmConfiguration(
|
|
|
|
name="default_exhaustive_knn",
|
|
|
|
kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
|
|
|
|
parameters=ExhaustiveKnnParameters(
|
|
|
|
metric=VectorSearchAlgorithmMetric.COSINE
|
|
|
|
),
|
|
|
|
),
|
|
|
|
],
|
|
|
|
profiles=[
|
|
|
|
VectorSearchProfile(
|
|
|
|
name="myHnswProfile",
|
|
|
|
algorithm_configuration_name="default",
|
|
|
|
),
|
|
|
|
VectorSearchProfile(
|
|
|
|
name="myExhaustiveKnnProfile",
|
|
|
|
algorithm_configuration_name="default_exhaustive_knn",
|
|
|
|
),
|
|
|
|
],
|
|
|
|
)
|
2023-12-11 21:53:30 +00:00
|
|
|
|
|
|
|
# Create the semantic settings with the configuration
|
2024-03-26 20:57:39 +00:00
|
|
|
if semantic_configurations:
|
|
|
|
if not isinstance(semantic_configurations, list):
|
|
|
|
semantic_configurations = [semantic_configurations]
|
|
|
|
semantic_search = SemanticSearch(
|
|
|
|
configurations=semantic_configurations,
|
|
|
|
default_configuration_name=semantic_configuration_name,
|
|
|
|
)
|
|
|
|
elif semantic_configuration_name:
|
|
|
|
# use default semantic configuration
|
2024-02-13 03:23:35 +00:00
|
|
|
semantic_configuration = SemanticConfiguration(
|
|
|
|
name=semantic_configuration_name,
|
|
|
|
prioritized_fields=SemanticPrioritizedFields(
|
|
|
|
content_fields=[SemanticField(field_name=FIELDS_CONTENT)],
|
|
|
|
),
|
|
|
|
)
|
|
|
|
semantic_search = SemanticSearch(configurations=[semantic_configuration])
|
2024-03-26 20:57:39 +00:00
|
|
|
else:
|
|
|
|
# don't use semantic search
|
|
|
|
semantic_search = None
|
2024-02-13 03:23:35 +00:00
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
# Create the search index with the semantic settings and vector search
|
|
|
|
index = SearchIndex(
|
|
|
|
name=index_name,
|
|
|
|
fields=fields,
|
|
|
|
vector_search=vector_search,
|
2024-02-13 03:23:35 +00:00
|
|
|
semantic_search=semantic_search,
|
2023-12-11 21:53:30 +00:00
|
|
|
scoring_profiles=scoring_profiles,
|
|
|
|
default_scoring_profile=default_scoring_profile,
|
|
|
|
cors_options=cors_options,
|
|
|
|
)
|
|
|
|
index_client.create_index(index)
|
|
|
|
# Create the search client
|
2024-06-05 21:39:54 +00:00
|
|
|
if not async_:
|
|
|
|
return SearchClient(
|
|
|
|
endpoint=endpoint,
|
|
|
|
index_name=index_name,
|
|
|
|
credential=credential,
|
|
|
|
user_agent=user_agent,
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
return AsyncSearchClient(
|
|
|
|
endpoint=endpoint,
|
|
|
|
index_name=index_name,
|
|
|
|
credential=credential,
|
|
|
|
user_agent=user_agent,
|
|
|
|
)
|
2023-12-11 21:53:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
class AzureSearch(VectorStore):
|
|
|
|
"""`Azure Cognitive Search` vector store."""
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
self,
|
|
|
|
azure_search_endpoint: str,
|
|
|
|
azure_search_key: str,
|
|
|
|
index_name: str,
|
community: Fixing a performance issue with AzureSearch to perform batch embedding (#15594)
- **Description:** Azure Cognitive Search vector DB store performs slow
embedding as it does not utilize the batch embedding functionality. This
PR provide a fix to improve the performance of Azure Search class when
adding documents to the vector search,
- **Issue:** #11313 ,
- **Dependencies:** any dependencies required for this change,
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!
Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.
See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/
If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.
If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
-->
2024-01-12 18:58:55 +00:00
|
|
|
embedding_function: Union[Callable, Embeddings],
|
2023-12-11 21:53:30 +00:00
|
|
|
search_type: str = "hybrid",
|
|
|
|
semantic_configuration_name: Optional[str] = None,
|
|
|
|
fields: Optional[List[SearchField]] = None,
|
|
|
|
vector_search: Optional[VectorSearch] = None,
|
2024-03-26 20:57:39 +00:00
|
|
|
semantic_configurations: Optional[
|
|
|
|
Union[SemanticConfiguration, List[SemanticConfiguration]]
|
|
|
|
] = None,
|
2023-12-11 21:53:30 +00:00
|
|
|
scoring_profiles: Optional[List[ScoringProfile]] = None,
|
|
|
|
default_scoring_profile: Optional[str] = None,
|
|
|
|
cors_options: Optional[CorsOptions] = None,
|
2024-05-22 20:36:06 +00:00
|
|
|
*,
|
|
|
|
vector_search_dimensions: Optional[int] = None,
|
2023-12-11 21:53:30 +00:00
|
|
|
**kwargs: Any,
|
|
|
|
):
|
2024-06-05 21:39:54 +00:00
|
|
|
try:
|
|
|
|
from azure.search.documents.indexes.models import (
|
|
|
|
SearchableField,
|
|
|
|
SearchField,
|
|
|
|
SearchFieldDataType,
|
|
|
|
SimpleField,
|
|
|
|
)
|
|
|
|
except ImportError as e:
|
|
|
|
raise ImportError(
|
|
|
|
"Unable to import azure.search.documents. Please install with "
|
|
|
|
"`pip install -U azure-search-documents`."
|
|
|
|
) from e
|
2023-12-11 21:53:30 +00:00
|
|
|
|
|
|
|
"""Initialize with necessary components."""
|
|
|
|
# Initialize base class
|
|
|
|
self.embedding_function = embedding_function
|
community: Fixing a performance issue with AzureSearch to perform batch embedding (#15594)
- **Description:** Azure Cognitive Search vector DB store performs slow
embedding as it does not utilize the batch embedding functionality. This
PR provide a fix to improve the performance of Azure Search class when
adding documents to the vector search,
- **Issue:** #11313 ,
- **Dependencies:** any dependencies required for this change,
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!
Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.
See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/
If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.
If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
-->
2024-01-12 18:58:55 +00:00
|
|
|
|
|
|
|
if isinstance(self.embedding_function, Embeddings):
|
|
|
|
self.embed_query = self.embedding_function.embed_query
|
|
|
|
else:
|
|
|
|
self.embed_query = self.embedding_function
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
default_fields = [
|
|
|
|
SimpleField(
|
|
|
|
name=FIELDS_ID,
|
|
|
|
type=SearchFieldDataType.String,
|
|
|
|
key=True,
|
|
|
|
filterable=True,
|
|
|
|
),
|
|
|
|
SearchableField(
|
|
|
|
name=FIELDS_CONTENT,
|
|
|
|
type=SearchFieldDataType.String,
|
|
|
|
),
|
|
|
|
SearchField(
|
|
|
|
name=FIELDS_CONTENT_VECTOR,
|
|
|
|
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
|
|
|
|
searchable=True,
|
2024-05-22 20:36:06 +00:00
|
|
|
vector_search_dimensions=vector_search_dimensions
|
|
|
|
or len(self.embed_query("Text")),
|
2024-02-16 06:23:52 +00:00
|
|
|
vector_search_profile_name="myHnswProfile",
|
2023-12-11 21:53:30 +00:00
|
|
|
),
|
|
|
|
SearchableField(
|
|
|
|
name=FIELDS_METADATA,
|
|
|
|
type=SearchFieldDataType.String,
|
|
|
|
),
|
|
|
|
]
|
|
|
|
user_agent = "langchain"
|
|
|
|
if "user_agent" in kwargs and kwargs["user_agent"]:
|
|
|
|
user_agent += " " + kwargs["user_agent"]
|
|
|
|
self.client = _get_search_client(
|
|
|
|
azure_search_endpoint,
|
|
|
|
azure_search_key,
|
|
|
|
index_name,
|
|
|
|
semantic_configuration_name=semantic_configuration_name,
|
|
|
|
fields=fields,
|
|
|
|
vector_search=vector_search,
|
2024-02-13 03:23:35 +00:00
|
|
|
semantic_configurations=semantic_configurations,
|
2023-12-11 21:53:30 +00:00
|
|
|
scoring_profiles=scoring_profiles,
|
|
|
|
default_scoring_profile=default_scoring_profile,
|
|
|
|
default_fields=default_fields,
|
|
|
|
user_agent=user_agent,
|
|
|
|
cors_options=cors_options,
|
|
|
|
)
|
|
|
|
self.search_type = search_type
|
|
|
|
self.semantic_configuration_name = semantic_configuration_name
|
|
|
|
self.fields = fields if fields else default_fields
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
self._azure_search_endpoint = azure_search_endpoint
|
|
|
|
self._azure_search_key = azure_search_key
|
|
|
|
self._index_name = index_name
|
|
|
|
self._semantic_configuration_name = semantic_configuration_name
|
|
|
|
self._fields = fields
|
|
|
|
self._vector_search = vector_search
|
|
|
|
self._semantic_configurations = semantic_configurations
|
|
|
|
self._scoring_profiles = scoring_profiles
|
|
|
|
self._default_scoring_profile = default_scoring_profile
|
|
|
|
self._default_fields = default_fields
|
|
|
|
self._user_agent = user_agent
|
|
|
|
self._cors_options = cors_options
|
|
|
|
|
|
|
|
def _async_client(self) -> AsyncSearchClient:
|
|
|
|
return _get_search_client(
|
|
|
|
self._azure_search_endpoint,
|
|
|
|
self._azure_search_key,
|
|
|
|
self._index_name,
|
|
|
|
semantic_configuration_name=self._semantic_configuration_name,
|
|
|
|
fields=self._fields,
|
|
|
|
vector_search=self._vector_search,
|
|
|
|
semantic_configurations=self._semantic_configurations,
|
|
|
|
scoring_profiles=self._scoring_profiles,
|
|
|
|
default_scoring_profile=self._default_scoring_profile,
|
|
|
|
default_fields=self._default_fields,
|
|
|
|
user_agent=self._user_agent,
|
|
|
|
cors_options=self._cors_options,
|
|
|
|
async_=True,
|
|
|
|
)
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
@property
|
|
|
|
def embeddings(self) -> Optional[Embeddings]:
|
|
|
|
# TODO: Support embedding object directly
|
2024-06-05 21:39:54 +00:00
|
|
|
return (
|
|
|
|
self.embedding_function
|
|
|
|
if isinstance(self.embedding_function, Embeddings)
|
|
|
|
else None
|
|
|
|
)
|
|
|
|
|
|
|
|
async def _aembed_query(self, text: str) -> List[float]:
|
|
|
|
if self.embeddings:
|
|
|
|
return await self.embeddings.aembed_query(text)
|
|
|
|
else:
|
|
|
|
return cast(Callable, self.embedding_function)(text)
|
2023-12-11 21:53:30 +00:00
|
|
|
|
|
|
|
def add_texts(
|
|
|
|
self,
|
|
|
|
texts: Iterable[str],
|
|
|
|
metadatas: Optional[List[dict]] = None,
|
2024-06-05 21:39:54 +00:00
|
|
|
*,
|
|
|
|
keys: Optional[List[str]] = None,
|
2023-12-11 21:53:30 +00:00
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[str]:
|
|
|
|
"""Add texts data to an existing index."""
|
community: Fixing a performance issue with AzureSearch to perform batch embedding (#15594)
- **Description:** Azure Cognitive Search vector DB store performs slow
embedding as it does not utilize the batch embedding functionality. This
PR provide a fix to improve the performance of Azure Search class when
adding documents to the vector search,
- **Issue:** #11313 ,
- **Dependencies:** any dependencies required for this change,
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!
Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.
See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/
If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.
If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
-->
2024-01-12 18:58:55 +00:00
|
|
|
# batching support if embedding function is an Embeddings object
|
|
|
|
if isinstance(self.embedding_function, Embeddings):
|
|
|
|
try:
|
2024-06-05 21:39:54 +00:00
|
|
|
embeddings = self.embedding_function.embed_documents(list(texts))
|
community: Fixing a performance issue with AzureSearch to perform batch embedding (#15594)
- **Description:** Azure Cognitive Search vector DB store performs slow
embedding as it does not utilize the batch embedding functionality. This
PR provide a fix to improve the performance of Azure Search class when
adding documents to the vector search,
- **Issue:** #11313 ,
- **Dependencies:** any dependencies required for this change,
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!
Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.
See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/
If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.
If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
-->
2024-01-12 18:58:55 +00:00
|
|
|
except NotImplementedError:
|
|
|
|
embeddings = [self.embedding_function.embed_query(x) for x in texts]
|
|
|
|
else:
|
|
|
|
embeddings = [self.embedding_function(x) for x in texts]
|
|
|
|
|
|
|
|
if len(embeddings) == 0:
|
|
|
|
logger.debug("Nothing to insert, skipping.")
|
|
|
|
return []
|
|
|
|
|
2024-05-22 20:36:06 +00:00
|
|
|
return self.add_embeddings(zip(texts, embeddings), metadatas, keys=keys)
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def aadd_texts(
|
|
|
|
self,
|
|
|
|
texts: Iterable[str],
|
|
|
|
metadatas: Optional[List[dict]] = None,
|
|
|
|
*,
|
|
|
|
keys: Optional[List[str]] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[str]:
|
|
|
|
if isinstance(self.embedding_function, Embeddings):
|
|
|
|
try:
|
|
|
|
embeddings = await self.embedding_function.aembed_documents(list(texts))
|
|
|
|
except NotImplementedError:
|
|
|
|
embeddings = [
|
|
|
|
await self.embedding_function.aembed_query(x) for x in texts
|
|
|
|
]
|
|
|
|
else:
|
|
|
|
embeddings = [self.embedding_function(x) for x in texts]
|
|
|
|
|
|
|
|
if len(embeddings) == 0:
|
|
|
|
logger.debug("Nothing to insert, skipping.")
|
|
|
|
return []
|
|
|
|
|
|
|
|
return await self.aadd_embeddings(zip(texts, embeddings), metadatas, keys=keys)
|
|
|
|
|
2024-05-22 20:36:06 +00:00
|
|
|
def add_embeddings(
|
|
|
|
self,
|
|
|
|
text_embeddings: Iterable[Tuple[str, List[float]]],
|
|
|
|
metadatas: Optional[List[dict]] = None,
|
|
|
|
*,
|
|
|
|
keys: Optional[List[str]] = None,
|
|
|
|
) -> List[str]:
|
|
|
|
"""Add embeddings to an existing index."""
|
|
|
|
ids = []
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
# Write data to index
|
|
|
|
data = []
|
2024-05-22 20:36:06 +00:00
|
|
|
for i, (text, embedding) in enumerate(text_embeddings):
|
2023-12-11 21:53:30 +00:00
|
|
|
# Use provided key otherwise use default key
|
|
|
|
key = keys[i] if keys else str(uuid.uuid4())
|
|
|
|
# Encoding key for Azure Search valid characters
|
|
|
|
key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
|
|
|
|
metadata = metadatas[i] if metadatas else {}
|
|
|
|
# Add data to index
|
|
|
|
# Additional metadata to fields mapping
|
|
|
|
doc = {
|
|
|
|
"@search.action": "upload",
|
|
|
|
FIELDS_ID: key,
|
|
|
|
FIELDS_CONTENT: text,
|
2024-05-22 20:36:06 +00:00
|
|
|
FIELDS_CONTENT_VECTOR: np.array(embedding, dtype=np.float32).tolist(),
|
2023-12-11 21:53:30 +00:00
|
|
|
FIELDS_METADATA: json.dumps(metadata),
|
|
|
|
}
|
|
|
|
if metadata:
|
|
|
|
additional_fields = {
|
|
|
|
k: v
|
|
|
|
for k, v in metadata.items()
|
|
|
|
if k in [x.name for x in self.fields]
|
|
|
|
}
|
|
|
|
doc.update(additional_fields)
|
|
|
|
data.append(doc)
|
|
|
|
ids.append(key)
|
|
|
|
# Upload data in batches
|
|
|
|
if len(data) == MAX_UPLOAD_BATCH_SIZE:
|
|
|
|
response = self.client.upload_documents(documents=data)
|
|
|
|
# Check if all documents were successfully uploaded
|
2024-05-22 20:36:06 +00:00
|
|
|
if not all(r.succeeded for r in response):
|
2023-12-11 21:53:30 +00:00
|
|
|
raise Exception(response)
|
|
|
|
# Reset data
|
|
|
|
data = []
|
|
|
|
|
|
|
|
# Considering case where data is an exact multiple of batch-size entries
|
|
|
|
if len(data) == 0:
|
|
|
|
return ids
|
|
|
|
|
|
|
|
# Upload data to index
|
|
|
|
response = self.client.upload_documents(documents=data)
|
|
|
|
# Check if all documents were successfully uploaded
|
2024-05-22 20:36:06 +00:00
|
|
|
if all(r.succeeded for r in response):
|
2023-12-11 21:53:30 +00:00
|
|
|
return ids
|
|
|
|
else:
|
|
|
|
raise Exception(response)
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def aadd_embeddings(
|
|
|
|
self,
|
|
|
|
text_embeddings: Iterable[Tuple[str, List[float]]],
|
|
|
|
metadatas: Optional[List[dict]] = None,
|
|
|
|
*,
|
|
|
|
keys: Optional[List[str]] = None,
|
|
|
|
) -> List[str]:
|
|
|
|
"""Add embeddings to an existing index."""
|
|
|
|
ids = []
|
|
|
|
|
|
|
|
# Write data to index
|
|
|
|
data = []
|
|
|
|
for i, (text, embedding) in enumerate(text_embeddings):
|
|
|
|
# Use provided key otherwise use default key
|
|
|
|
key = keys[i] if keys else str(uuid.uuid4())
|
|
|
|
# Encoding key for Azure Search valid characters
|
|
|
|
key = base64.urlsafe_b64encode(bytes(key, "utf-8")).decode("ascii")
|
|
|
|
metadata = metadatas[i] if metadatas else {}
|
|
|
|
# Add data to index
|
|
|
|
# Additional metadata to fields mapping
|
|
|
|
doc = {
|
|
|
|
"@search.action": "upload",
|
|
|
|
FIELDS_ID: key,
|
|
|
|
FIELDS_CONTENT: text,
|
|
|
|
FIELDS_CONTENT_VECTOR: np.array(embedding, dtype=np.float32).tolist(),
|
|
|
|
FIELDS_METADATA: json.dumps(metadata),
|
|
|
|
}
|
|
|
|
if metadata:
|
|
|
|
additional_fields = {
|
|
|
|
k: v
|
|
|
|
for k, v in metadata.items()
|
|
|
|
if k in [x.name for x in self.fields]
|
|
|
|
}
|
|
|
|
doc.update(additional_fields)
|
|
|
|
data.append(doc)
|
|
|
|
ids.append(key)
|
|
|
|
# Upload data in batches
|
|
|
|
if len(data) == MAX_UPLOAD_BATCH_SIZE:
|
|
|
|
async with self._async_client() as async_client:
|
|
|
|
response = await async_client.upload_documents(documents=data)
|
|
|
|
# Check if all documents were successfully uploaded
|
|
|
|
if not all(r.succeeded for r in response):
|
|
|
|
raise Exception(response)
|
|
|
|
# Reset data
|
|
|
|
data = []
|
|
|
|
|
|
|
|
# Considering case where data is an exact multiple of batch-size entries
|
|
|
|
if len(data) == 0:
|
|
|
|
return ids
|
|
|
|
|
|
|
|
# Upload data to index
|
|
|
|
async with self._async_client() as async_client:
|
|
|
|
response = await async_client.upload_documents(documents=data)
|
|
|
|
# Check if all documents were successfully uploaded
|
|
|
|
if all(r.succeeded for r in response):
|
|
|
|
return ids
|
|
|
|
else:
|
|
|
|
raise Exception(response)
|
|
|
|
|
2024-04-30 23:46:18 +00:00
|
|
|
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> bool:
|
|
|
|
"""Delete by vector ID.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
ids: List of ids to delete.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
bool: True if deletion is successful,
|
|
|
|
False otherwise.
|
|
|
|
"""
|
|
|
|
if ids:
|
2024-06-03 15:55:06 +00:00
|
|
|
res = self.client.delete_documents([{FIELDS_ID: i} for i in ids])
|
2024-04-30 23:46:18 +00:00
|
|
|
return len(res) > 0
|
|
|
|
else:
|
|
|
|
return False
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def adelete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> bool:
|
|
|
|
"""Delete by vector ID.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
ids: List of ids to delete.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
bool: True if deletion is successful,
|
|
|
|
False otherwise.
|
|
|
|
"""
|
|
|
|
if ids:
|
|
|
|
async with self._async_client() as async_client:
|
|
|
|
res = await async_client.delete_documents([{"id": i} for i in ids])
|
|
|
|
return len(res) > 0
|
|
|
|
else:
|
|
|
|
return False
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
def similarity_search(
|
2024-06-05 21:39:54 +00:00
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
*,
|
|
|
|
search_type: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
2023-12-11 21:53:30 +00:00
|
|
|
) -> List[Document]:
|
2024-06-05 21:39:54 +00:00
|
|
|
search_type = search_type or self.search_type
|
2023-12-11 21:53:30 +00:00
|
|
|
if search_type == "similarity":
|
|
|
|
docs = self.vector_search(query, k=k, **kwargs)
|
|
|
|
elif search_type == "hybrid":
|
|
|
|
docs = self.hybrid_search(query, k=k, **kwargs)
|
|
|
|
elif search_type == "semantic_hybrid":
|
|
|
|
docs = self.semantic_hybrid_search(query, k=k, **kwargs)
|
|
|
|
else:
|
|
|
|
raise ValueError(f"search_type of {search_type} not allowed.")
|
|
|
|
return docs
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
def similarity_search_with_score(
|
|
|
|
self, query: str, *, k: int = 4, **kwargs: Any
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""Run similarity search with distance."""
|
|
|
|
search_type = kwargs.get("search_type", self.search_type)
|
|
|
|
if search_type == "similarity":
|
|
|
|
return self.vector_search_with_score(query, k=k, **kwargs)
|
|
|
|
elif search_type == "hybrid":
|
|
|
|
return self.hybrid_search_with_score(query, k=k, **kwargs)
|
|
|
|
elif search_type == "semantic_hybrid":
|
|
|
|
return self.semantic_hybrid_search_with_score(query, k=k, **kwargs)
|
|
|
|
else:
|
|
|
|
raise ValueError(f"search_type of {search_type} not allowed.")
|
|
|
|
|
|
|
|
async def asimilarity_search(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
*,
|
|
|
|
search_type: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Document]:
|
|
|
|
search_type = search_type or self.search_type
|
|
|
|
if search_type == "similarity":
|
|
|
|
docs = await self.avector_search(query, k=k, **kwargs)
|
|
|
|
elif search_type == "hybrid":
|
|
|
|
docs = await self.ahybrid_search(query, k=k, **kwargs)
|
|
|
|
elif search_type == "semantic_hybrid":
|
|
|
|
docs = await self.asemantic_hybrid_search(query, k=k, **kwargs)
|
|
|
|
else:
|
|
|
|
raise ValueError(f"search_type of {search_type} not allowed.")
|
|
|
|
return docs
|
|
|
|
|
|
|
|
async def asimilarity_search_with_score(
|
|
|
|
self, query: str, *, k: int = 4, **kwargs: Any
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""Run similarity search with distance."""
|
|
|
|
search_type = kwargs.get("search_type", self.search_type)
|
|
|
|
if search_type == "similarity":
|
|
|
|
return await self.avector_search_with_score(query, k=k, **kwargs)
|
|
|
|
elif search_type == "hybrid":
|
|
|
|
return await self.ahybrid_search_with_score(query, k=k, **kwargs)
|
|
|
|
elif search_type == "semantic_hybrid":
|
|
|
|
return await self.asemantic_hybrid_search_with_score(query, k=k, **kwargs)
|
|
|
|
else:
|
|
|
|
raise ValueError(f"search_type of {search_type} not allowed.")
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
def similarity_search_with_relevance_scores(
|
2024-06-05 21:39:54 +00:00
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
*,
|
|
|
|
score_threshold: Optional[float] = None,
|
|
|
|
**kwargs: Any,
|
2023-12-11 21:53:30 +00:00
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
result = self.vector_search_with_score(query, k=k, **kwargs)
|
|
|
|
return (
|
|
|
|
result
|
|
|
|
if score_threshold is None
|
|
|
|
else [r for r in result if r[1] >= score_threshold]
|
|
|
|
)
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def asimilarity_search_with_relevance_scores(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
*,
|
|
|
|
score_threshold: Optional[float] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
result = await self.avector_search_with_score(query, k=k, **kwargs)
|
|
|
|
return (
|
|
|
|
result
|
|
|
|
if score_threshold is None
|
|
|
|
else [r for r in result if r[1] >= score_threshold]
|
|
|
|
)
|
|
|
|
|
|
|
|
def vector_search(
|
|
|
|
self, query: str, k: int = 4, *, filters: Optional[str] = None, **kwargs: Any
|
|
|
|
) -> List[Document]:
|
|
|
|
"""
|
|
|
|
Returns the most similar indexed documents to the query text.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): The query text for which to find similar documents.
|
|
|
|
k (int): The number of documents to return. Default is 4.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Document]: A list of documents that are most similar to the query text.
|
|
|
|
"""
|
|
|
|
docs_and_scores = self.vector_search_with_score(query, k=k, filters=filters)
|
|
|
|
return [doc for doc, _ in docs_and_scores]
|
|
|
|
|
|
|
|
async def avector_search(
|
|
|
|
self, query: str, k: int = 4, *, filters: Optional[str] = None, **kwargs: Any
|
|
|
|
) -> List[Document]:
|
2023-12-11 21:53:30 +00:00
|
|
|
"""
|
|
|
|
Returns the most similar indexed documents to the query text.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): The query text for which to find similar documents.
|
|
|
|
k (int): The number of documents to return. Default is 4.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Document]: A list of documents that are most similar to the query text.
|
|
|
|
"""
|
2024-06-05 21:39:54 +00:00
|
|
|
docs_and_scores = await self.avector_search_with_score(
|
|
|
|
query, k=k, filters=filters
|
2023-12-11 21:53:30 +00:00
|
|
|
)
|
|
|
|
return [doc for doc, _ in docs_and_scores]
|
|
|
|
|
|
|
|
def vector_search_with_score(
|
2024-05-22 20:36:06 +00:00
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
filters: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
2023-12-11 21:53:30 +00:00
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""Return docs most similar to query.
|
|
|
|
|
|
|
|
Args:
|
2024-05-22 20:36:06 +00:00
|
|
|
query (str): Text to look up documents similar to.
|
|
|
|
k (int, optional): Number of Documents to return. Defaults to 4.
|
|
|
|
filters (str, optional): Filtering expression. Defaults to None.
|
2023-12-11 21:53:30 +00:00
|
|
|
|
|
|
|
Returns:
|
2024-05-22 20:36:06 +00:00
|
|
|
List[Tuple[Document, float]]: List of Documents most similar
|
|
|
|
to the query and score for each
|
2023-12-11 21:53:30 +00:00
|
|
|
"""
|
2024-05-22 20:36:06 +00:00
|
|
|
embedding = self.embed_query(query)
|
|
|
|
results = self._simple_search(embedding, "", k, filters=filters, **kwargs)
|
2024-02-13 03:23:35 +00:00
|
|
|
|
2024-05-22 20:36:06 +00:00
|
|
|
return _results_to_documents(results)
|
2023-12-11 21:53:30 +00:00
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def avector_search_with_score(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
filters: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""Return docs most similar to query.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): Text to look up documents similar to.
|
|
|
|
k (int, optional): Number of Documents to return. Defaults to 4.
|
|
|
|
filters (str, optional): Filtering expression. Defaults to None.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Tuple[Document, float]]: List of Documents most similar
|
|
|
|
to the query and score for each
|
|
|
|
"""
|
|
|
|
embedding = await self._aembed_query(query)
|
|
|
|
docs, scores, _ = await self._asimple_search(
|
|
|
|
embedding, "", k, filters=filters, **kwargs
|
|
|
|
)
|
|
|
|
|
|
|
|
return list(zip(docs, scores))
|
|
|
|
|
2024-05-22 20:36:06 +00:00
|
|
|
def max_marginal_relevance_search_with_score(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
fetch_k: int = 20,
|
|
|
|
lambda_mult: float = 0.5,
|
|
|
|
*,
|
|
|
|
filters: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""Perform a search and return results that are reordered by MMR.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): Text to look up documents similar to.
|
|
|
|
k (int, optional): How many results to give. Defaults to 4.
|
|
|
|
fetch_k (int, optional): Total results to select k from.
|
|
|
|
Defaults to 20.
|
|
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
Defaults to 0.5
|
|
|
|
filters (str, optional): Filtering expression. Defaults to None.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Tuple[Document, float]]: List of Documents most similar
|
|
|
|
to the query and score for each
|
|
|
|
"""
|
|
|
|
embedding = self.embed_query(query)
|
|
|
|
results = self._simple_search(embedding, "", fetch_k, filters=filters, **kwargs)
|
|
|
|
|
|
|
|
return _reorder_results_with_maximal_marginal_relevance(
|
|
|
|
results, query_embedding=np.array(embedding), lambda_mult=lambda_mult, k=k
|
2023-12-11 21:53:30 +00:00
|
|
|
)
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def amax_marginal_relevance_search_with_score(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
fetch_k: int = 20,
|
|
|
|
lambda_mult: float = 0.5,
|
|
|
|
*,
|
|
|
|
filters: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""Perform a search and return results that are reordered by MMR.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): Text to look up documents similar to.
|
|
|
|
k (int, optional): How many results to give. Defaults to 4.
|
|
|
|
fetch_k (int, optional): Total results to select k from.
|
|
|
|
Defaults to 20.
|
|
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
Defaults to 0.5
|
|
|
|
filters (str, optional): Filtering expression. Defaults to None.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Tuple[Document, float]]: List of Documents most similar
|
|
|
|
to the query and score for each
|
|
|
|
"""
|
|
|
|
embedding = await self._aembed_query(query)
|
|
|
|
docs, scores, vectors = await self._asimple_search(
|
|
|
|
embedding, "", fetch_k, filters=filters, **kwargs
|
|
|
|
)
|
|
|
|
|
|
|
|
return await self._areorder_results_with_maximal_marginal_relevance(
|
|
|
|
docs,
|
|
|
|
scores,
|
|
|
|
vectors,
|
|
|
|
query_embedding=np.array(embedding),
|
|
|
|
lambda_mult=lambda_mult,
|
|
|
|
k=k,
|
|
|
|
)
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
def hybrid_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
|
|
|
|
"""
|
|
|
|
Returns the most similar indexed documents to the query text.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): The query text for which to find similar documents.
|
|
|
|
k (int): The number of documents to return. Default is 4.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Document]: A list of documents that are most similar to the query text.
|
|
|
|
"""
|
2024-05-22 20:36:06 +00:00
|
|
|
docs_and_scores = self.hybrid_search_with_score(query, k=k, **kwargs)
|
2023-12-11 21:53:30 +00:00
|
|
|
return [doc for doc, _ in docs_and_scores]
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def ahybrid_search(
|
|
|
|
self, query: str, k: int = 4, **kwargs: Any
|
|
|
|
) -> List[Document]:
|
|
|
|
"""
|
|
|
|
Returns the most similar indexed documents to the query text.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): The query text for which to find similar documents.
|
|
|
|
k (int): The number of documents to return. Default is 4.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Document]: A list of documents that are most similar to the query text.
|
|
|
|
"""
|
|
|
|
docs_and_scores = await self.ahybrid_search_with_score(query, k=k, **kwargs)
|
|
|
|
return [doc for doc, _ in docs_and_scores]
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
def hybrid_search_with_score(
|
2024-05-22 20:36:06 +00:00
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
filters: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
2023-12-11 21:53:30 +00:00
|
|
|
) -> List[Tuple[Document, float]]:
|
2024-04-24 19:14:33 +00:00
|
|
|
"""Return docs most similar to query with a hybrid query.
|
2023-12-11 21:53:30 +00:00
|
|
|
|
|
|
|
Args:
|
|
|
|
query: Text to look up documents similar to.
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List of Documents most similar to the query and score for each
|
|
|
|
"""
|
|
|
|
|
2024-05-22 20:36:06 +00:00
|
|
|
embedding = self.embed_query(query)
|
|
|
|
results = self._simple_search(embedding, query, k, filters=filters, **kwargs)
|
|
|
|
|
|
|
|
return _results_to_documents(results)
|
2023-12-11 21:53:30 +00:00
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def ahybrid_search_with_score(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
filters: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""Return docs most similar to query with a hybrid query.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query: Text to look up documents similar to.
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List of Documents most similar to the query and score for each
|
|
|
|
"""
|
|
|
|
|
|
|
|
embedding = await self._aembed_query(query)
|
|
|
|
docs, scores, _ = await self._asimple_search(
|
|
|
|
embedding, query, k, filters=filters, **kwargs
|
|
|
|
)
|
|
|
|
|
|
|
|
return list(zip(docs, scores))
|
|
|
|
|
2024-04-29 16:11:44 +00:00
|
|
|
def hybrid_search_with_relevance_scores(
|
2024-06-05 21:39:54 +00:00
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
*,
|
|
|
|
score_threshold: Optional[float] = None,
|
|
|
|
**kwargs: Any,
|
2024-04-29 16:11:44 +00:00
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
result = self.hybrid_search_with_score(query, k=k, **kwargs)
|
|
|
|
return (
|
|
|
|
result
|
|
|
|
if score_threshold is None
|
|
|
|
else [r for r in result if r[1] >= score_threshold]
|
|
|
|
)
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def ahybrid_search_with_relevance_scores(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
*,
|
|
|
|
score_threshold: Optional[float] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
result = await self.ahybrid_search_with_score(query, k=k, **kwargs)
|
|
|
|
return (
|
|
|
|
result
|
|
|
|
if score_threshold is None
|
|
|
|
else [r for r in result if r[1] >= score_threshold]
|
|
|
|
)
|
|
|
|
|
2024-05-22 20:36:06 +00:00
|
|
|
def hybrid_max_marginal_relevance_search_with_score(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
fetch_k: int = 20,
|
|
|
|
lambda_mult: float = 0.5,
|
|
|
|
*,
|
|
|
|
filters: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""Return docs most similar to query with a hybrid query
|
|
|
|
and reorder results by MMR.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): Text to look up documents similar to.
|
|
|
|
k (int, optional): Number of Documents to return. Defaults to 4.
|
|
|
|
fetch_k (int, optional): Total results to select k from.
|
|
|
|
Defaults to 20.
|
|
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
Defaults to 0.5
|
|
|
|
filters (str, optional): Filtering expression. Defaults to None.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List of Documents most similar to the query and score for each
|
|
|
|
"""
|
|
|
|
|
|
|
|
embedding = self.embed_query(query)
|
|
|
|
results = self._simple_search(
|
|
|
|
embedding, query, fetch_k, filters=filters, **kwargs
|
|
|
|
)
|
|
|
|
|
|
|
|
return _reorder_results_with_maximal_marginal_relevance(
|
|
|
|
results, query_embedding=np.array(embedding), lambda_mult=lambda_mult, k=k
|
|
|
|
)
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def ahybrid_max_marginal_relevance_search_with_score(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
fetch_k: int = 20,
|
|
|
|
lambda_mult: float = 0.5,
|
|
|
|
*,
|
|
|
|
filters: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""Return docs most similar to query with a hybrid query
|
|
|
|
and reorder results by MMR.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): Text to look up documents similar to.
|
|
|
|
k (int, optional): Number of Documents to return. Defaults to 4.
|
|
|
|
fetch_k (int, optional): Total results to select k from.
|
|
|
|
Defaults to 20.
|
|
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
Defaults to 0.5
|
|
|
|
filters (str, optional): Filtering expression. Defaults to None.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List of Documents most similar to the query and score for each
|
|
|
|
"""
|
|
|
|
|
|
|
|
embedding = await self._aembed_query(query)
|
|
|
|
docs, scores, vectors = await self._asimple_search(
|
|
|
|
embedding, query, fetch_k, filters=filters, **kwargs
|
|
|
|
)
|
|
|
|
|
|
|
|
return await self._areorder_results_with_maximal_marginal_relevance(
|
|
|
|
docs,
|
|
|
|
scores,
|
|
|
|
vectors,
|
|
|
|
query_embedding=np.array(embedding),
|
|
|
|
lambda_mult=lambda_mult,
|
|
|
|
k=k,
|
|
|
|
)
|
|
|
|
|
2024-05-22 20:36:06 +00:00
|
|
|
def _simple_search(
|
|
|
|
self,
|
|
|
|
embedding: List[float],
|
|
|
|
text_query: str,
|
|
|
|
k: int,
|
|
|
|
*,
|
|
|
|
filters: Optional[str] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> SearchItemPaged[dict]:
|
|
|
|
"""Perform vector or hybrid search in the Azure search index.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
embedding: A vector embedding to search in the vector space.
|
|
|
|
text_query: A full-text search query expression;
|
|
|
|
Use "*" or omit this parameter to perform only vector search.
|
|
|
|
k: Number of documents to return.
|
|
|
|
filters: Filtering expression.
|
|
|
|
Returns:
|
|
|
|
Search items
|
|
|
|
"""
|
|
|
|
from azure.search.documents.models import VectorizedQuery
|
|
|
|
|
|
|
|
return self.client.search(
|
|
|
|
search_text=text_query,
|
|
|
|
vector_queries=[
|
|
|
|
VectorizedQuery(
|
|
|
|
vector=np.array(embedding, dtype=np.float32).tolist(),
|
|
|
|
k_nearest_neighbors=k,
|
|
|
|
fields=FIELDS_CONTENT_VECTOR,
|
|
|
|
)
|
|
|
|
],
|
|
|
|
filter=filters,
|
|
|
|
top=k,
|
|
|
|
**kwargs,
|
|
|
|
)
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
    async def _asimple_search(
        self,
        embedding: List[float],
        text_query: str,
        k: int,
        *,
        filters: Optional[str] = None,
        **kwargs: Any,
    ) -> Tuple[List[Document], List[float], List[List[float]]]:
        """Perform vector or hybrid search in the Azure search index (async).

        Args:
            embedding: A vector embedding to search in the vector space.
            text_query: A full-text search query expression;
                Use "*" or omit this parameter to perform only vector search.
            k: Number of documents to return.
            filters: Filtering expression.

        Returns:
            A ``(documents, scores, content_vectors)`` triple of parallel lists,
            one entry per search hit.

        Raises:
            ValueError: If the search returned no results.
        """
        # Imported lazily so the module can load without the azure SDK installed.
        from azure.search.documents.models import VectorizedQuery

        async with self._async_client() as async_client:
            results = await async_client.search(
                search_text=text_query,
                vector_queries=[
                    VectorizedQuery(
                        # Azure expects the vector as a plain list of float32s.
                        vector=np.array(embedding, dtype=np.float32).tolist(),
                        k_nearest_neighbors=k,
                        fields=FIELDS_CONTENT_VECTOR,
                    )
                ],
                filter=filters,
                top=k,
                **kwargs,
            )
            # Materialize the async iterator while the client is still open.
            docs = [
                (
                    _result_to_document(result),
                    float(result["@search.score"]),
                    result[FIELDS_CONTENT_VECTOR],
                )
                async for result in results
            ]
            # NOTE(review): the sync path returns an empty list for no hits;
            # this raises instead — confirm callers rely on that difference.
            if not docs:
                raise ValueError(f"No {docs=}")
            # Transpose [(doc, score, vector), ...] into three parallel lists.
            documents, scores, vectors = map(list, zip(*docs))
            return documents, scores, vectors
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
def semantic_hybrid_search(
|
|
|
|
self, query: str, k: int = 4, **kwargs: Any
|
|
|
|
) -> List[Document]:
|
|
|
|
"""
|
|
|
|
Returns the most similar indexed documents to the query text.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): The query text for which to find similar documents.
|
|
|
|
k (int): The number of documents to return. Default is 4.
|
2024-05-22 20:36:06 +00:00
|
|
|
filters: Filtering expression.
|
2023-12-11 21:53:30 +00:00
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Document]: A list of documents that are most similar to the query text.
|
|
|
|
"""
|
|
|
|
docs_and_scores = self.semantic_hybrid_search_with_score_and_rerank(
|
2024-05-22 20:36:06 +00:00
|
|
|
query, k=k, **kwargs
|
2023-12-11 21:53:30 +00:00
|
|
|
)
|
|
|
|
return [doc for doc, _, _ in docs_and_scores]
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def asemantic_hybrid_search(
|
|
|
|
self, query: str, k: int = 4, **kwargs: Any
|
|
|
|
) -> List[Document]:
|
|
|
|
"""
|
|
|
|
Returns the most similar indexed documents to the query text.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): The query text for which to find similar documents.
|
|
|
|
k (int): The number of documents to return. Default is 4.
|
|
|
|
filters: Filtering expression.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Document]: A list of documents that are most similar to the query text.
|
|
|
|
"""
|
|
|
|
docs_and_scores = await self.asemantic_hybrid_search_with_score_and_rerank(
|
|
|
|
query, k=k, **kwargs
|
|
|
|
)
|
|
|
|
return [doc for doc, _, _ in docs_and_scores]
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
def semantic_hybrid_search_with_score(
|
2024-05-16 19:54:32 +00:00
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
score_type: Literal["score", "reranker_score"] = "score",
|
2024-06-05 21:39:54 +00:00
|
|
|
*,
|
|
|
|
score_threshold: Optional[float] = None,
|
2024-05-16 19:54:32 +00:00
|
|
|
**kwargs: Any,
|
2023-12-11 21:53:30 +00:00
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""
|
|
|
|
Returns the most similar indexed documents to the query text.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): The query text for which to find similar documents.
|
|
|
|
k (int): The number of documents to return. Default is 4.
|
2024-05-16 19:54:32 +00:00
|
|
|
score_type: Must either be "score" or "reranker_score".
|
|
|
|
Defaulted to "score".
|
2024-05-22 20:36:06 +00:00
|
|
|
filters: Filtering expression.
|
2023-12-11 21:53:30 +00:00
|
|
|
|
|
|
|
Returns:
|
2024-05-16 19:54:32 +00:00
|
|
|
List[Tuple[Document, float]]: A list of documents and their
|
|
|
|
corresponding scores.
|
2023-12-11 21:53:30 +00:00
|
|
|
"""
|
|
|
|
docs_and_scores = self.semantic_hybrid_search_with_score_and_rerank(
|
2024-05-22 20:36:06 +00:00
|
|
|
query, k=k, **kwargs
|
2023-12-11 21:53:30 +00:00
|
|
|
)
|
2024-05-16 19:54:32 +00:00
|
|
|
if score_type == "score":
|
|
|
|
return [
|
|
|
|
(doc, score)
|
|
|
|
for doc, score, _ in docs_and_scores
|
|
|
|
if score_threshold is None or score >= score_threshold
|
|
|
|
]
|
|
|
|
elif score_type == "reranker_score":
|
|
|
|
return [
|
|
|
|
(doc, reranker_score)
|
|
|
|
for doc, _, reranker_score in docs_and_scores
|
|
|
|
if score_threshold is None or reranker_score >= score_threshold
|
|
|
|
]
|
2023-12-11 21:53:30 +00:00
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def asemantic_hybrid_search_with_score(
|
|
|
|
self,
|
|
|
|
query: str,
|
|
|
|
k: int = 4,
|
|
|
|
score_type: Literal["score", "reranker_score"] = "score",
|
|
|
|
*,
|
|
|
|
score_threshold: Optional[float] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
"""
|
|
|
|
Returns the most similar indexed documents to the query text.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
query (str): The query text for which to find similar documents.
|
|
|
|
k (int): The number of documents to return. Default is 4.
|
|
|
|
score_type: Must either be "score" or "reranker_score".
|
|
|
|
Defaulted to "score".
|
|
|
|
filters: Filtering expression.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
List[Tuple[Document, float]]: A list of documents and their
|
|
|
|
corresponding scores.
|
|
|
|
"""
|
|
|
|
docs_and_scores = await self.asemantic_hybrid_search_with_score_and_rerank(
|
|
|
|
query, k=k, **kwargs
|
|
|
|
)
|
|
|
|
if score_type == "score":
|
|
|
|
return [
|
|
|
|
(doc, score)
|
|
|
|
for doc, score, _ in docs_and_scores
|
|
|
|
if score_threshold is None or score >= score_threshold
|
|
|
|
]
|
|
|
|
elif score_type == "reranker_score":
|
|
|
|
return [
|
|
|
|
(doc, reranker_score)
|
|
|
|
for doc, _, reranker_score in docs_and_scores
|
|
|
|
if score_threshold is None or reranker_score >= score_threshold
|
|
|
|
]
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
    def semantic_hybrid_search_with_score_and_rerank(
        self, query: str, k: int = 4, *, filters: Optional[str] = None, **kwargs: Any
    ) -> List[Tuple[Document, float, float]]:
        """Return docs most similar to query with a semantic hybrid query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filters: Filtering expression.

        Returns:
            List of (Document, search score, reranker score) triples,
            one per search hit.
        """
        # Imported lazily so the module can load without the azure SDK installed.
        from azure.search.documents.models import VectorizedQuery

        # Hybrid: the raw query text plus its embedding, with semantic
        # ranking, captions and answers enabled.
        results = self.client.search(
            search_text=query,
            vector_queries=[
                VectorizedQuery(
                    vector=np.array(self.embed_query(query), dtype=np.float32).tolist(),
                    k_nearest_neighbors=k,
                    fields=FIELDS_CONTENT_VECTOR,
                )
            ],
            filter=filters,
            query_type="semantic",
            semantic_configuration_name=self.semantic_configuration_name,
            query_caption="extractive",
            query_answer="extractive",
            top=k,
            **kwargs,
        )
        # Get Semantic Answers, keyed by document key for lookup below.
        semantic_answers = results.get_answers() or []
        semantic_answers_dict: Dict = {}
        for semantic_answer in semantic_answers:
            semantic_answers_dict[semantic_answer.key] = {
                "text": semantic_answer.text,
                "highlights": semantic_answer.highlights,
            }
        # Convert results to Document objects, merging stored metadata with
        # the per-hit captions and any matching semantic answer.
        docs = [
            (
                Document(
                    page_content=result.pop(FIELDS_CONTENT),
                    metadata={
                        # Prefer the JSON metadata field; otherwise fall back
                        # to all remaining fields except the content vector.
                        **(
                            json.loads(result[FIELDS_METADATA])
                            if FIELDS_METADATA in result
                            else {
                                k: v
                                for k, v in result.items()
                                if k != FIELDS_CONTENT_VECTOR
                            }
                        ),
                        **{
                            "captions": {
                                "text": result.get("@search.captions", [{}])[0].text,
                                "highlights": result.get("@search.captions", [{}])[
                                    0
                                ].highlights,
                            }
                            if result.get("@search.captions")
                            else {},
                            "answers": semantic_answers_dict.get(
                                result.get(FIELDS_ID, ""),
                                "",
                            ),
                        },
                    },
                ),
                float(result["@search.score"]),
                float(result["@search.reranker_score"]),
            )
            for result in results
        ]
        return docs
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
    async def asemantic_hybrid_search_with_score_and_rerank(
        self, query: str, k: int = 4, *, filters: Optional[str] = None, **kwargs: Any
    ) -> List[Tuple[Document, float, float]]:
        """Asynchronously return docs for a semantic hybrid query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filters: Filtering expression.

        Returns:
            List of (Document, search score, reranker score) triples,
            one per search hit.
        """
        # Imported lazily so the module can load without the azure SDK installed.
        from azure.search.documents.models import VectorizedQuery

        vector = await self._aembed_query(query)
        async with self._async_client() as async_client:
            # Hybrid: the raw query text plus its embedding, with semantic
            # ranking, captions and answers enabled.
            results = await async_client.search(
                search_text=query,
                vector_queries=[
                    VectorizedQuery(
                        vector=np.array(vector, dtype=np.float32).tolist(),
                        k_nearest_neighbors=k,
                        fields=FIELDS_CONTENT_VECTOR,
                    )
                ],
                filter=filters,
                query_type="semantic",
                semantic_configuration_name=self.semantic_configuration_name,
                query_caption="extractive",
                query_answer="extractive",
                top=k,
                **kwargs,
            )
            # Get Semantic Answers, keyed by document key for lookup below.
            semantic_answers = (await results.get_answers()) or []
            semantic_answers_dict: Dict = {}
            for semantic_answer in semantic_answers:
                semantic_answers_dict[semantic_answer.key] = {
                    "text": semantic_answer.text,
                    "highlights": semantic_answer.highlights,
                }
            # Convert results to Document objects while the client is still
            # open, merging stored metadata with captions and answers.
            docs = [
                (
                    Document(
                        page_content=result.pop(FIELDS_CONTENT),
                        metadata={
                            # Prefer the JSON metadata field; otherwise fall
                            # back to all fields except the content vector.
                            **(
                                json.loads(result[FIELDS_METADATA])
                                if FIELDS_METADATA in result
                                else {
                                    k: v
                                    for k, v in result.items()
                                    if k != FIELDS_CONTENT_VECTOR
                                }
                            ),
                            **{
                                "captions": {
                                    "text": result.get("@search.captions", [{}])[
                                        0
                                    ].text,
                                    "highlights": result.get("@search.captions", [{}])[
                                        0
                                    ].highlights,
                                }
                                if result.get("@search.captions")
                                else {},
                                "answers": semantic_answers_dict.get(
                                    result.get(FIELDS_ID, ""),
                                    "",
                                ),
                            },
                        },
                    ),
                    float(result["@search.score"]),
                    float(result["@search.reranker_score"]),
                )
                async for result in results
            ]
            return docs
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
@classmethod
|
|
|
|
def from_texts(
|
|
|
|
cls: Type[AzureSearch],
|
|
|
|
texts: List[str],
|
|
|
|
embedding: Embeddings,
|
|
|
|
metadatas: Optional[List[dict]] = None,
|
|
|
|
azure_search_endpoint: str = "",
|
|
|
|
azure_search_key: str = "",
|
|
|
|
index_name: str = "langchain-index",
|
2024-03-09 01:05:35 +00:00
|
|
|
fields: Optional[List[SearchField]] = None,
|
2023-12-11 21:53:30 +00:00
|
|
|
**kwargs: Any,
|
|
|
|
) -> AzureSearch:
|
|
|
|
# Creating a new Azure Search instance
|
|
|
|
azure_search = cls(
|
|
|
|
azure_search_endpoint,
|
|
|
|
azure_search_key,
|
|
|
|
index_name,
|
community: Fixing a performance issue with AzureSearch to perform batch embedding (#15594)
- **Description:** Azure Cognitive Search vector DB store performs slow
embedding as it does not utilize the batch embedding functionality. This
PR provide a fix to improve the performance of Azure Search class when
adding documents to the vector search,
- **Issue:** #11313 ,
- **Dependencies:** any dependencies required for this change,
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!
Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.
See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/
If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.
If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
-->
2024-01-12 18:58:55 +00:00
|
|
|
embedding,
|
2024-03-09 01:05:35 +00:00
|
|
|
fields=fields,
|
2024-05-22 20:36:06 +00:00
|
|
|
**kwargs,
|
2023-12-11 21:53:30 +00:00
|
|
|
)
|
|
|
|
azure_search.add_texts(texts, metadatas, **kwargs)
|
|
|
|
return azure_search
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
@classmethod
|
|
|
|
async def afrom_texts(
|
|
|
|
cls: Type[AzureSearch],
|
|
|
|
texts: List[str],
|
|
|
|
embedding: Embeddings,
|
|
|
|
metadatas: Optional[List[dict]] = None,
|
|
|
|
azure_search_endpoint: str = "",
|
|
|
|
azure_search_key: str = "",
|
|
|
|
index_name: str = "langchain-index",
|
|
|
|
fields: Optional[List[SearchField]] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> AzureSearch:
|
|
|
|
# Creating a new Azure Search instance
|
|
|
|
azure_search = cls(
|
|
|
|
azure_search_endpoint,
|
|
|
|
azure_search_key,
|
|
|
|
index_name,
|
|
|
|
embedding,
|
|
|
|
fields=fields,
|
|
|
|
**kwargs,
|
|
|
|
)
|
|
|
|
await azure_search.aadd_texts(texts, metadatas, **kwargs)
|
|
|
|
return azure_search
|
|
|
|
|
2024-05-22 20:36:06 +00:00
|
|
|
@classmethod
|
|
|
|
async def afrom_embeddings(
|
|
|
|
cls: Type[AzureSearch],
|
|
|
|
text_embeddings: Iterable[Tuple[str, List[float]]],
|
|
|
|
embedding: Embeddings,
|
|
|
|
metadatas: Optional[List[dict]] = None,
|
|
|
|
*,
|
|
|
|
azure_search_endpoint: str = "",
|
|
|
|
azure_search_key: str = "",
|
|
|
|
index_name: str = "langchain-index",
|
|
|
|
fields: Optional[List[SearchField]] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> AzureSearch:
|
2024-06-05 21:39:54 +00:00
|
|
|
text_embeddings, first_text_embedding = _peek(text_embeddings)
|
|
|
|
if first_text_embedding is None:
|
|
|
|
raise ValueError("Cannot create AzureSearch from empty embeddings.")
|
|
|
|
vector_search_dimensions = len(first_text_embedding[1])
|
|
|
|
|
|
|
|
azure_search = cls(
|
2024-05-22 20:36:06 +00:00
|
|
|
azure_search_endpoint=azure_search_endpoint,
|
|
|
|
azure_search_key=azure_search_key,
|
|
|
|
index_name=index_name,
|
2024-06-05 21:39:54 +00:00
|
|
|
embedding_function=embedding,
|
2024-05-22 20:36:06 +00:00
|
|
|
fields=fields,
|
2024-06-05 21:39:54 +00:00
|
|
|
vector_search_dimensions=vector_search_dimensions,
|
2024-05-22 20:36:06 +00:00
|
|
|
**kwargs,
|
|
|
|
)
|
2024-06-05 21:39:54 +00:00
|
|
|
await azure_search.aadd_embeddings(text_embeddings, metadatas, **kwargs)
|
|
|
|
return azure_search
|
2024-05-22 20:36:06 +00:00
|
|
|
|
|
|
|
@classmethod
|
|
|
|
def from_embeddings(
|
|
|
|
cls: Type[AzureSearch],
|
|
|
|
text_embeddings: Iterable[Tuple[str, List[float]]],
|
|
|
|
embedding: Embeddings,
|
|
|
|
metadatas: Optional[List[dict]] = None,
|
|
|
|
*,
|
|
|
|
azure_search_endpoint: str = "",
|
|
|
|
azure_search_key: str = "",
|
|
|
|
index_name: str = "langchain-index",
|
|
|
|
fields: Optional[List[SearchField]] = None,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> AzureSearch:
|
|
|
|
# Creating a new Azure Search instance
|
|
|
|
text_embeddings, first_text_embedding = _peek(text_embeddings)
|
|
|
|
if first_text_embedding is None:
|
|
|
|
raise ValueError("Cannot create AzureSearch from empty embeddings.")
|
|
|
|
vector_search_dimensions = len(first_text_embedding[1])
|
|
|
|
|
|
|
|
azure_search = cls(
|
|
|
|
azure_search_endpoint=azure_search_endpoint,
|
|
|
|
azure_search_key=azure_search_key,
|
|
|
|
index_name=index_name,
|
|
|
|
embedding_function=embedding,
|
|
|
|
fields=fields,
|
|
|
|
vector_search_dimensions=vector_search_dimensions,
|
|
|
|
**kwargs,
|
|
|
|
)
|
|
|
|
azure_search.add_embeddings(text_embeddings, metadatas, **kwargs)
|
|
|
|
return azure_search
|
|
|
|
|
2024-06-05 21:39:54 +00:00
|
|
|
async def _areorder_results_with_maximal_marginal_relevance(
|
|
|
|
self,
|
|
|
|
documents: List[Document],
|
|
|
|
scores: List[float],
|
|
|
|
vectors: List[List[float]],
|
|
|
|
query_embedding: np.ndarray,
|
|
|
|
lambda_mult: float = 0.5,
|
|
|
|
k: int = 4,
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
# Get the new order of results.
|
|
|
|
new_ordering = maximal_marginal_relevance(
|
|
|
|
query_embedding, vectors, k=k, lambda_mult=lambda_mult
|
|
|
|
)
|
|
|
|
|
|
|
|
# Reorder the values and return.
|
|
|
|
ret: List[Tuple[Document, float]] = []
|
|
|
|
for x in new_ordering:
|
|
|
|
# Function can return -1 index
|
|
|
|
if x == -1:
|
|
|
|
break
|
|
|
|
ret.append((documents[x], scores[x])) # type: ignore
|
|
|
|
|
|
|
|
return ret
|
|
|
|
|
2024-04-18 20:06:47 +00:00
|
|
|
def as_retriever(self, **kwargs: Any) -> AzureSearchVectorStoreRetriever: # type: ignore
|
|
|
|
"""Return AzureSearchVectorStoreRetriever initialized from this VectorStore.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
search_type (Optional[str]): Defines the type of search that
|
|
|
|
the Retriever should perform.
|
|
|
|
Can be "similarity" (default), "hybrid", or
|
|
|
|
"semantic_hybrid".
|
|
|
|
search_kwargs (Optional[Dict]): Keyword arguments to pass to the
|
|
|
|
search function. Can include things like:
|
|
|
|
score_threshold: Minimum relevance threshold
|
|
|
|
for similarity_score_threshold
|
|
|
|
fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
|
|
|
|
lambda_mult: Diversity of results returned by MMR;
|
|
|
|
1 for minimum diversity and 0 for maximum. (Default: 0.5)
|
|
|
|
filter: Filter by document metadata
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
AzureSearchVectorStoreRetriever: Retriever class for VectorStore.
|
|
|
|
"""
|
|
|
|
tags = kwargs.pop("tags", None) or []
|
|
|
|
tags.extend(self._get_retriever_tags())
|
|
|
|
return AzureSearchVectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
|
|
|
|
class AzureSearchVectorStoreRetriever(BaseRetriever):
    """Retriever that uses `Azure Cognitive Search`."""

    vectorstore: AzureSearch
    """Azure Search instance used to find similar documents."""
    search_type: str = "hybrid"
    """Type of search to perform. Options are "similarity", "hybrid",
    "semantic_hybrid", "similarity_score_threshold", "hybrid_score_threshold",
    or "semantic_hybrid_score_threshold"."""
    k: int = 4
    """Number of documents to return."""
    search_kwargs: dict = {}
    """Search params.
        score_threshold: Minimum relevance threshold
            for similarity_score_threshold
        fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
        lambda_mult: Diversity of results returned by MMR;
            1 for minimum diversity and 0 for maximum. (Default: 0.5)
        filter: Filter by document metadata
    """

    # Closed set of supported search modes; enforced by the root validator.
    allowed_search_types: ClassVar[Collection[str]] = (
        "similarity",
        "similarity_score_threshold",
        "hybrid",
        "hybrid_score_threshold",
        "semantic_hybrid",
        "semantic_hybrid_score_threshold",
    )

    class Config:
        """Configuration for this pydantic object."""

        # Needed because `vectorstore` is a non-pydantic AzureSearch instance.
        arbitrary_types_allowed = True

    @root_validator()
    def validate_search_type(cls, values: Dict) -> Dict:
        """Validate search type."""
        if "search_type" in values:
            search_type = values["search_type"]
            if search_type not in cls.allowed_search_types:
                raise ValueError(
                    f"search_type of {search_type} not allowed. Valid values are: "
                    f"{cls.allowed_search_types}"
                )
        return values

    def _get_relevant_documents(
        self,
        query: str,
        run_manager: CallbackManagerForRetrieverRun,
        **kwargs: Any,
    ) -> List[Document]:
        """Return documents relevant to ``query`` via the configured search type."""
        # Per-call kwargs take precedence over retriever-level search_kwargs.
        params = {**self.search_kwargs, **kwargs}

        # Dispatch on search_type; *_score_threshold variants drop the scores
        # and keep only the documents.
        if self.search_type == "similarity":
            docs = self.vectorstore.vector_search(query, k=self.k, **params)
        elif self.search_type == "similarity_score_threshold":
            docs = [
                doc
                for doc, _ in self.vectorstore.similarity_search_with_relevance_scores(
                    query, k=self.k, **params
                )
            ]
        elif self.search_type == "hybrid":
            docs = self.vectorstore.hybrid_search(query, k=self.k, **params)
        elif self.search_type == "hybrid_score_threshold":
            docs = [
                doc
                for doc, _ in self.vectorstore.hybrid_search_with_relevance_scores(
                    query, k=self.k, **params
                )
            ]
        elif self.search_type == "semantic_hybrid":
            docs = self.vectorstore.semantic_hybrid_search(query, k=self.k, **params)
        elif self.search_type == "semantic_hybrid_score_threshold":
            docs = [
                doc
                for doc, _ in self.vectorstore.semantic_hybrid_search_with_score(
                    query, k=self.k, **params
                )
            ]
        else:
            # Should be unreachable after validate_search_type, but guards
            # against direct attribute mutation.
            raise ValueError(f"search_type of {self.search_type} not allowed.")
        return docs

    async def _aget_relevant_documents(
        self,
        query: str,
        *,
        run_manager: AsyncCallbackManagerForRetrieverRun,
        **kwargs: Any,
    ) -> List[Document]:
        """Async counterpart of ``_get_relevant_documents``; same dispatch."""
        # Per-call kwargs take precedence over retriever-level search_kwargs.
        params = {**self.search_kwargs, **kwargs}

        if self.search_type == "similarity":
            docs = await self.vectorstore.avector_search(query, k=self.k, **params)
        elif self.search_type == "similarity_score_threshold":
            docs_and_scores = (
                await self.vectorstore.asimilarity_search_with_relevance_scores(
                    query, k=self.k, **params
                )
            )
            docs = [doc for doc, _ in docs_and_scores]
        elif self.search_type == "hybrid":
            docs = await self.vectorstore.ahybrid_search(query, k=self.k, **params)
        elif self.search_type == "hybrid_score_threshold":
            docs_and_scores = (
                await self.vectorstore.ahybrid_search_with_relevance_scores(
                    query, k=self.k, **params
                )
            )
            docs = [doc for doc, _ in docs_and_scores]
        elif self.search_type == "semantic_hybrid":
            docs = await self.vectorstore.asemantic_hybrid_search(
                query, k=self.k, **params
            )
        elif self.search_type == "semantic_hybrid_score_threshold":
            docs = [
                doc
                for doc, _ in await self.vectorstore.asemantic_hybrid_search_with_score(
                    query, k=self.k, **params
                )
            ]
        else:
            # Should be unreachable after validate_search_type, but guards
            # against direct attribute mutation.
            raise ValueError(f"search_type of {self.search_type} not allowed.")
        return docs
|
2024-05-22 20:36:06 +00:00
|
|
|
|
|
|
|
|
|
|
|
def _results_to_documents(
    results: SearchItemPaged[Dict],
) -> List[Tuple[Document, float]]:
    """Convert raw Azure Search hits into ``(Document, score)`` pairs."""
    pairs: List[Tuple[Document, float]] = []
    for hit in results:
        # "@search.score" is the service-assigned relevance score.
        pairs.append((_result_to_document(hit), float(hit["@search.score"])))
    return pairs
|
|
|
|
|
|
|
|
|
|
|
|
def _reorder_results_with_maximal_marginal_relevance(
    results: SearchItemPaged[Dict],
    query_embedding: np.ndarray,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[Tuple[Document, float]]:
    """Re-rank raw Azure Search hits using maximal marginal relevance (MMR).

    Args:
        results: Paged search results from the Azure SDK.
        query_embedding: Embedding of the original query.
        lambda_mult: Diversity trade-off (0 = max diversity, 1 = min).
        k: Maximum number of results to return.

    Returns:
        Up to ``k`` ``(Document, score)`` pairs in MMR order.
    """
    # Convert results to (document, score, content-vector) triples.
    docs = [
        (
            _result_to_document(result),
            float(result["@search.score"]),
            result[FIELDS_CONTENT_VECTOR],
        )
        for result in results
    ]
    # Guard the empty result set: zip(*[]) yields nothing and unpacking it
    # into three targets would raise ValueError. Nothing to rank anyway.
    if not docs:
        return []
    documents, scores, vectors = map(list, zip(*docs))

    # Get the new order of results.
    new_ordering = maximal_marginal_relevance(
        query_embedding, vectors, k=k, lambda_mult=lambda_mult
    )

    # Reorder the values and return.
    ret: List[Tuple[Document, float]] = []
    for x in new_ordering:
        # maximal_marginal_relevance can return a -1 index when it runs out
        # of candidates; stop there.
        if x == -1:
            break
        ret.append((documents[x], scores[x]))  # type: ignore

    return ret
|
|
|
|
|
|
|
|
|
|
|
|
def _result_to_document(result: Dict) -> Document:
    """Build a Document from one raw Azure Search hit.

    Pops the content field off the hit; metadata comes from the serialized
    metadata field when present, otherwise from all remaining fields except
    the content vector.
    """
    content = result.pop(FIELDS_CONTENT)
    if FIELDS_METADATA in result:
        metadata = json.loads(result[FIELDS_METADATA])
    else:
        # No serialized metadata column: fall back to every remaining field,
        # skipping the (large) content vector.
        metadata = {
            key: value
            for key, value in result.items()
            if key != FIELDS_CONTENT_VECTOR
        }
    return Document(page_content=content, metadata=metadata)
|
|
|
|
|
|
|
|
|
|
|
|
def _peek(iterable: Iterable, default: Optional[Any] = None) -> Tuple[Iterable, Any]:
|
|
|
|
try:
|
|
|
|
iterator = iter(iterable)
|
|
|
|
value = next(iterator)
|
|
|
|
iterable = itertools.chain([value], iterator)
|
|
|
|
return iterable, value
|
|
|
|
except StopIteration:
|
|
|
|
return iterable, default
|