mirror of https://github.com/hwchase17/langchain
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
799 lines
28 KiB
Python
799 lines
28 KiB
Python
from __future__ import annotations
|
|
|
|
import uuid
|
|
import warnings
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Any,
|
|
Dict,
|
|
Iterable,
|
|
List,
|
|
Mapping,
|
|
Optional,
|
|
Tuple,
|
|
Union,
|
|
)
|
|
|
|
from langchain_core._api import deprecated
|
|
from langchain_core.documents import Document
|
|
from langchain_core.embeddings import Embeddings
|
|
from langchain_core.utils import get_from_dict_or_env
|
|
from langchain_core.vectorstores import VectorStore
|
|
|
|
if TYPE_CHECKING:
|
|
from elasticsearch import Elasticsearch
|
|
|
|
|
|
def _default_text_mapping(dim: int) -> Dict:
|
|
return {
|
|
"properties": {
|
|
"text": {"type": "text"},
|
|
"vector": {"type": "dense_vector", "dims": dim},
|
|
}
|
|
}
|
|
|
|
|
|
def _default_script_query(query_vector: List[float], filter: Optional[dict]) -> Dict:
|
|
if filter:
|
|
((key, value),) = filter.items()
|
|
filter = {"match": {f"metadata.{key}.keyword": f"{value}"}}
|
|
else:
|
|
filter = {"match_all": {}}
|
|
return {
|
|
"script_score": {
|
|
"query": filter,
|
|
"script": {
|
|
"source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
|
|
"params": {"query_vector": query_vector},
|
|
},
|
|
}
|
|
}
|
|
|
|
|
|
class ElasticVectorSearch(VectorStore):
    """Elasticsearch vector store that scores vectors by brute force.

    ElasticVectorSearch uses the brute force method of searching on vectors.

    Recommended to use ElasticsearchStore instead, which gives you the option
    to use the approx HNSW algorithm which performs better on large datasets.

    ElasticsearchStore also supports metadata filtering, customising the
    query retriever and much more!

    You can read more on ElasticsearchStore:
    https://python.langchain.com/docs/integrations/vectorstores/elasticsearch

    To connect to an `Elasticsearch` instance that does not require
    login credentials, pass the Elasticsearch URL and index name along with the
    embedding object to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ElasticVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()
            elastic_vector_search = ElasticVectorSearch(
                elasticsearch_url="http://localhost:9200",
                index_name="test_index",
                embedding=embedding
            )


    To connect to an Elasticsearch instance that requires login credentials,
    including Elastic Cloud, use the Elasticsearch URL format
    https://username:password@es_host:9243. For example, to connect to Elastic
    Cloud, create the Elasticsearch URL with the required authentication details and
    pass it to the ElasticVectorSearch constructor as the named parameter
    elasticsearch_url.

    You can obtain your Elastic Cloud URL and login credentials by logging in to the
    Elastic Cloud console at https://cloud.elastic.co, selecting your deployment, and
    navigating to the "Deployments" page.

    To obtain your Elastic Cloud password for the default "elastic" user:

    1. Log in to the Elastic Cloud console at https://cloud.elastic.co
    2. Go to "Security" > "Users"
    3. Locate the "elastic" user and click "Edit"
    4. Click "Reset password"
    5. Follow the prompts to reset the password

    The format for Elastic Cloud URLs is
    https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ElasticVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()

            elastic_host = "cluster_id.region_id.gcp.cloud.es.io"
            elasticsearch_url = f"https://username:password@{elastic_host}:9243"
            elastic_vector_search = ElasticVectorSearch(
                elasticsearch_url=elasticsearch_url,
                index_name="test_index",
                embedding=embedding
            )

    Args:
        elasticsearch_url (str): The URL for the Elasticsearch instance.
        index_name (str): The name of the Elasticsearch index for the embeddings.
        embedding (Embeddings): An object that provides the ability to embed text.
                It should be an instance of a class that subclasses the Embeddings
                abstract base class, such as OpenAIEmbeddings()
        ssl_verify (dict, optional): Extra keyword arguments forwarded unchanged
                to the ``elasticsearch.Elasticsearch`` constructor.

    Raises:
        ImportError: If the elasticsearch python package is not installed.
        ValueError: If the Elasticsearch client rejects the connection arguments.
    """

    def __init__(
        self,
        elasticsearch_url: str,
        index_name: str,
        embedding: Embeddings,
        *,
        ssl_verify: Optional[Dict[str, Any]] = None,
    ):
        """Initialize with necessary components."""
        warnings.warn(
            "ElasticVectorSearch will be removed in a future release. See "
            "Elasticsearch integration docs on how to upgrade."
        )

        try:
            import elasticsearch
        except ImportError as e:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from e
        self.embedding = embedding
        self.index_name = index_name
        _ssl_verify = ssl_verify or {}
        try:
            self.client = elasticsearch.Elasticsearch(
                elasticsearch_url,
                **_ssl_verify,
                headers={"user-agent": self.get_user_agent()},
            )
        except ValueError as e:
            raise ValueError(
                f"Your elasticsearch client string is mis-formatted. Got error: {e} "
            ) from e

    @staticmethod
    def get_user_agent() -> str:
        """Return the user-agent string sent with every client request."""
        from langchain_community import __version__

        return f"langchain-py-dvs/{__version__}"

    @staticmethod
    def _major_version(client: Any) -> int:
        """Return the major version reported by ``client.info()``.

        The whole leading component of the dotted version string is parsed, so
        multi-digit majors such as ``"10.1.0"`` are read as 10 rather than 1.
        """
        number = str(client.info()["version"]["number"])
        return int(number.split(".", 1)[0])

    @property
    def embeddings(self) -> Embeddings:
        """The embedding object supplied at construction time."""
        return self.embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs. Random UUIDs are generated when
                omitted or empty.
            refresh_indices: bool to refresh ElasticSearch indices

        Returns:
            List of ids from adding the texts into the vectorstore. An empty
            list is returned, and nothing is sent to Elasticsearch, when
            ``texts`` is empty.

        Raises:
            ImportError: If the elasticsearch python package is not installed.
        """
        try:
            from elasticsearch.exceptions import NotFoundError
            from elasticsearch.helpers import bulk
        except ImportError as e:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from e

        # ``texts`` is walked several times below; materialise it once so a
        # one-shot iterable (e.g. a generator) is not exhausted by the first pass.
        texts = list(texts)
        if not texts:
            return []

        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self.embedding.embed_documents(texts)
        dim = len(embeddings[0])
        mapping = _default_text_mapping(dim)

        # check to see if the index already exists
        try:
            self.client.indices.get(index=self.index_name)
        except NotFoundError:
            # TODO would be nice to create index before embedding,
            # just to save expensive steps for last
            self.create_index(self.client, self.index_name, mapping)

        requests = [
            {
                "_op_type": "index",
                "_index": self.index_name,
                "vector": embeddings[i],
                "text": text,
                "metadata": metadatas[i] if metadatas else {},
                "_id": ids[i],
            }
            for i, text in enumerate(texts)
        ]
        bulk(self.client, requests)

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)
        return ids

    def similarity_search(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Optional metadata filter passed on to
                ``similarity_search_with_score``.

        Returns:
            List of Documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
        return [doc for doc, _ in docs_and_scores]

    def similarity_search_with_score(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query, together with their scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Optional metadata filter used to build the inner query of
                the ``script_score`` request.

        Returns:
            List of ``(Document, score)`` tuples, where the score is the
            ``_score`` Elasticsearch reported for the hit.
        """
        embedding = self.embedding.embed_query(query)
        script_query = _default_script_query(embedding, filter)
        response = self.client_search(
            self.client, self.index_name, script_query, size=k
        )
        return [
            (
                Document(
                    page_content=hit["_source"]["text"],
                    metadata=hit["_source"]["metadata"],
                ),
                hit["_score"],
            )
            for hit in response["hits"]["hits"]
        ]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        index_name: Optional[str] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> ElasticVectorSearch:
        """Construct ElasticVectorSearch wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Creates a new index for the embeddings in the Elasticsearch instance.
            3. Adds the documents to the newly created Elasticsearch index.

        This is intended to be a quick way to get started.

        The Elasticsearch URL is read from the ``elasticsearch_url`` keyword
        argument or, failing that, the ``ELASTICSEARCH_URL`` environment
        variable. A random index name is generated when ``index_name`` is not
        given. Remaining keyword arguments are forwarded to the constructor.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import ElasticVectorSearch
                from langchain_community.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                elastic_vector_search = ElasticVectorSearch.from_texts(
                    texts,
                    embeddings,
                    elasticsearch_url="http://localhost:9200"
                )
        """
        elasticsearch_url = get_from_dict_or_env(
            kwargs, "elasticsearch_url", "ELASTICSEARCH_URL"
        )
        # The URL is passed positionally below, so it must not also reach the
        # constructor through **kwargs.
        kwargs.pop("elasticsearch_url", None)
        index_name = index_name or uuid.uuid4().hex
        vectorsearch = cls(elasticsearch_url, index_name, embedding, **kwargs)
        vectorsearch.add_texts(
            texts, metadatas=metadatas, ids=ids, refresh_indices=refresh_indices
        )
        return vectorsearch

    def create_index(self, client: Any, index_name: str, mapping: Dict) -> None:
        """Create ``index_name`` with ``mapping``.

        Servers reporting major version 8 or later receive the mapping through
        the ``mappings`` keyword; older ones receive it inside ``body``.
        """
        if self._major_version(client) >= 8:
            client.indices.create(index=index_name, mappings=mapping)
        else:
            client.indices.create(index=index_name, body={"mappings": mapping})

    def client_search(
        self, client: Any, index_name: str, script_query: Dict, size: int
    ) -> Any:
        """Run ``script_query`` against ``index_name`` and return the raw response.

        Servers reporting major version 8 or later receive ``query`` and
        ``size`` as keywords; older ones receive them inside ``body``.
        """
        if self._major_version(client) >= 8:
            response = client.search(index=index_name, query=script_query, size=size)
        else:
            response = client.search(
                index=index_name, body={"query": script_query, "size": size}
            )
        return response

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
        """Delete by vector IDs.

        Args:
            ids: List of ids to delete.

        Raises:
            ValueError: If ``ids`` is None.
        """

        if ids is None:
            raise ValueError("No ids provided to delete.")

        # TODO: Check if this can be done in bulk
        for doc_id in ids:
            self.client.delete(index=self.index_name, id=doc_id)
|
|
|
|
|
|
@deprecated("0.0.265", alternative="ElasticsearchStore class.", pending=True)
class ElasticKnnSearch(VectorStore):
    """[DEPRECATED] `Elasticsearch` with k-nearest neighbor search
    (`k-NN`) vector store.

    Recommended to use ElasticsearchStore instead, which supports
    metadata filtering, customising the query retriever and much more!

    You can read more on ElasticsearchStore:
    https://python.langchain.com/docs/integrations/vectorstores/elasticsearch

    It creates an Elasticsearch index of text data that
    can be searched using k-NN search. The text data is transformed into
    vector embeddings using a provided embedding model, and these embeddings
    are stored in the Elasticsearch index.

    Args:
        index_name (str): The name of the Elasticsearch index.
        embedding (Embeddings): The embedding model to use for transforming text data
            into vector embeddings.
        es_connection (Elasticsearch, optional): An existing Elasticsearch connection.
        es_cloud_id (str, optional): The Cloud ID of your Elasticsearch Service
            deployment.
        es_user (str, optional): The username for your Elasticsearch Service deployment.
        es_password (str, optional): The password for your Elasticsearch Service
            deployment.
        vector_query_field (str, optional): The name of the field in the Elasticsearch
            index that contains the vector embeddings.
        query_field (str, optional): The name of the field in the Elasticsearch index
            that contains the original text data.

    Raises:
        ImportError: If the elasticsearch python package is not installed.
        ValueError: If neither ``es_connection`` nor the full set of
            ``es_cloud_id``, ``es_user`` and ``es_password`` is given.

    Usage (``embedding`` and ``es_client`` are assumed to exist already):
        >>> es_search = ElasticKnnSearch(
        ...     'my_index', embedding, es_connection=es_client
        ... )
        >>> es_search.add_texts(['Hello world!', 'Another text'], dims=300)
        >>> results = es_search.similarity_search_with_score('Hello')
    """

    def __init__(
        self,
        index_name: str,
        embedding: Embeddings,
        es_connection: Optional["Elasticsearch"] = None,
        es_cloud_id: Optional[str] = None,
        es_user: Optional[str] = None,
        es_password: Optional[str] = None,
        vector_query_field: Optional[str] = "vector",
        query_field: Optional[str] = "text",
    ):
        try:
            import elasticsearch
        except ImportError as e:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from e

        warnings.warn(
            "ElasticKnnSearch will be removed in a future release. "
            "Use ElasticsearchStore instead. See Elasticsearch "
            "integration docs on how to upgrade."
        )
        self.embedding = embedding
        self.index_name = index_name
        self.query_field = query_field
        self.vector_query_field = vector_query_field

        # If a pre-existing Elasticsearch connection is provided, use it.
        if es_connection is not None:
            self.client = es_connection
        # If credentials for a new Elasticsearch connection are provided,
        # create a new connection.
        elif es_cloud_id and es_user and es_password:
            self.client = elasticsearch.Elasticsearch(
                cloud_id=es_cloud_id, basic_auth=(es_user, es_password)
            )
        else:
            raise ValueError(
                "Either provide a pre-existing Elasticsearch connection, "
                "or valid credentials for creating a new connection."
            )

    @staticmethod
    def _default_knn_mapping(
        dims: int, similarity: Optional[str] = "dot_product"
    ) -> Dict:
        """Build the index mapping for an indexed ``dense_vector`` field.

        Args:
            dims: Value written to the ``dims`` setting of the ``vector`` field.
            similarity: Value written to the ``similarity`` setting of the
                ``vector`` field.

        Returns:
            A mapping with a ``text`` field and an indexed ``vector`` field.
        """
        return {
            "properties": {
                "text": {"type": "text"},
                "vector": {
                    "type": "dense_vector",
                    "dims": dims,
                    "index": True,
                    "similarity": similarity,
                },
            }
        }

    def _default_knn_query(
        self,
        query_vector: Optional[List[float]] = None,
        query: Optional[str] = None,
        model_id: Optional[str] = None,
        k: Optional[int] = 10,
        num_candidates: Optional[int] = 10,
    ) -> Dict:
        """Build the ``knn`` section of a search request.

        Args:
            query_vector: Pre-computed query embedding. Used only when
                ``model_id`` is not given.
            query: Query text, embedded server-side when ``model_id`` is given.
            model_id: ID of the model used to embed ``query``.
            k: Number of nearest neighbours requested.
            num_candidates: Number of candidates requested; raised to ``k``
                when it is smaller.

        Returns:
            The ``knn`` query body.

        Raises:
            ValueError: If neither a ``query_vector`` without ``model_id`` nor a
                ``query`` with ``model_id`` is supplied.
        """
        # Elasticsearch rejects kNN requests whose num_candidates is below k,
        # so a caller asking for k > 10 would otherwise trip over the default.
        if k is not None and num_candidates is not None and num_candidates < k:
            num_candidates = k

        knn: Dict = {
            "field": self.vector_query_field,
            "k": k,
            "num_candidates": num_candidates,
        }

        # Case 1: `query_vector` is provided, but not `model_id` -> use query_vector
        if query_vector and not model_id:
            knn["query_vector"] = query_vector

        # Case 2: `query` and `model_id` are provided, -> use query_vector_builder
        elif query and model_id:
            knn["query_vector_builder"] = {
                "text_embedding": {
                    "model_id": model_id,  # use 'model_id' argument
                    "model_text": query,  # use 'query' argument
                }
            }

        else:
            raise ValueError(
                "Either `query_vector` or `model_id` must be provided, but not both."
            )

        return knn

    def _with_query_vector(self, query: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``kwargs`` that ``knn_search`` can act on.

        When the caller supplied neither ``model_id`` nor ``query_vector``, the
        query is embedded with ``self.embedding`` so that a plain text query
        does not end in the ``ValueError`` raised by ``_default_knn_query``.
        """
        search_kwargs = dict(kwargs)
        if (
            search_kwargs.get("model_id") is None
            and search_kwargs.get("query_vector") is None
        ):
            search_kwargs["query_vector"] = self.embedding.embed_query(query)
        return search_kwargs

    @staticmethod
    def _ensure_page_content_retrievable(
        source: Optional[bool], fields: Any, page_content: Optional[str]
    ) -> None:
        """Raise if ``_source`` is disabled and ``page_content`` is not in ``fields``."""
        if not source and (
            fields is None or not any(page_content in field for field in fields)
        ):
            raise ValueError("If source=False `page_content` field must be in `fields`")

    @staticmethod
    def _hits_to_docs_and_scores(
        response: Any,
        source: Optional[bool],
        fields: Any,
        page_content: Optional[str],
    ) -> List[Tuple[Document, float]]:
        """Convert the hits of a search response into ``(Document, score)`` tuples.

        Page content is read from ``_source`` when ``source`` is truthy and from
        the first value under ``fields`` otherwise. Metadata is the hit's
        ``fields`` entry when ``fields`` were requested, else an empty dict.
        """
        return [
            (
                Document(
                    page_content=hit["_source"][page_content]
                    if source
                    else hit["fields"][page_content][0],
                    metadata=hit["fields"] if fields else {},
                ),
                hit["_score"],
            )
            for hit in response["hits"]["hits"]
        ]

    def similarity_search(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Document]:
        """Return the documents found by `knn_search` for ``query``.

        Args:
            query: Text to look up documents similar to.
            k: Number of nearest neighbours to request. Defaults to 4.
            filter: Accepted for interface compatibility; not used by this
                method.
            **kwargs: Forwarded to `knn_search`. When neither ``model_id`` nor
                ``query_vector`` is present, the query is embedded with the
                store's embedding model.

        Returns:
            List of Documents, without their scores.
        """
        results = self.knn_search(
            query=query, k=k, **self._with_query_vector(query, kwargs)
        )
        return [doc for doc, _ in results]

    def similarity_search_with_score(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Pass through to `knn_search` including score.

        When neither ``model_id`` nor ``query_vector`` is present in
        ``kwargs``, the query is embedded with the store's embedding model.
        """
        return self.knn_search(
            query=query, k=k, **self._with_query_vector(query, kwargs)
        )

    def knn_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
        page_content: Optional[str] = "text",
    ) -> List[Tuple[Document, float]]:
        """
        Perform a k-NN search on the Elasticsearch index.

        Args:
            query (str, optional): The query text to search for.
            k (int, optional): The number of nearest neighbors to return.
            query_vector (List[float], optional): The query vector to search for.
            model_id (str, optional): The ID of the model to use for transforming the
                query text into a vector.
            size (int, optional): The number of search results to return.
            source (bool, optional): Whether to return the source of the search results.
            fields (List[Mapping[str, Any]], optional): The fields to return in the
                search results.
            page_content (str, optional): The name of the field that contains the page
                content.

        Returns:
            A list of tuples, where each tuple contains a Document object and a score.

        Raises:
            ValueError: If ``source`` is falsy and ``page_content`` is not among
                ``fields``, or if the query inputs are rejected by
                ``_default_knn_query``.
        """
        self._ensure_page_content_retrievable(source, fields, page_content)

        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # Perform the kNN search on the Elasticsearch index and return the results.
        response = self.client.search(
            index=self.index_name,
            knn=knn_query_body,
            size=size,
            source=source,
            fields=fields,
        )

        return self._hits_to_docs_and_scores(response, source, fields, page_content)

    def knn_hybrid_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        knn_boost: Optional[float] = 0.9,
        query_boost: Optional[float] = 0.1,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
        page_content: Optional[str] = "text",
    ) -> List[Tuple[Document, float]]:
        """
        Perform a hybrid k-NN and text search on the Elasticsearch index.

        Args:
            query (str, optional): The query text to search for.
            k (int, optional): The number of nearest neighbors to return.
            query_vector (List[float], optional): The query vector to search for.
            model_id (str, optional): The ID of the model to use for transforming the
                query text into a vector.
            size (int, optional): The number of search results to return.
            source (bool, optional): Whether to return the source of the search results.
            knn_boost (float, optional): The boost value to apply to the k-NN search
                results.
            query_boost (float, optional): The boost value to apply to the text search
                results.
            fields (List[Mapping[str, Any]], optional): The fields to return in the
                search results.
            page_content (str, optional): The name of the field that contains the page
                content.

        Returns:
            A list of tuples, where each tuple contains a Document object and a score.

        Raises:
            ValueError: If ``source`` is falsy and ``page_content`` is not among
                ``fields``, or if the query inputs are rejected by
                ``_default_knn_query``.
        """
        self._ensure_page_content_retrievable(source, fields, page_content)

        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # Modify the knn_query_body to add a "boost" parameter
        knn_query_body["boost"] = knn_boost

        # Generate the body of the standard Elasticsearch query
        match_query_body = {
            "match": {self.query_field: {"query": query, "boost": query_boost}}
        }

        # Perform the hybrid search on the Elasticsearch index and return the results.
        response = self.client.search(
            index=self.index_name,
            query=match_query_body,
            knn=knn_query_body,
            fields=fields,
            size=size,
            source=source,
        )

        return self._hits_to_docs_and_scores(response, source, fields, page_content)

    def create_knn_index(self, mapping: Dict) -> None:
        """
        Create a new k-NN index in Elasticsearch.

        Args:
            mapping (Dict): The mapping to use for the new index.

        Returns:
            None
        """

        self.client.indices.create(index=self.index_name, mappings=mapping)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        model_id: Optional[str] = None,
        refresh_indices: bool = False,
        **kwargs: Any,
    ) -> List[str]:
        """
        Add a list of texts to the Elasticsearch index.

        The index is created first when it does not exist yet; in that case the
        ``dims`` keyword argument is required and ``similarity`` is optional.

        Args:
            texts (Iterable[str]): The texts to add to the index.
            metadatas (List[Dict[Any, Any]], optional): Accepted for interface
                compatibility; not written to the index by this method.
            model_id (str, optional): Accepted for interface compatibility; not
                used by this method.
            refresh_indices (bool, optional): Whether to refresh the Elasticsearch
                indices after adding the texts.
            **kwargs: Arbitrary keyword arguments. ``dims`` and ``similarity``
                are read when the index has to be created.

        Returns:
            A list of IDs for the texts Elasticsearch reported as created.

        Raises:
            ValueError: If the index does not exist and ``dims`` is missing.
        """
        # ``texts`` is consumed twice below (embedding, then pairing with the
        # vectors); materialise it so a one-shot iterable is not exhausted.
        texts = list(texts)

        # Check if the index exists.
        if not self.client.indices.exists(index=self.index_name):
            dims = kwargs.get("dims")

            if dims is None:
                raise ValueError("ElasticKnnSearch requires 'dims' parameter")

            similarity = kwargs.get("similarity")
            optional_args = {}

            if similarity is not None:
                optional_args["similarity"] = similarity

            mapping = self._default_knn_mapping(dims=dims, **optional_args)
            self.create_knn_index(mapping)

        # Nothing to index: skip the embedding call and the empty bulk request.
        if not texts:
            return []

        embeddings = self.embedding.embed_documents(texts)

        body: List[Mapping[str, Any]] = []
        for text, vector in zip(texts, embeddings):
            body.extend(
                [
                    {"index": {"_index": self.index_name}},
                    {"text": text, "vector": vector},
                ]
            )

        responses = self.client.bulk(operations=body)

        # Failed items carry an ``error`` entry instead of ``result``, so read
        # the key defensively rather than raising KeyError on partial failure.
        ids = [
            item["index"]["_id"]
            for item in responses["items"]
            if item["index"].get("result") == "created"
        ]

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)

        return ids

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        **kwargs: Any,
    ) -> ElasticKnnSearch:
        """
        Create a new ElasticKnnSearch instance and add a list of texts to the
        Elasticsearch index.

        Args:
            texts (List[str]): The texts to add to the index.
            embedding (Embeddings): The embedding model to use for transforming the
                texts into vectors.
            metadatas (List[Dict[Any, Any]], optional): A list of metadata dictionaries
                to associate with the texts; forwarded to ``add_texts``.
            **kwargs: Arbitrary keyword arguments. Recognised keys are
                ``index_name`` (random UUID when absent), ``es_connection``,
                ``es_cloud_id``, ``es_user``, ``es_password``,
                ``vector_query_field``, ``query_field``, ``model_id``,
                ``dims`` (required) and ``similarity``.

        Returns:
            A new ElasticKnnSearch instance.

        Raises:
            ValueError: If ``dims`` is not supplied.
        """

        index_name = kwargs.get("index_name", str(uuid.uuid4()))
        es_connection = kwargs.get("es_connection")
        es_cloud_id = kwargs.get("es_cloud_id")
        es_user = kwargs.get("es_user")
        es_password = kwargs.get("es_password")
        vector_query_field = kwargs.get("vector_query_field", "vector")
        query_field = kwargs.get("query_field", "text")
        model_id = kwargs.get("model_id")
        dims = kwargs.get("dims")

        if dims is None:
            raise ValueError("ElasticKnnSearch requires 'dims' parameter")

        optional_args = {}

        if vector_query_field is not None:
            optional_args["vector_query_field"] = vector_query_field

        if query_field is not None:
            optional_args["query_field"] = query_field

        knnvectorsearch = cls(
            index_name=index_name,
            embedding=embedding,
            es_connection=es_connection,
            es_cloud_id=es_cloud_id,
            es_user=es_user,
            es_password=es_password,
            **optional_args,
        )

        # ``similarity`` only matters when add_texts creates the index, but it
        # has to be forwarded for the caller's choice to take effect at all.
        add_kwargs: Dict[str, Any] = dict(optional_args)
        similarity = kwargs.get("similarity")
        if similarity is not None:
            add_kwargs["similarity"] = similarity

        # Encode the provided texts and add them to the newly created index.
        knnvectorsearch.add_texts(
            texts, metadatas=metadatas, model_id=model_id, dims=dims, **add_kwargs
        )

        return knnvectorsearch
|