mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
312 lines
10 KiB
Python
312 lines
10 KiB
Python
|
from __future__ import annotations
|
||
|
|
||
|
import uuid
|
||
|
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type
|
||
|
|
||
|
from langchain_core.documents import Document
|
||
|
from langchain_core.embeddings import Embeddings
|
||
|
from langchain_core.utils import get_from_env
|
||
|
from langchain_core.vectorstores import VectorStore
|
||
|
|
||
|
if TYPE_CHECKING:
|
||
|
from meilisearch import Client
|
||
|
|
||
|
|
||
|
def _create_client(
|
||
|
client: Optional[Client] = None,
|
||
|
url: Optional[str] = None,
|
||
|
api_key: Optional[str] = None,
|
||
|
) -> Client:
|
||
|
try:
|
||
|
import meilisearch
|
||
|
except ImportError:
|
||
|
raise ImportError(
|
||
|
"Could not import meilisearch python package. "
|
||
|
"Please install it with `pip install meilisearch`."
|
||
|
)
|
||
|
if not client:
|
||
|
url = url or get_from_env("url", "MEILI_HTTP_ADDR")
|
||
|
try:
|
||
|
api_key = api_key or get_from_env("api_key", "MEILI_MASTER_KEY")
|
||
|
except Exception:
|
||
|
pass
|
||
|
client = meilisearch.Client(url=url, api_key=api_key)
|
||
|
elif not isinstance(client, meilisearch.Client):
|
||
|
raise ValueError(
|
||
|
f"client should be an instance of meilisearch.Client, "
|
||
|
f"got {type(client)}"
|
||
|
)
|
||
|
try:
|
||
|
client.version()
|
||
|
except ValueError as e:
|
||
|
raise ValueError(f"Failed to connect to Meilisearch: {e}")
|
||
|
return client
|
||
|
|
||
|
|
||
|
class Meilisearch(VectorStore):
|
||
|
"""`Meilisearch` vector store.
|
||
|
|
||
|
To use this, you need to have `meilisearch` python package installed,
|
||
|
and a running Meilisearch instance.
|
||
|
|
||
|
To learn more about Meilisearch Python, refer to the in-depth
|
||
|
Meilisearch Python documentation: https://meilisearch.github.io/meilisearch-python/.
|
||
|
|
||
|
See the following documentation for how to run a Meilisearch instance:
|
||
|
https://www.meilisearch.com/docs/learn/getting_started/quick_start.
|
||
|
|
||
|
Example:
|
||
|
.. code-block:: python
|
||
|
|
||
|
from langchain_community.vectorstores import Meilisearch
|
||
|
from langchain_community.embeddings.openai import OpenAIEmbeddings
|
||
|
import meilisearch
|
||
|
|
||
|
# api_key is optional; provide it if your meilisearch instance requires it
|
||
|
client = meilisearch.Client(url='http://127.0.0.1:7700', api_key='***')
|
||
|
embeddings = OpenAIEmbeddings()
|
||
|
vectorstore = Meilisearch(
|
||
|
embedding=embeddings,
|
||
|
client=client,
|
||
|
index_name='langchain_demo',
|
||
|
text_key='text')
|
||
|
"""
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
embedding: Embeddings,
|
||
|
client: Optional[Client] = None,
|
||
|
url: Optional[str] = None,
|
||
|
api_key: Optional[str] = None,
|
||
|
index_name: str = "langchain-demo",
|
||
|
text_key: str = "text",
|
||
|
metadata_key: str = "metadata",
|
||
|
):
|
||
|
"""Initialize with Meilisearch client."""
|
||
|
client = _create_client(client=client, url=url, api_key=api_key)
|
||
|
|
||
|
self._client = client
|
||
|
self._index_name = index_name
|
||
|
self._embedding = embedding
|
||
|
self._text_key = text_key
|
||
|
self._metadata_key = metadata_key
|
||
|
|
||
|
def add_texts(
|
||
|
self,
|
||
|
texts: Iterable[str],
|
||
|
metadatas: Optional[List[dict]] = None,
|
||
|
ids: Optional[List[str]] = None,
|
||
|
**kwargs: Any,
|
||
|
) -> List[str]:
|
||
|
"""Run more texts through the embedding and add them to the vector store.
|
||
|
|
||
|
Args:
|
||
|
texts (Iterable[str]): Iterable of strings/text to add to the vectorstore.
|
||
|
metadatas (Optional[List[dict]]): Optional list of metadata.
|
||
|
Defaults to None.
|
||
|
ids Optional[List[str]]: Optional list of IDs.
|
||
|
Defaults to None.
|
||
|
|
||
|
Returns:
|
||
|
List[str]: List of IDs of the texts added to the vectorstore.
|
||
|
"""
|
||
|
texts = list(texts)
|
||
|
|
||
|
# Embed and create the documents
|
||
|
docs = []
|
||
|
if ids is None:
|
||
|
ids = [uuid.uuid4().hex for _ in texts]
|
||
|
if metadatas is None:
|
||
|
metadatas = [{} for _ in texts]
|
||
|
embedding_vectors = self._embedding.embed_documents(texts)
|
||
|
|
||
|
for i, text in enumerate(texts):
|
||
|
id = ids[i]
|
||
|
metadata = metadatas[i]
|
||
|
metadata[self._text_key] = text
|
||
|
embedding = embedding_vectors[i]
|
||
|
docs.append(
|
||
|
{
|
||
|
"id": id,
|
||
|
"_vectors": embedding,
|
||
|
f"{self._metadata_key}": metadata,
|
||
|
}
|
||
|
)
|
||
|
|
||
|
# Send to Meilisearch
|
||
|
self._client.index(str(self._index_name)).add_documents(docs)
|
||
|
return ids
|
||
|
|
||
|
def similarity_search(
|
||
|
self,
|
||
|
query: str,
|
||
|
k: int = 4,
|
||
|
filter: Optional[Dict[str, str]] = None,
|
||
|
**kwargs: Any,
|
||
|
) -> List[Document]:
|
||
|
"""Return meilisearch documents most similar to the query.
|
||
|
|
||
|
Args:
|
||
|
query (str): Query text for which to find similar documents.
|
||
|
k (int): Number of documents to return. Defaults to 4.
|
||
|
filter (Optional[Dict[str, str]]): Filter by metadata.
|
||
|
Defaults to None.
|
||
|
|
||
|
Returns:
|
||
|
List[Document]: List of Documents most similar to the query
|
||
|
text and score for each.
|
||
|
"""
|
||
|
docs_and_scores = self.similarity_search_with_score(
|
||
|
query=query,
|
||
|
k=k,
|
||
|
filter=filter,
|
||
|
kwargs=kwargs,
|
||
|
)
|
||
|
return [doc for doc, _ in docs_and_scores]
|
||
|
|
||
|
def similarity_search_with_score(
|
||
|
self,
|
||
|
query: str,
|
||
|
k: int = 4,
|
||
|
filter: Optional[Dict[str, str]] = None,
|
||
|
**kwargs: Any,
|
||
|
) -> List[Tuple[Document, float]]:
|
||
|
"""Return meilisearch documents most similar to the query, along with scores.
|
||
|
|
||
|
Args:
|
||
|
query (str): Query text for which to find similar documents.
|
||
|
k (int): Number of documents to return. Defaults to 4.
|
||
|
filter (Optional[Dict[str, str]]): Filter by metadata.
|
||
|
Defaults to None.
|
||
|
|
||
|
Returns:
|
||
|
List[Document]: List of Documents most similar to the query
|
||
|
text and score for each.
|
||
|
"""
|
||
|
_query = self._embedding.embed_query(query)
|
||
|
|
||
|
docs = self.similarity_search_by_vector_with_scores(
|
||
|
embedding=_query,
|
||
|
k=k,
|
||
|
filter=filter,
|
||
|
kwargs=kwargs,
|
||
|
)
|
||
|
return docs
|
||
|
|
||
|
def similarity_search_by_vector_with_scores(
|
||
|
self,
|
||
|
embedding: List[float],
|
||
|
k: int = 4,
|
||
|
filter: Optional[Dict[str, Any]] = None,
|
||
|
**kwargs: Any,
|
||
|
) -> List[Tuple[Document, float]]:
|
||
|
"""Return meilisearch documents most similar to embedding vector.
|
||
|
|
||
|
Args:
|
||
|
embedding (List[float]): Embedding to look up similar documents.
|
||
|
k (int): Number of documents to return. Defaults to 4.
|
||
|
filter (Optional[Dict[str, str]]): Filter by metadata.
|
||
|
Defaults to None.
|
||
|
|
||
|
Returns:
|
||
|
List[Document]: List of Documents most similar to the query
|
||
|
vector and score for each.
|
||
|
"""
|
||
|
docs = []
|
||
|
results = self._client.index(str(self._index_name)).search(
|
||
|
"", {"vector": embedding, "limit": k, "filter": filter}
|
||
|
)
|
||
|
|
||
|
for result in results["hits"]:
|
||
|
metadata = result[self._metadata_key]
|
||
|
if self._text_key in metadata:
|
||
|
text = metadata.pop(self._text_key)
|
||
|
semantic_score = result["_semanticScore"]
|
||
|
docs.append(
|
||
|
(Document(page_content=text, metadata=metadata), semantic_score)
|
||
|
)
|
||
|
|
||
|
return docs
|
||
|
|
||
|
def similarity_search_by_vector(
|
||
|
self,
|
||
|
embedding: List[float],
|
||
|
k: int = 4,
|
||
|
filter: Optional[Dict[str, str]] = None,
|
||
|
**kwargs: Any,
|
||
|
) -> List[Document]:
|
||
|
"""Return meilisearch documents most similar to embedding vector.
|
||
|
|
||
|
Args:
|
||
|
embedding (List[float]): Embedding to look up similar documents.
|
||
|
k (int): Number of documents to return. Defaults to 4.
|
||
|
filter (Optional[Dict[str, str]]): Filter by metadata.
|
||
|
Defaults to None.
|
||
|
|
||
|
Returns:
|
||
|
List[Document]: List of Documents most similar to the query
|
||
|
vector and score for each.
|
||
|
"""
|
||
|
docs = self.similarity_search_by_vector_with_scores(
|
||
|
embedding=embedding,
|
||
|
k=k,
|
||
|
filter=filter,
|
||
|
kwargs=kwargs,
|
||
|
)
|
||
|
return [doc for doc, _ in docs]
|
||
|
|
||
|
@classmethod
|
||
|
def from_texts(
|
||
|
cls: Type[Meilisearch],
|
||
|
texts: List[str],
|
||
|
embedding: Embeddings,
|
||
|
metadatas: Optional[List[dict]] = None,
|
||
|
client: Optional[Client] = None,
|
||
|
url: Optional[str] = None,
|
||
|
api_key: Optional[str] = None,
|
||
|
index_name: str = "langchain-demo",
|
||
|
ids: Optional[List[str]] = None,
|
||
|
text_key: Optional[str] = "text",
|
||
|
metadata_key: Optional[str] = "metadata",
|
||
|
**kwargs: Any,
|
||
|
) -> Meilisearch:
|
||
|
"""Construct Meilisearch wrapper from raw documents.
|
||
|
|
||
|
This is a user-friendly interface that:
|
||
|
1. Embeds documents.
|
||
|
2. Adds the documents to a provided Meilisearch index.
|
||
|
|
||
|
This is intended to be a quick way to get started.
|
||
|
|
||
|
Example:
|
||
|
.. code-block:: python
|
||
|
|
||
|
from langchain_community.vectorstores import Meilisearch
|
||
|
from langchain_community.embeddings import OpenAIEmbeddings
|
||
|
import meilisearch
|
||
|
|
||
|
# The environment should be the one specified next to the API key
|
||
|
# in your Meilisearch console
|
||
|
client = meilisearch.Client(url='http://127.0.0.1:7700', api_key='***')
|
||
|
embeddings = OpenAIEmbeddings()
|
||
|
docsearch = Meilisearch.from_texts(
|
||
|
client=client,
|
||
|
embeddings=embeddings,
|
||
|
)
|
||
|
"""
|
||
|
client = _create_client(client=client, url=url, api_key=api_key)
|
||
|
|
||
|
vectorstore = cls(
|
||
|
embedding=embedding,
|
||
|
client=client,
|
||
|
index_name=index_name,
|
||
|
)
|
||
|
vectorstore.add_texts(
|
||
|
texts=texts,
|
||
|
metadatas=metadatas,
|
||
|
ids=ids,
|
||
|
text_key=text_key,
|
||
|
metadata_key=metadata_key,
|
||
|
)
|
||
|
return vectorstore
|