forked from Archives/langchain
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
211 lines
7.2 KiB
Python
211 lines
7.2 KiB
Python
"""Wrapper around weaviate vector database."""
|
|
from __future__ import annotations
|
|
|
|
from typing import Any, Dict, Iterable, List, Optional, Type
|
|
from uuid import uuid4
|
|
|
|
from langchain.docstore.document import Document
|
|
from langchain.embeddings.base import Embeddings
|
|
from langchain.utils import get_from_dict_or_env
|
|
from langchain.vectorstores.base import VectorStore
|
|
|
|
|
|
def _default_schema(index_name: str) -> Dict:
|
|
return {
|
|
"class": index_name,
|
|
"properties": [
|
|
{
|
|
"name": "text",
|
|
"dataType": ["text"],
|
|
}
|
|
],
|
|
}
|
|
|
|
|
|
class Weaviate(VectorStore):
    """Wrapper around Weaviate vector database.

    To use, you should have the ``weaviate-client`` python package installed.

    Example:
        .. code-block:: python

            import weaviate
            from langchain.vectorstores import Weaviate
            client = weaviate.Client(url=os.environ["WEAVIATE_URL"], ...)
            weaviate = Weaviate(client, index_name, text_key)

    """

    def __init__(
        self,
        client: Any,
        index_name: str,
        text_key: str,
        attributes: Optional[List[str]] = None,
    ):
        """Initialize with Weaviate client.

        Args:
            client: A ``weaviate.Client`` instance.
            index_name: Class name used for every query and insert.
            text_key: Property that holds the document text.
            attributes: Additional properties to request in queries, on top
                of ``text_key``.

        Raises:
            ValueError: If the ``weaviate`` package cannot be imported, or if
                ``client`` is not a ``weaviate.Client``.
        """
        try:
            import weaviate
        except ImportError:
            raise ValueError(
                "Could not import weaviate python package. "
                "Please install it with `pip install weaviate-client`."
            )
        if not isinstance(client, weaviate.Client):
            raise ValueError(
                f"client should be an instance of weaviate.Client, got {type(client)}"
            )
        self._client = client
        self._index_name = index_name
        self._text_key = text_key
        # The text property is always queried; extra attributes are appended.
        self._query_attrs = [self._text_key]
        if attributes is not None:
            self._query_attrs.extend(attributes)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts with metadata (properties) to Weaviate.

        Args:
            texts: Texts to store under ``text_key``.
            metadatas: Optional per-text dicts, indexed in step with
                ``texts``; their entries are stored as extra properties.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The generated object ids, in the same order as ``texts``.
        """
        from weaviate.util import get_valid_uuid

        ids: List[str] = []
        with self._client.batch as batch:
            for i, doc in enumerate(texts):
                data_properties = {self._text_key: doc}
                if metadatas is not None:
                    data_properties.update(metadatas[i])

                _id = get_valid_uuid(uuid4())
                batch.add_data_object(
                    data_object=data_properties,
                    class_name=self._index_name,
                    uuid=_id,
                )
                ids.append(_id)
        return ids

    def _docs_from_result(self, result: Dict[str, Any]) -> List[Document]:
        """Turn a raw query response into a list of Documents.

        Raises:
            ValueError: If the response contains an ``"errors"`` key.
        """
        if "errors" in result:
            raise ValueError(f"Error during query: {result['errors']}")
        docs = []
        for res in result["data"]["Get"][self._index_name]:
            # The text becomes page_content; what remains is the metadata.
            text = res.pop(self._text_key)
            docs.append(Document(page_content=text, metadata=res))
        return docs

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            **kwargs: If a truthy ``search_distance`` is given, it is sent as
                the ``certainty`` of the near-text filter.

        Returns:
            List of Documents most similar to the query.

        Raises:
            ValueError: If the query response reports errors.
        """
        content: Dict[str, Any] = {"concepts": [query]}
        search_distance = kwargs.get("search_distance")
        if search_distance:
            content["certainty"] = search_distance
        query_obj = self._client.query.get(self._index_name, self._query_attrs)
        result = query_obj.with_near_text(content).with_limit(k).do()
        return self._docs_from_result(result)

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Look up similar documents by embedding vector in Weaviate.

        Args:
            embedding: Query vector.
            k: Number of Documents to return. Defaults to 4.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            List of Documents returned by the near-vector query.

        Raises:
            ValueError: If the query response reports errors.
        """
        vector = {"vector": embedding}
        query_obj = self._client.query.get(self._index_name, self._query_attrs)
        result = query_obj.with_near_vector(vector).with_limit(k).do()
        return self._docs_from_result(result)

    @classmethod
    def from_texts(
        cls: Type[Weaviate],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> Weaviate:
        """Construct Weaviate wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Creates a new index for the embeddings in the Weaviate instance.
            3. Adds the documents to the newly created Weaviate index.

        This is intended to be a quick way to get started.

        Args:
            texts: Texts to store.
            embedding: Embedding model; when falsy, no vector is sent with
                the objects.
            metadatas: Optional per-text dicts stored as extra properties.
                The keys of the first dict become the query attributes of
                the returned wrapper.
            **kwargs: ``weaviate_url`` (or the ``WEAVIATE_URL`` environment
                variable) selects the instance; ``index_name`` overrides the
                generated ``LangChain_<hex>`` class name.

        Raises:
            ValueError: If the ``weaviate`` package cannot be imported.

        Example:
            .. code-block:: python

                from langchain.vectorstores.weaviate import Weaviate
                from langchain.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                weaviate = Weaviate.from_texts(
                    texts,
                    embeddings,
                    weaviate_url="http://localhost:8080"
                )
        """
        weaviate_url = get_from_dict_or_env(kwargs, "weaviate_url", "WEAVIATE_URL")

        try:
            from weaviate import Client
            from weaviate.util import get_valid_uuid
        except ImportError:
            raise ValueError(
                "Could not import weaviate python package. "
                "Please install it with `pip install weaviate-client`."
            )

        client = Client(weaviate_url)
        index_name = kwargs.get("index_name", f"LangChain_{uuid4().hex}")
        embeddings = embedding.embed_documents(texts) if embedding else None
        text_key = "text"
        schema = _default_schema(index_name)
        attributes = list(metadatas[0].keys()) if metadatas else None

        # check whether the index already exists
        if not client.schema.contains(schema):
            client.schema.create_class(schema)

        with client.batch as batch:
            for i, text in enumerate(texts):
                data_properties = {text_key: text}
                if metadatas is not None:
                    data_properties.update(metadatas[i])

                _id = get_valid_uuid(uuid4())

                # if an embedding strategy is not provided, we let
                # weaviate create the embedding. Note that this will only
                # work if weaviate has been installed with a vectorizer module
                # like text2vec-contextionary for example
                params: Dict[str, Any] = {
                    "uuid": _id,
                    "data_object": data_properties,
                    "class_name": index_name,
                }
                if embeddings is not None:
                    # Pass the vector itself; wrapping it in a tuple would
                    # send a one-element sequence containing the vector.
                    params["vector"] = embeddings[i]

                batch.add_data_object(**params)

            batch.flush()

        return cls(client, index_name, text_key, attributes)
|