Harrison/weaviate fixes (#2872)

Co-authored-by: cs0lar <cristiano.solarino@gmail.com>
Co-authored-by: cs0lar <cristiano.solarino@brightminded.com>
This commit is contained in:
Harrison Chase 2023-04-13 22:37:34 -07:00 committed by GitHub
parent 07d7096de6
commit 1e9378d0a8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 178 additions and 12 deletions

View File

@ -6,9 +6,22 @@ from uuid import uuid4
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.utils import get_from_dict_or_env
from langchain.vectorstores.base import VectorStore
def _default_schema(index_name: str) -> Dict:
return {
"class": index_name,
"properties": [
{
"name": "text",
"dataType": ["text"],
}
],
}
class Weaviate(VectorStore):
"""Wrapper around Weaviate vector database.
@ -70,14 +83,24 @@ class Weaviate(VectorStore):
data_properties[key] = metadatas[i][key]
_id = get_valid_uuid(uuid4())
batch.add_data_object(data_properties, self._index_name, _id)
batch.add_data_object(
data_object=data_properties, class_name=self._index_name, uuid=_id
)
ids.append(_id)
return ids
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
"""Look up similar documents in weaviate."""
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents most similar to the query.
"""
content: Dict[str, Any] = {"concepts": [query]}
if kwargs.get("search_distance"):
content["certainty"] = kwargs.get("search_distance")
@ -114,5 +137,74 @@ class Weaviate(VectorStore):
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> Weaviate:
"""Not implemented for Weaviate yet."""
raise NotImplementedError("weaviate does not currently support `from_texts`.")
"""Construct Weaviate wrapper from raw documents.
This is a user-friendly interface that:
1. Embeds documents.
2. Creates a new index for the embeddings in the Weaviate instance.
3. Adds the documents to the newly created Weaviate index.
This is intended to be a quick way to get started.
Example:
.. code-block:: python
from langchain.vectorstores.weaviate import Weaviate
from langchain.embeddings import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
weaviate = Weaviate.from_texts(
texts,
embeddings,
weaviate_url="http://localhost:8080"
)
"""
weaviate_url = get_from_dict_or_env(kwargs, "weaviate_url", "WEAVIATE_URL")
try:
from weaviate import Client
from weaviate.util import get_valid_uuid
except ImportError:
raise ValueError(
"Could not import weaviate python package. "
"Please install it with `pip instal weaviate-client`"
)
client = Client(weaviate_url)
index_name = kwargs.get("index_name", f"LangChain_{uuid4().hex}")
embeddings = embedding.embed_documents(texts) if embedding else None
text_key = "text"
schema = _default_schema(index_name)
attributes = list(metadatas[0].keys()) if metadatas else None
# check whether the index already exists
if not client.schema.contains(schema):
client.schema.create_class(schema)
with client.batch as batch:
for i, text in enumerate(texts):
data_properties = {
text_key: text,
}
if metadatas is not None:
for key in metadatas[i].keys():
data_properties[key] = metadatas[i][key]
_id = get_valid_uuid(uuid4())
# if an embedding strategy is not provided, we let
# weaviate create the embedding. Note that this will only
# work if weaviate has been installed with a vectorizer module
# like text2vec-contextionary for example
params = {
"uuid": _id,
"data_object": data_properties,
"class_name": index_name,
}
if embeddings is not None:
params["vector"] = (embeddings[i],)
batch.add_data_object(**params)
batch.flush()
return cls(client, index_name, text_key, attributes)

16
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
# This file is automatically @generated by Poetry and should not be changed by hand.
[[package]]
name = "absl-py"
@ -499,7 +499,7 @@ name = "authlib"
version = "1.2.0"
description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients."
category = "main"
optional = true
optional = false
python-versions = "*"
files = [
{file = "Authlib-1.2.0-py2.py3-none-any.whl", hash = "sha256:4ddf4fd6cfa75c9a460b361d4bd9dac71ffda0be879dbe4292a02e92349ad55a"},
@ -7258,7 +7258,7 @@ files = [
]
[package.dependencies]
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
[package.extras]
aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
@ -8360,7 +8360,7 @@ name = "validators"
version = "0.20.0"
description = "Python Data Validation for Humans™."
category = "main"
optional = true
optional = false
python-versions = ">=3.4"
files = [
{file = "validators-0.20.0.tar.gz", hash = "sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a"},
@ -8497,7 +8497,7 @@ name = "weaviate-client"
version = "3.15.5"
description = "A python native weaviate client"
category = "main"
optional = true
optional = false
python-versions = ">=3.7"
files = [
{file = "weaviate-client-3.15.5.tar.gz", hash = "sha256:6da7e5d08dc9bb8b7879661d1a457c50af7d73e621a5305efe131160e83da69e"},
@ -9026,13 +9026,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
all = ["aleph-alpha-client", "anthropic", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm"]
cohere = ["cohere"]
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
openai = ["openai"]
qdrant = ["qdrant-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "3c8488864a754852fdec3e56dd5630ed73852ec2120a94cfe22537c075901b24"
content-hash = "373f68ef16e7f3d5d9cde8b81c5f261096cc537ddca4f6a36711d7215b63f226"

View File

@ -101,6 +101,7 @@ pgvector = "^0.1.6"
transformers = "^4.27.4"
pandas = "^2.0.0"
deeplake = "^3.2.21"
weaviate-client = "^3.15.5"
torch = "^1.0.0"
chromadb = "^0.3.21"
tiktoken = "^0.3.3"

View File

@ -0,0 +1,22 @@
# Docker Compose definition for a single Weaviate service.
version: '3.4'
services:
  weaviate:
    # Arguments passed to the container: bind to all interfaces on port 8080
    # using the http scheme.
    command:
      - --host
      - 0.0.0.0
      - --port
      - '8080'
      - --scheme
      - http
    image: semitechnologies/weaviate:1.18.2
    ports:
      # Publish container port 8080 on host port 8080.
      - 8080:8080
    restart: on-failure:0
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      # Anonymous access is switched on.
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      # No default vectorizer module is configured and no modules are enabled.
      DEFAULT_VECTORIZER_MODULE: 'none'
      ENABLE_MODULES: ''
      CLUSTER_HOSTNAME: 'node1'

View File

@ -0,0 +1,51 @@
"""Test Weaviate functionality."""
import logging
from typing import Generator, Union
import pytest
from weaviate import Client
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.weaviate import Weaviate
logging.basicConfig(level=logging.DEBUG)
"""
cd tests/integration_tests/vectorstores/docker-compose
docker compose -f weaviate.yml up
"""
class TestWeaviate:
    """End-to-end tests for the Weaviate vector store wrapper.

    The tests connect to a Weaviate instance at ``http://localhost:8080``.
    """

    @pytest.fixture(scope="class", autouse=True)
    def weaviate_url(self) -> Union[str, Generator[str, None, None]]:
        """Yield the Weaviate URL, then delete the schema during teardown."""
        endpoint = "http://localhost:8080"
        yield endpoint
        # Teardown: clear the test index by deleting the whole schema.
        Client(endpoint).schema.delete_all()

    def test_similarity_search_without_metadata(self, weaviate_url: str) -> None:
        """Build a store from plain texts and check the top search hit."""
        store = Weaviate.from_texts(
            ["foo", "bar", "baz"],
            OpenAIEmbeddings(),
            weaviate_url=weaviate_url,
        )
        hits = store.similarity_search("foo", k=1)
        assert hits == [Document(page_content="foo")]

    def test_similarity_search_with_metadata(self, weaviate_url: str) -> None:
        """Build a store with per-text metadata and check the top search hit."""
        corpus = ["foo", "bar", "baz"]
        # One metadata dict per text, recording its position as the page.
        page_info = [{"page": position} for position, _ in enumerate(corpus)]
        store = Weaviate.from_texts(
            corpus,
            OpenAIEmbeddings(),
            metadatas=page_info,
            weaviate_url=weaviate_url,
        )
        hits = store.similarity_search("foo", k=1)
        assert hits == [Document(page_content="foo", metadata={"page": 0})]