diff --git a/langchain/vectorstores/weaviate.py b/langchain/vectorstores/weaviate.py index 011dc470..d5d8c134 100644 --- a/langchain/vectorstores/weaviate.py +++ b/langchain/vectorstores/weaviate.py @@ -6,9 +6,22 @@ from uuid import uuid4 from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings +from langchain.utils import get_from_dict_or_env from langchain.vectorstores.base import VectorStore +def _default_schema(index_name: str) -> Dict: + return { + "class": index_name, + "properties": [ + { + "name": "text", + "dataType": ["text"], + } + ], + } + + class Weaviate(VectorStore): """Wrapper around Weaviate vector database. @@ -70,14 +83,24 @@ class Weaviate(VectorStore): data_properties[key] = metadatas[i][key] _id = get_valid_uuid(uuid4()) - batch.add_data_object(data_properties, self._index_name, _id) + batch.add_data_object( + data_object=data_properties, class_name=self._index_name, uuid=_id + ) ids.append(_id) return ids def similarity_search( self, query: str, k: int = 4, **kwargs: Any ) -> List[Document]: - """Look up similar documents in weaviate.""" + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query. + """ content: Dict[str, Any] = {"concepts": [query]} if kwargs.get("search_distance"): content["certainty"] = kwargs.get("search_distance") @@ -114,5 +137,74 @@ class Weaviate(VectorStore): metadatas: Optional[List[dict]] = None, **kwargs: Any, ) -> Weaviate: - """Not implemented for Weaviate yet.""" - raise NotImplementedError("weaviate does not currently support `from_texts`.") + """Construct Weaviate wrapper from raw documents. + + This is a user-friendly interface that: + 1. Embeds documents. + 2. Creates a new index for the embeddings in the Weaviate instance. + 3. Adds the documents to the newly created Weaviate index. 
+
+        This is intended to be a quick way to get started.
+
+        Example:
+            .. code-block:: python
+
+                from langchain.vectorstores.weaviate import Weaviate
+                from langchain.embeddings import OpenAIEmbeddings
+                embeddings = OpenAIEmbeddings()
+                weaviate = Weaviate.from_texts(
+                    texts,
+                    embeddings,
+                    weaviate_url="http://localhost:8080"
+                )
+        """
+        weaviate_url = get_from_dict_or_env(kwargs, "weaviate_url", "WEAVIATE_URL")
+
+        try:
+            from weaviate import Client
+            from weaviate.util import get_valid_uuid
+        except ImportError:
+            raise ValueError(
+                "Could not import weaviate python package. "
+                "Please install it with `pip install weaviate-client`"
+            )
+
+        client = Client(weaviate_url)
+        index_name = kwargs.get("index_name", f"LangChain_{uuid4().hex}")
+        embeddings = embedding.embed_documents(texts) if embedding else None
+        text_key = "text"
+        schema = _default_schema(index_name)
+        attributes = list(metadatas[0].keys()) if metadatas else None
+
+        # check whether the index already exists
+        if not client.schema.contains(schema):
+            client.schema.create_class(schema)
+
+        with client.batch as batch:
+            for i, text in enumerate(texts):
+                data_properties = {
+                    text_key: text,
+                }
+                if metadatas is not None:
+                    for key in metadatas[i].keys():
+                        data_properties[key] = metadatas[i][key]
+
+                _id = get_valid_uuid(uuid4())
+
+                # if an embedding strategy is not provided, we let
+                # weaviate create the embedding. 
Note that this will only
+                # work if weaviate has been installed with a vectorizer module
+                # like text2vec-contextionary for example
+                params = {
+                    "uuid": _id,
+                    "data_object": data_properties,
+                    "class_name": index_name,
+                }
+                if embeddings is not None:
+                    params["vector"] = embeddings[i]
+
+                batch.add_data_object(**params)
+
+            batch.flush()
+
+        return cls(client, index_name, text_key, attributes)
diff --git a/poetry.lock b/poetry.lock
index 434a6bcc..7a88ee2a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -499,7 +499,7 @@ name = "authlib"
 version = "1.2.0"
 description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients."
 category = "main"
-optional = true
+optional = false
 python-versions = "*"
 files = [
     {file = "Authlib-1.2.0-py2.py3-none-any.whl", hash = "sha256:4ddf4fd6cfa75c9a460b361d4bd9dac71ffda0be879dbe4292a02e92349ad55a"},
@@ -7258,7 +7258,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
+greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
 
 [package.extras]
 aiomysql = ["aiomysql", 
"greenlet (!=0.4.17)"] @@ -8360,7 +8360,7 @@ name = "validators" version = "0.20.0" description = "Python Data Validation for Humans™." category = "main" -optional = true +optional = false python-versions = ">=3.4" files = [ {file = "validators-0.20.0.tar.gz", hash = "sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a"}, @@ -8497,7 +8497,7 @@ name = "weaviate-client" version = "3.15.5" description = "A python native weaviate client" category = "main" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "weaviate-client-3.15.5.tar.gz", hash = "sha256:6da7e5d08dc9bb8b7879661d1a457c50af7d73e621a5305efe131160e83da69e"}, @@ -9026,13 +9026,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["aleph-alpha-client", "anthropic", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm"] cohere = ["cohere"] -llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", 
"transformers"] +llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "3c8488864a754852fdec3e56dd5630ed73852ec2120a94cfe22537c075901b24" +content-hash = "373f68ef16e7f3d5d9cde8b81c5f261096cc537ddca4f6a36711d7215b63f226" diff --git a/pyproject.toml b/pyproject.toml index d3f34c3c..3cc2d497 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,6 +101,7 @@ pgvector = "^0.1.6" transformers = "^4.27.4" pandas = "^2.0.0" deeplake = "^3.2.21" +weaviate-client = "^3.15.5" torch = "^1.0.0" chromadb = "^0.3.21" tiktoken = "^0.3.3" diff --git a/tests/integration_tests/vectorstores/docker-compose/weaviate.yml b/tests/integration_tests/vectorstores/docker-compose/weaviate.yml new file mode 100644 index 00000000..a1911480 --- /dev/null +++ b/tests/integration_tests/vectorstores/docker-compose/weaviate.yml @@ -0,0 +1,22 @@ +version: '3.4' + +services: + weaviate: + command: + - --host + - 0.0.0.0 + - --port + - '8080' + - --scheme + - http + image: semitechnologies/weaviate:1.18.2 + ports: + - 8080:8080 + restart: on-failure:0 + environment: + QUERY_DEFAULTS_LIMIT: 25 + AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' + PERSISTENCE_DATA_PATH: '/var/lib/weaviate' + DEFAULT_VECTORIZER_MODULE: 'none' + ENABLE_MODULES: '' + CLUSTER_HOSTNAME: 'node1' diff --git a/tests/integration_tests/vectorstores/test_weaviate.py b/tests/integration_tests/vectorstores/test_weaviate.py new file mode 100644 index 00000000..5699ecea --- /dev/null +++ b/tests/integration_tests/vectorstores/test_weaviate.py @@ -0,0 +1,51 @@ +"""Test Weaviate functionality.""" +import logging +from typing import Generator, Union + +import pytest +from weaviate import Client + +from langchain.docstore.document import Document +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.vectorstores.weaviate import Weaviate 
+ +logging.basicConfig(level=logging.DEBUG) + +""" +cd tests/integration_tests/vectorstores/docker-compose +docker compose -f weaviate.yml up +""" + + +class TestWeaviate: + @pytest.fixture(scope="class", autouse=True) + def weaviate_url(self) -> Union[str, Generator[str, None, None]]: + """Return the weaviate url.""" + url = "http://localhost:8080" + yield url + + # Clear the test index + client = Client(url) + client.schema.delete_all() + + def test_similarity_search_without_metadata(self, weaviate_url: str) -> None: + """Test end to end construction and search without metadata.""" + texts = ["foo", "bar", "baz"] + docsearch = Weaviate.from_texts( + texts, + OpenAIEmbeddings(), + weaviate_url=weaviate_url, + ) + + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + def test_similarity_search_with_metadata(self, weaviate_url: str) -> None: + """Test end to end construction and search with metadata.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = Weaviate.from_texts( + texts, OpenAIEmbeddings(), metadatas=metadatas, weaviate_url=weaviate_url + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": 0})]