forked from Archives/langchain
Harrison/weaviate fixes (#2872)
Co-authored-by: cs0lar <cristiano.solarino@gmail.com>
Co-authored-by: cs0lar <cristiano.solarino@brightminded.com>
parent 07d7096de6
commit 1e9378d0a8
langchain/vectorstores/weaviate.py

@@ -6,9 +6,22 @@ from uuid import uuid4
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
+from langchain.utils import get_from_dict_or_env
 from langchain.vectorstores.base import VectorStore
 
 
+def _default_schema(index_name: str) -> Dict:
+    return {
+        "class": index_name,
+        "properties": [
+            {
+                "name": "text",
+                "dataType": ["text"],
+            }
+        ],
+    }
+
+
 class Weaviate(VectorStore):
     """Wrapper around Weaviate vector database.
 
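As a usage sketch (not part of the commit): the dict returned by the new _default_schema helper can be registered with a Weaviate instance through the same schema calls from_texts uses below. This assumes weaviate-client 3.x, a server at http://localhost:8080 (such as the compose service added in this commit), and an illustrative class name:

from weaviate import Client

# Same shape as _default_schema("LangChain_example"); the class name is hypothetical.
schema = {
    "class": "LangChain_example",
    "properties": [
        {
            "name": "text",
            "dataType": ["text"],
        }
    ],
}

client = Client("http://localhost:8080")
if not client.schema.contains(schema):  # the same existence check from_texts performs
    client.schema.create_class(schema)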
@@ -70,14 +83,24 @@ class Weaviate(VectorStore):
                         data_properties[key] = metadatas[i][key]
 
                 _id = get_valid_uuid(uuid4())
-                batch.add_data_object(data_properties, self._index_name, _id)
+                batch.add_data_object(
+                    data_object=data_properties, class_name=self._index_name, uuid=_id
+                )
                 ids.append(_id)
         return ids
 
     def similarity_search(
         self, query: str, k: int = 4, **kwargs: Any
     ) -> List[Document]:
-        """Look up similar documents in weaviate."""
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query.
+        """
         content: Dict[str, Any] = {"concepts": [query]}
         if kwargs.get("search_distance"):
             content["certainty"] = kwargs.get("search_distance")
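A sketch of how the reworked similarity_search is meant to be called (editorial illustration, not part of the diff): the search_distance keyword is passed to Weaviate as a certainty value alongside the query concepts, so only sufficiently close matches come back. It assumes an existing class named LangChain_example with a text property, a server at http://localhost:8080, and a server-side vectorizer module (for example text2vec-contextionary) able to embed the query text; note that the compose file added below runs with the vectorizer disabled:

from weaviate import Client

from langchain.vectorstores.weaviate import Weaviate

client = Client("http://localhost:8080")
# Mirrors the cls(client, index_name, text_key, attributes) call made by from_texts.
store = Weaviate(client, "LangChain_example", "text", None)

# Results below the 0.7 certainty threshold are filtered out; k caps the count.
docs = store.similarity_search("What is a vector database?", k=4, search_distance=0.7)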
@@ -114,5 +137,74 @@ class Weaviate(VectorStore):
         metadatas: Optional[List[dict]] = None,
         **kwargs: Any,
     ) -> Weaviate:
-        """Not implemented for Weaviate yet."""
-        raise NotImplementedError("weaviate does not currently support `from_texts`.")
+        """Construct Weaviate wrapper from raw documents.
+
+        This is a user-friendly interface that:
+            1. Embeds documents.
+            2. Creates a new index for the embeddings in the Weaviate instance.
+            3. Adds the documents to the newly created Weaviate index.
+
+        This is intended to be a quick way to get started.
+
+        Example:
+            .. code-block:: python
+
+                from langchain.vectorstores.weaviate import Weaviate
+                from langchain.embeddings import OpenAIEmbeddings
+                embeddings = OpenAIEmbeddings()
+                weaviate = Weaviate.from_texts(
+                    texts,
+                    embeddings,
+                    weaviate_url="http://localhost:8080"
+                )
+        """
+        weaviate_url = get_from_dict_or_env(kwargs, "weaviate_url", "WEAVIATE_URL")
+
+        try:
+            from weaviate import Client
+            from weaviate.util import get_valid_uuid
+        except ImportError:
+            raise ValueError(
+                "Could not import weaviate python package. "
+                "Please install it with `pip install weaviate-client`"
+            )
+
+        client = Client(weaviate_url)
+        index_name = kwargs.get("index_name", f"LangChain_{uuid4().hex}")
+        embeddings = embedding.embed_documents(texts) if embedding else None
+        text_key = "text"
+        schema = _default_schema(index_name)
+        attributes = list(metadatas[0].keys()) if metadatas else None
+
+        # check whether the index already exists
+        if not client.schema.contains(schema):
+            client.schema.create_class(schema)
+
+        with client.batch as batch:
+            for i, text in enumerate(texts):
+                data_properties = {
+                    text_key: text,
+                }
+                if metadatas is not None:
+                    for key in metadatas[i].keys():
+                        data_properties[key] = metadatas[i][key]
+
+                _id = get_valid_uuid(uuid4())
+
+                # if an embedding strategy is not provided, we let
+                # weaviate create the embedding. Note that this will only
+                # work if weaviate has been installed with a vectorizer module
+                # like text2vec-contextionary for example
+                params = {
+                    "uuid": _id,
+                    "data_object": data_properties,
+                    "class_name": index_name,
+                }
+                if embeddings is not None:
+                    params["vector"] = embeddings[i]
+
+                batch.add_data_object(**params)
+
+            batch.flush()
+
+        return cls(client, index_name, text_key, attributes)
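A usage sketch for the new from_texts path (editorial, not part of the commit). With an Embeddings object the vectors are computed client-side and attached to each batched object; passing embedding=None defers vectorization to the server, which per the comment above only works when a vectorizer module is enabled. weaviate_url may also be supplied through the WEAVIATE_URL environment variable; OpenAIEmbeddings assumes OPENAI_API_KEY is set, and the index name is illustrative:

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.weaviate import Weaviate

texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]

store = Weaviate.from_texts(
    texts,
    OpenAIEmbeddings(),  # or None, if the server embeds objects itself
    metadatas=metadatas,
    weaviate_url="http://localhost:8080",
    index_name="LangChain_example",  # optional; defaults to LangChain_<uuid4 hex>
)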
poetry.lock (generated, 16 changed lines)
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
 
 [[package]]
 name = "absl-py"

@@ -499,7 +499,7 @@ name = "authlib"
 version = "1.2.0"
 description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients."
 category = "main"
-optional = true
+optional = false
 python-versions = "*"
 files = [
     {file = "Authlib-1.2.0-py2.py3-none-any.whl", hash = "sha256:4ddf4fd6cfa75c9a460b361d4bd9dac71ffda0be879dbe4292a02e92349ad55a"},

@@ -7258,7 +7258,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
+greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
 
 [package.extras]
 aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]

@@ -8360,7 +8360,7 @@ name = "validators"
 version = "0.20.0"
 description = "Python Data Validation for Humans™."
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.4"
 files = [
     {file = "validators-0.20.0.tar.gz", hash = "sha256:24148ce4e64100a2d5e267233e23e7afeb55316b47d30faae7eb6e7292bc226a"},

@@ -8497,7 +8497,7 @@ name = "weaviate-client"
 version = "3.15.5"
 description = "A python native weaviate client"
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.7"
 files = [
     {file = "weaviate-client-3.15.5.tar.gz", hash = "sha256:6da7e5d08dc9bb8b7879661d1a457c50af7d73e621a5305efe131160e83da69e"},

@@ -9026,13 +9026,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-all = ["aleph-alpha-client", "anthropic", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
+all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm"]
 cohere = ["cohere"]
-llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
+llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
 openai = ["openai"]
 qdrant = ["qdrant-client"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "3c8488864a754852fdec3e56dd5630ed73852ec2120a94cfe22537c075901b24"
+content-hash = "373f68ef16e7f3d5d9cde8b81c5f261096cc537ddca4f6a36711d7215b63f226"
pyproject.toml

@@ -101,6 +101,7 @@ pgvector = "^0.1.6"
 transformers = "^4.27.4"
 pandas = "^2.0.0"
 deeplake = "^3.2.21"
+weaviate-client = "^3.15.5"
 torch = "^1.0.0"
 chromadb = "^0.3.21"
 tiktoken = "^0.3.3"
tests/integration_tests/vectorstores/docker-compose/weaviate.yml (new file)

@@ -0,0 +1,22 @@
+version: '3.4'
+
+services:
+  weaviate:
+    command:
+    - --host
+    - 0.0.0.0
+    - --port
+    - '8080'
+    - --scheme
+    - http
+    image: semitechnologies/weaviate:1.18.2
+    ports:
+    - 8080:8080
+    restart: on-failure:0
+    environment:
+      QUERY_DEFAULTS_LIMIT: 25
+      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
+      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
+      DEFAULT_VECTORIZER_MODULE: 'none'
+      ENABLE_MODULES: ''
+      CLUSTER_HOSTNAME: 'node1'
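Before pointing the integration tests at this compose service, it can help to wait until the container reports ready. A minimal sketch, assuming weaviate-client 3.x and the default port mapped above; the helper name is the editor's own:

import time

import weaviate


def wait_for_weaviate(url: str = "http://localhost:8080", timeout: float = 30.0) -> weaviate.Client:
    """Poll the instance until its readiness check passes, or give up."""
    deadline = time.monotonic() + timeout
    while True:
        try:
            client = weaviate.Client(url)
            if client.is_ready():
                return client
        except Exception:
            pass  # container still starting; connection refused
        if time.monotonic() > deadline:
            raise RuntimeError(f"Weaviate at {url} not ready after {timeout}s")
        time.sleep(1)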
tests/integration_tests/vectorstores/test_weaviate.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+"""Test Weaviate functionality."""
+import logging
+from typing import Generator, Union
+
+import pytest
+from weaviate import Client
+
+from langchain.docstore.document import Document
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores.weaviate import Weaviate
+
+logging.basicConfig(level=logging.DEBUG)
+
+"""
+cd tests/integration_tests/vectorstores/docker-compose
+docker compose -f weaviate.yml up
+"""
+
+
+class TestWeaviate:
+    @pytest.fixture(scope="class", autouse=True)
+    def weaviate_url(self) -> Union[str, Generator[str, None, None]]:
+        """Return the weaviate url."""
+        url = "http://localhost:8080"
+        yield url
+
+        # Clear the test index
+        client = Client(url)
+        client.schema.delete_all()
+
+    def test_similarity_search_without_metadata(self, weaviate_url: str) -> None:
+        """Test end to end construction and search without metadata."""
+        texts = ["foo", "bar", "baz"]
+        docsearch = Weaviate.from_texts(
+            texts,
+            OpenAIEmbeddings(),
+            weaviate_url=weaviate_url,
+        )
+
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo")]
+
+    def test_similarity_search_with_metadata(self, weaviate_url: str) -> None:
+        """Test end to end construction and search with metadata."""
+        texts = ["foo", "bar", "baz"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = Weaviate.from_texts(
+            texts, OpenAIEmbeddings(), metadatas=metadatas, weaviate_url=weaviate_url
+        )
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo", metadata={"page": 0})]
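The class-scoped fixture above tears down by wiping the whole schema once the tests finish; the same cleanup can be run by hand after local experiments against the compose instance (a sketch, assuming weaviate-client 3.x and the default URL):

from weaviate import Client

client = Client("http://localhost:8080")
client.schema.delete_all()  # drops every class, including any LangChain_* index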