2023-04-14 05:37:34 +00:00
|
|
|
"""Test Weaviate functionality."""
|
|
|
|
import logging
|
2023-04-16 20:11:30 +00:00
|
|
|
import os
|
2023-05-16 22:26:46 +00:00
|
|
|
import uuid
|
2023-04-14 05:37:34 +00:00
|
|
|
from typing import Generator, Union
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
from weaviate import Client
|
|
|
|
|
|
|
|
from langchain.docstore.document import Document
|
|
|
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
|
|
|
from langchain.vectorstores.weaviate import Weaviate
|
2023-04-27 04:45:03 +00:00
|
|
|
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
2023-04-14 05:37:34 +00:00
|
|
|
|
|
|
|
# Emit DEBUG-level log records while these integration tests run.
logging.basicConfig(level=logging.DEBUG)

# The bare string below is never used by the code; it only records the shell
# commands that (presumably) start the local Weaviate instance the tests in
# this module connect to.
"""
cd tests/integration_tests/vectorstores/docker-compose
docker compose -f weaviate.yml up
"""
|
|
|
|
|
|
|
|
|
|
|
|
class TestWeaviate:
    """Integration tests for the ``Weaviate`` vector store wrapper.

    Every test builds a store with ``Weaviate.from_texts`` against the URL
    yielded by the ``weaviate_url`` fixture and checks the search results.
    The ``embedding_openai`` fixture used by most tests is not defined in this
    class; it is presumably provided by a conftest.
    """

    @classmethod
    def setup_class(cls) -> None:
        """Fail fast for the whole class when no OpenAI API key is configured.

        Raises:
            ValueError: If ``OPENAI_API_KEY`` is unset or empty.
        """
        if not os.getenv("OPENAI_API_KEY"):
            raise ValueError("OPENAI_API_KEY environment variable is not set")

    @pytest.fixture(scope="class", autouse=True)
    def weaviate_url(self) -> Union[str, Generator[str, None, None]]:
        """Yield the Weaviate URL, then clean up after the class has run.

        The fixture is class-scoped, so the teardown code after ``yield`` is
        executed once, after the last test of the class.
        """
        url = "http://localhost:8080"
        yield url

        # Teardown: clear the test data. Note that delete_all() is invoked on
        # the whole schema of the instance, not on one specific index.
        client = Client(url)
        client.schema.delete_all()

    @pytest.mark.vcr(ignore_localhost=True)
    def test_similarity_search_without_metadata(
        self, weaviate_url: str, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and search without metadata."""
        texts = ["foo", "bar", "baz"]
        docsearch = Weaviate.from_texts(
            texts,
            embedding_openai,
            weaviate_url=weaviate_url,
        )

        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

    @pytest.mark.vcr(ignore_localhost=True)
    def test_similarity_search_with_metadata(
        self, weaviate_url: str, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and search with metadata."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Weaviate.from_texts(
            texts, embedding_openai, metadatas=metadatas, weaviate_url=weaviate_url
        )
        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo", metadata={"page": 0})]

    @pytest.mark.vcr(ignore_localhost=True)
    def test_similarity_search_with_metadata_and_filter(
        self, weaviate_url: str, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and search with metadata and a filter."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Weaviate.from_texts(
            texts, embedding_openai, metadatas=metadatas, weaviate_url=weaviate_url
        )
        # k=2 is requested, but the filter only lets page 0 through, so a
        # single document is expected back.
        output = docsearch.similarity_search(
            "foo",
            k=2,
            where_filter={"path": ["page"], "operator": "Equal", "valueNumber": 0},
        )
        assert output == [Document(page_content="foo", metadata={"page": 0})]

    @pytest.mark.vcr(ignore_localhost=True)
    def test_similarity_search_with_metadata_and_additional(
        self, weaviate_url: str, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and search with metadata and additional."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Weaviate.from_texts(
            texts, embedding_openai, metadatas=metadatas, weaviate_url=weaviate_url
        )
        output = docsearch.similarity_search(
            "foo",
            k=1,
            additional=["certainty"],
        )
        # The requested additional property is expected under "_additional"
        # in the metadata of the returned document.
        assert output == [
            Document(
                page_content="foo",
                metadata={"page": 0, "_additional": {"certainty": 1}},
            )
        ]

    @pytest.mark.vcr(ignore_localhost=True)
    def test_similarity_search_with_uuids(
        self, weaviate_url: str, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and search with uuids."""
        texts = ["foo", "bar", "baz"]
        # Weaviate replaces the object if the UUID already exists. All texts
        # deliberately share one UUID, so only one object should remain.
        uuids = [uuid.uuid5(uuid.NAMESPACE_DNS, "same-name") for _ in texts]

        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Weaviate.from_texts(
            texts,
            embedding_openai,
            metadatas=metadatas,
            weaviate_url=weaviate_url,
            uuids=uuids,
        )
        output = docsearch.similarity_search("foo", k=2)
        assert len(output) == 1

    @pytest.mark.vcr(ignore_localhost=True)
    def test_max_marginal_relevance_search(
        self, weaviate_url: str, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and MMR search."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]

        docsearch = Weaviate.from_texts(
            texts, embedding_openai, metadatas=metadatas, weaviate_url=weaviate_url
        )
        # if lambda=1 the algorithm should be equivalent to standard ranking
        standard_ranking = docsearch.similarity_search("foo", k=2)
        output = docsearch.max_marginal_relevance_search(
            "foo", k=2, fetch_k=3, lambda_mult=1.0
        )
        assert output == standard_ranking

        # if lambda=0 the algorithm should favour maximal diversity
        output = docsearch.max_marginal_relevance_search(
            "foo", k=2, fetch_k=3, lambda_mult=0.0
        )
        assert output == [
            Document(page_content="foo", metadata={"page": 0}),
            Document(page_content="bar", metadata={"page": 1}),
        ]

    @pytest.mark.vcr(ignore_localhost=True)
    def test_max_marginal_relevance_search_by_vector(
        self, weaviate_url: str, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and MMR search by vector."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]

        docsearch = Weaviate.from_texts(
            texts, embedding_openai, metadatas=metadatas, weaviate_url=weaviate_url
        )
        foo_embedding = embedding_openai.embed_query("foo")

        # if lambda=1 the algorithm should be equivalent to standard ranking
        standard_ranking = docsearch.similarity_search("foo", k=2)
        output = docsearch.max_marginal_relevance_search_by_vector(
            foo_embedding, k=2, fetch_k=3, lambda_mult=1.0
        )
        assert output == standard_ranking

        # if lambda=0 the algorithm should favour maximal diversity
        output = docsearch.max_marginal_relevance_search_by_vector(
            foo_embedding, k=2, fetch_k=3, lambda_mult=0.0
        )
        assert output == [
            Document(page_content="foo", metadata={"page": 0}),
            Document(page_content="bar", metadata={"page": 1}),
        ]

    @pytest.mark.vcr(ignore_localhost=True)
    def test_max_marginal_relevance_search_with_filter(
        self, weaviate_url: str, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and MMR search with a filter."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]

        docsearch = Weaviate.from_texts(
            texts, embedding_openai, metadatas=metadatas, weaviate_url=weaviate_url
        )
        where_filter = {"path": ["page"], "operator": "Equal", "valueNumber": 0}
        # if lambda=1 the algorithm should be equivalent to standard ranking
        standard_ranking = docsearch.similarity_search(
            "foo", k=2, where_filter=where_filter
        )
        output = docsearch.max_marginal_relevance_search(
            "foo", k=2, fetch_k=3, lambda_mult=1.0, where_filter=where_filter
        )
        assert output == standard_ranking

        # if lambda=0 the algorithm should favour maximal diversity
        output = docsearch.max_marginal_relevance_search(
            "foo", k=2, fetch_k=3, lambda_mult=0.0, where_filter=where_filter
        )
        # Only the page-0 document passes the filter, hence a single result.
        assert output == [
            Document(page_content="foo", metadata={"page": 0}),
        ]

    def test_add_texts_with_given_embedding(self, weaviate_url: str) -> None:
        """Test add_texts on a store built with an explicitly given embedding.

        "foo" is added a second time without a UUID, so a search by the "foo"
        vector with k=2 is expected to return two "foo" documents.
        """
        texts = ["foo", "bar", "baz"]
        embedding = FakeEmbeddings()

        docsearch = Weaviate.from_texts(
            texts, embedding=embedding, weaviate_url=weaviate_url
        )

        docsearch.add_texts(["foo"])
        output = docsearch.similarity_search_by_vector(
            embedding.embed_query("foo"), k=2
        )
        assert output == [
            Document(page_content="foo"),
            Document(page_content="foo"),
        ]

    def test_add_texts_with_given_uuids(self, weaviate_url: str) -> None:
        """Test add_texts when the caller supplies the object UUIDs.

        "foo" is re-added under the UUID it already has, so it must not show
        up twice: the second hit has to be a different document.
        """
        texts = ["foo", "bar", "baz"]
        embedding = FakeEmbeddings()
        uuids = [uuid.uuid5(uuid.NAMESPACE_DNS, text) for text in texts]

        docsearch = Weaviate.from_texts(
            texts,
            embedding=embedding,
            weaviate_url=weaviate_url,
            uuids=uuids,
        )

        # Weaviate replaces the object if the UUID already exists
        docsearch.add_texts(["foo"], uuids=[uuids[0]])
        output = docsearch.similarity_search_by_vector(
            embedding.embed_query("foo"), k=2
        )
        assert output[0] == Document(page_content="foo")
        assert output[1] != Document(page_content="foo")
|