qdrant: test new QdrantVectorStore (#24165)

## Description

This PR adds integration tests to follow up on #24164.

By default, the tests run against an in-memory Qdrant instance.

To run the full test suite against both the in-memory instance and a Qdrant server:

```
$ docker run -p 6333:6333 qdrant/qdrant  # start a local Qdrant server on port 6333

$ make test

$ make integration_test
```
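
The choice between the in-memory instance and a running server is made by a small location fixture in `tests/integration_tests/fixtures.py`. Below is a minimal sketch of how such a fixture could look; the `QDRANT_URL` environment variable and the exact structure are assumptions for illustration, and the helper shipped with this PR may differ in detail.

```
# Hedged sketch of a test-location fixture; QDRANT_URL is an assumed env var
# and the real helper in tests/integration_tests/fixtures.py may differ.
import os
from typing import List

from tests.integration_tests.common import qdrant_running_locally


def qdrant_locations(use_in_memory: bool = True) -> List[str]:
    locations: List[str] = []

    if use_in_memory:
        # ":memory:" makes qdrant-client run an in-process instance,
        # so no external server is required.
        locations.append(":memory:")

    # Pick up a locally running server, e.g. the docker container above.
    if qdrant_running_locally():
        locations.append("http://localhost:6333")

    # Optionally point the suite at an arbitrary server.
    qdrant_url = os.getenv("QDRANT_URL")
    if qdrant_url:
        locations.append(qdrant_url)

    return locations
```

Each test is then parametrized over every returned location, so the same assertions run against both backends whenever a server is reachable.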

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Authored by Anush on 2024-07-13 05:29:30 +05:30, committed by GitHub
commit a653b209ba (parent f071581aea)
8 changed files with 1021 additions and 2 deletions


@@ -4,6 +4,8 @@ import requests # type: ignore
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_qdrant import SparseEmbeddings, SparseVector
def qdrant_running_locally() -> bool:
"""Check if Qdrant is running at http://localhost:6333."""
@@ -55,3 +57,29 @@ class ConsistentFakeEmbeddings(Embeddings):
"""Return consistent embeddings for the text, if seen before, or a constant
one if the text is unknown."""
return self.embed_documents([text])[0]
class ConsistentFakeSparseEmbeddings(SparseEmbeddings):
"""Fake sparse embeddings which remembers all the texts seen so far "
"to return consistent vectors for the same texts."""
def __init__(self, dimensionality: int = 25) -> None:
self.known_texts: List[str] = []
self.dimensionality = dimensionality
def embed_documents(self, texts: List[str]) -> List[SparseVector]:
"""Return consistent embeddings for each text seen so far."""
out_vectors = []
for text in texts:
if text not in self.known_texts:
self.known_texts.append(text)
index = self.known_texts.index(text)
indices = [i + index for i in range(self.dimensionality)]
values = [1.0] * (self.dimensionality - 1) + [float(index)]
out_vectors.append(SparseVector(indices=indices, values=values))
return out_vectors
def embed_query(self, text: str) -> SparseVector:
"""Return consistent embeddings for the text, "
"if seen before, or a constant one if the text is unknown."""
return self.embed_documents([text])[0]


@@ -5,8 +5,8 @@ from qdrant_client import QdrantClient
from tests.integration_tests.fixtures import qdrant_locations
def pytest_sessionfinish() -> None:
"""Clean up all collections after the test session."""
def pytest_runtest_teardown() -> None:
"""Clean up all collections after the each test."""
for location in qdrant_locations():
client = QdrantClient(location=location, api_key=os.getenv("QDRANT_API_KEY"))
collections = client.get_collections().collections


@@ -2,6 +2,7 @@ import logging
import os
from typing import List
from langchain_qdrant.qdrant import RetrievalMode
from tests.integration_tests.common import qdrant_running_locally
logger = logging.getLogger(__name__)
@@ -23,3 +24,20 @@ def qdrant_locations(use_in_memory: bool = True) -> List[str]:
locations.append(qdrant_url)
return locations
def retrieval_modes(
*, dense: bool = True, sparse: bool = True, hybrid: bool = True
) -> List[RetrievalMode]:
modes = []
if dense:
modes.append(RetrievalMode.DENSE)
if sparse:
modes.append(RetrievalMode.SPARSE)
if hybrid:
modes.append(RetrievalMode.HYBRID)
return modes


@@ -0,0 +1,143 @@
import uuid
from typing import List, Union
import pytest
from langchain_core.documents import Document
from qdrant_client import QdrantClient, models
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from tests.integration_tests.common import (
ConsistentFakeEmbeddings,
ConsistentFakeSparseEmbeddings,
assert_documents_equals,
)
from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
def test_qdrant_add_documents_extends_existing_collection(
location: str,
vector_name: str,
retrieval_mode: RetrievalMode,
sparse_vector_name: str,
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
new_texts = ["foobar", "foobaz"]
docsearch.add_documents([Document(page_content=content) for content in new_texts])
output = docsearch.similarity_search("foobar", k=1)
assert_documents_equals(output, [Document(page_content="foobar")])
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
@pytest.mark.parametrize("batch_size", [1, 64])
def test_qdrant_add_texts_returns_all_ids(
location: str,
vector_name: str,
retrieval_mode: RetrievalMode,
sparse_vector_name: str,
batch_size: int,
) -> None:
"""Test end to end Qdrant.add_texts returns unique ids."""
docsearch = QdrantVectorStore.from_texts(
["foobar"],
ConsistentFakeEmbeddings(),
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
batch_size=batch_size,
)
ids = docsearch.add_texts(["foo", "bar", "baz"])
assert 3 == len(ids)
assert 3 == len(set(ids))
assert 3 == len(docsearch.get_by_ids(ids))
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
def test_qdrant_add_texts_stores_duplicated_texts(
location: str,
vector_name: str,
) -> None:
"""Test end to end Qdrant.add_texts stores duplicated texts separately."""
client = QdrantClient(location)
collection_name = uuid.uuid4().hex
vectors_config = {
vector_name: models.VectorParams(size=10, distance=models.Distance.COSINE)
}
client.recreate_collection(collection_name, vectors_config=vectors_config)
vec_store = QdrantVectorStore(
client,
collection_name,
embedding=ConsistentFakeEmbeddings(),
vector_name=vector_name,
)
ids = vec_store.add_texts(["abc", "abc"], [{"a": 1}, {"a": 2}])
assert 2 == len(set(ids))
assert 2 == client.count(collection_name).count
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
@pytest.mark.parametrize("batch_size", [1, 64])
def test_qdrant_add_texts_stores_ids(
location: str,
vector_name: str,
retrieval_mode: RetrievalMode,
sparse_vector_name: str,
batch_size: int,
) -> None:
"""Test end to end Qdrant.add_texts stores provided ids."""
ids: List[Union[str, int]] = [
"fa38d572-4c31-4579-aedc-1960d79df6df",
432,
432145435,
]
collection_name = uuid.uuid4().hex
vec_store = QdrantVectorStore.from_texts(
["abc", "def", "ghi"],
ConsistentFakeEmbeddings(),
ids=ids,
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
batch_size=batch_size,
)
assert 3 == vec_store.client.count(collection_name).count
stored_ids = [point.id for point in vec_store.client.scroll(collection_name)[0]]
assert set(ids) == set(stored_ids)
assert 3 == len(vec_store.get_by_ids(ids))


@@ -0,0 +1,51 @@
import uuid
import pytest
from langchain_qdrant.qdrant import QdrantVectorStore, RetrievalMode
from tests.integration_tests.common import (
ConsistentFakeEmbeddings,
ConsistentFakeSparseEmbeddings,
)
from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes
@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False))
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
def test_qdrant_from_existing_collection_uses_same_collection(
location: str,
vector_name: str,
retrieval_mode: RetrievalMode,
sparse_vector_name: str,
) -> None:
"""Test if the QdrantVectorStore.from_existing_collection reuses the collection."""
collection_name = uuid.uuid4().hex
docs = ["foo"]
QdrantVectorStore.from_texts(
docs,
embedding=ConsistentFakeEmbeddings(),
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
qdrant = QdrantVectorStore.from_existing_collection(
collection_name,
embedding=ConsistentFakeEmbeddings(),
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
qdrant.add_texts(["baz", "bar"])
assert 3 == qdrant.client.count(collection_name).count


@@ -0,0 +1,385 @@
import uuid
from typing import List, Union
import pytest
from langchain_core.documents import Document
from qdrant_client import models
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from langchain_qdrant.qdrant import QdrantVectorStoreError
from tests.integration_tests.common import (
ConsistentFakeEmbeddings,
ConsistentFakeSparseEmbeddings,
assert_documents_equals,
)
from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
def test_vectorstore_from_texts(location: str, retrieval_mode: RetrievalMode) -> None:
"""Test end to end Qdrant.from_texts stores texts."""
collection_name = uuid.uuid4().hex
vec_store = QdrantVectorStore.from_texts(
["Lorem ipsum dolor sit amet", "Ipsum dolor sit amet"],
ConsistentFakeEmbeddings(),
collection_name=collection_name,
location=location,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
assert 2 == vec_store.client.count(collection_name).count
@pytest.mark.parametrize("batch_size", [1, 64])
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
def test_qdrant_from_texts_stores_ids(
batch_size: int,
vector_name: str,
sparse_vector_name: str,
location: str,
retrieval_mode: RetrievalMode,
) -> None:
"""Test end to end Qdrant.from_texts stores provided ids."""
collection_name = uuid.uuid4().hex
ids: List[Union[str, int]] = [
"fa38d572-4c31-4579-aedc-1960d79df6df",
786,
]
vec_store = QdrantVectorStore.from_texts(
["abc", "def"],
ConsistentFakeEmbeddings(),
ids=ids,
collection_name=collection_name,
location=location,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
batch_size=batch_size,
vector_name=vector_name,
sparse_vector_name=sparse_vector_name,
)
assert 2 == vec_store.client.count(collection_name).count
stored_ids = [point.id for point in vec_store.client.retrieve(collection_name, ids)]
assert set(ids) == set(stored_ids)
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
def test_qdrant_from_texts_stores_embeddings_as_named_vectors(
location: str,
retrieval_mode: RetrievalMode,
vector_name: str,
sparse_vector_name: str,
) -> None:
"""Test end to end Qdrant.from_texts stores named vectors if name is provided."""
collection_name = uuid.uuid4().hex
vec_store = QdrantVectorStore.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(),
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
assert 5 == vec_store.client.count(collection_name).count
if retrieval_mode in retrieval_modes(sparse=False):
assert all(
(vector_name in point.vector or isinstance(point.vector, list)) # type: ignore
for point in vec_store.client.scroll(collection_name, with_vectors=True)[0]
)
if retrieval_mode in retrieval_modes(dense=False):
assert all(
sparse_vector_name in point.vector # type: ignore
for point in vec_store.client.scroll(collection_name, with_vectors=True)[0]
)
@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False))
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
def test_qdrant_from_texts_reuses_same_collection(
location: str,
retrieval_mode: RetrievalMode,
vector_name: str,
sparse_vector_name: str,
) -> None:
"""Test if Qdrant.from_texts reuses the same collection"""
collection_name = uuid.uuid4().hex
embeddings = ConsistentFakeEmbeddings()
sparse_embeddings = ConsistentFakeSparseEmbeddings()
vec_store = QdrantVectorStore.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
embeddings,
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=sparse_embeddings,
)
del vec_store
vec_store = QdrantVectorStore.from_texts(
["foo", "bar"],
embeddings,
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=sparse_embeddings,
)
assert 7 == vec_store.client.count(collection_name).count
@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False))
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False))
def test_qdrant_from_texts_raises_error_on_different_dimensionality(
location: str,
vector_name: str,
retrieval_mode: RetrievalMode,
) -> None:
"""Test if Qdrant.from_texts raises an exception if dimensionality does not match"""
collection_name = uuid.uuid4().hex
QdrantVectorStore.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(dimensionality=10),
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
with pytest.raises(QdrantVectorStoreError) as excinfo:
QdrantVectorStore.from_texts(
["foo", "bar"],
ConsistentFakeEmbeddings(dimensionality=5),
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
expected_message = "collection is configured for dense vectors "
"with 10 dimensions. Selected embeddings are 5-dimensional"
assert expected_message in str(excinfo.value)
@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False))
@pytest.mark.parametrize(
["first_vector_name", "second_vector_name"],
[
("", "custom-vector"),
("custom-vector", ""),
("my-first-vector", "my-second_vector"),
],
)
@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False))
def test_qdrant_from_texts_raises_error_on_different_vector_name(
location: str,
first_vector_name: str,
second_vector_name: str,
retrieval_mode: RetrievalMode,
) -> None:
"""Test if Qdrant.from_texts raises an exception if vector name does not match"""
collection_name = uuid.uuid4().hex
QdrantVectorStore.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(dimensionality=10),
collection_name=collection_name,
location=location,
vector_name=first_vector_name,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
with pytest.raises(QdrantVectorStoreError) as excinfo:
QdrantVectorStore.from_texts(
["foo", "bar"],
ConsistentFakeEmbeddings(dimensionality=10),
collection_name=collection_name,
location=location,
vector_name=second_vector_name,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
expected_message = "does not contain dense vector named"
assert expected_message in str(excinfo.value)
@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False))
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False))
def test_qdrant_from_texts_raises_error_on_different_distance(
location: str, vector_name: str, retrieval_mode: RetrievalMode
) -> None:
"""Test if Qdrant.from_texts raises an exception if distance does not match"""
collection_name = uuid.uuid4().hex
QdrantVectorStore.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(),
collection_name=collection_name,
location=location,
vector_name=vector_name,
distance=models.Distance.COSINE,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
with pytest.raises(QdrantVectorStoreError) as excinfo:
QdrantVectorStore.from_texts(
["foo", "bar"],
ConsistentFakeEmbeddings(),
collection_name=collection_name,
location=location,
vector_name=vector_name,
distance=models.Distance.EUCLID,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
expected_message = "configured for COSINE similarity, but requested EUCLID"
assert expected_message in str(excinfo.value)
@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False))
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
def test_qdrant_from_texts_recreates_collection_on_force_recreate(
location: str,
vector_name: str,
retrieval_mode: RetrievalMode,
sparse_vector_name: str,
) -> None:
collection_name = uuid.uuid4().hex
vec_store = QdrantVectorStore.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(dimensionality=10),
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
vec_store = QdrantVectorStore.from_texts(
["foo", "bar"],
ConsistentFakeEmbeddings(dimensionality=5),
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
force_recreate=True,
)
assert 2 == vec_store.client.count(collection_name).count
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("content_payload_key", [QdrantVectorStore.CONTENT_KEY, "foo"])
@pytest.mark.parametrize(
"metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"]
)
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
def test_qdrant_from_texts_stores_metadatas(
location: str,
content_payload_key: str,
metadata_payload_key: str,
vector_name: str,
retrieval_mode: RetrievalMode,
sparse_vector_name: str,
) -> None:
"""Test end to end construction and search."""
texts = ["fabrin", "barizda"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=location,
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
output = docsearch.similarity_search("fabrin", k=1)
assert_documents_equals(
output, [Document(page_content="fabrin", metadata={"page": 0})]
)
@pytest.mark.parametrize("location", qdrant_locations(use_in_memory=False))
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False))
@pytest.mark.parametrize(
"sparse_vector_name", ["my-sparse-vector", "another-sparse-vector"]
)
def test_from_texts_passed_optimizers_config_and_on_disk_payload(
location: str,
vector_name: str,
retrieval_mode: RetrievalMode,
sparse_vector_name: str,
) -> None:
collection_name = uuid.uuid4().hex
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
optimizers_config = models.OptimizersConfigDiff(memmap_threshold=1000)
vec_store = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
collection_create_options={
"on_disk_payload": True,
"optimizers_config": optimizers_config,
},
vector_params={
"on_disk": True,
},
collection_name=collection_name,
location=location,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_vector_name=sparse_vector_name,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
collection_info = vec_store.client.get_collection(collection_name)
assert collection_info.config.params.vectors[vector_name].on_disk is True # type: ignore
assert collection_info.config.optimizer_config.memmap_threshold == 1000
assert collection_info.config.params.on_disk_payload is True


@@ -0,0 +1,116 @@
import pytest # type: ignore[import-not-found]
from langchain_core.documents import Document
from qdrant_client import models
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from langchain_qdrant.qdrant import QdrantVectorStoreError
from tests.integration_tests.common import (
ConsistentFakeEmbeddings,
ConsistentFakeSparseEmbeddings,
assert_documents_equals,
)
from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes
# MMR is supported only when dense embeddings are available,
# i.e. in DENSE and HYBRID retrieval modes.
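# (MMR re-ranks candidates by the similarity between their dense vectors,
# so a sparse-only collection cannot support it; see the second test below.)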
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize(
"content_payload_key", [QdrantVectorStore.CONTENT_KEY, "test_content"]
)
@pytest.mark.parametrize(
"metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "test_metadata"]
)
@pytest.mark.parametrize("retrieval_mode", retrieval_modes(sparse=False))
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
def test_qdrant_mmr_search(
location: str,
content_payload_key: str,
metadata_payload_key: str,
retrieval_mode: RetrievalMode,
vector_name: str,
) -> None:
"""Test end to end construction and MRR search."""
filter = models.Filter(
must=[
models.FieldCondition(
key=f"{metadata_payload_key}.page",
match=models.MatchValue(
value=2,
),
),
],
)
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
location=location,
retrieval_mode=retrieval_mode,
vector_name=vector_name,
distance=models.Distance.EUCLID,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
output = docsearch.max_marginal_relevance_search(
"foo", k=2, fetch_k=3, lambda_mult=0.0
)
assert_documents_equals(
output,
[
Document(page_content="foo", metadata={"page": 0}),
Document(page_content="baz", metadata={"page": 2}),
],
)
output = docsearch.max_marginal_relevance_search(
"foo", k=2, fetch_k=3, lambda_mult=0.0, filter=filter
)
assert_documents_equals(
output,
[Document(page_content="baz", metadata={"page": 2})],
)
# MMR shouldn't work with only sparse retrieval mode
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize(
"content_payload_key", [QdrantVectorStore.CONTENT_KEY, "test_content"]
)
@pytest.mark.parametrize(
"metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "test_metadata"]
)
@pytest.mark.parametrize("retrieval_mode", retrieval_modes(dense=False, hybrid=False))
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
def test_invalid_qdrant_mmr_with_sparse(
location: str,
content_payload_key: str,
metadata_payload_key: str,
retrieval_mode: RetrievalMode,
vector_name: str,
) -> None:
"""Test end to end construction and MRR search."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
location=location,
retrieval_mode=retrieval_mode,
vector_name=vector_name,
distance=models.Distance.EUCLID,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
with pytest.raises(QdrantVectorStoreError) as excinfo:
docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3, lambda_mult=0.0)
expected_message = "does not contain dense vector named"
assert expected_message in str(excinfo.value)


@@ -0,0 +1,278 @@
import pytest
from langchain_core.documents import Document
from qdrant_client import models
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from tests.integration_tests.common import (
ConsistentFakeEmbeddings,
ConsistentFakeSparseEmbeddings,
assert_documents_equals,
)
from tests.integration_tests.fixtures import qdrant_locations, retrieval_modes
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
@pytest.mark.parametrize("batch_size", [1, 64])
def test_similarity_search(
location: str,
vector_name: str,
retrieval_mode: RetrievalMode,
batch_size: int,
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
location=location,
batch_size=batch_size,
vector_name=vector_name,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
output = docsearch.similarity_search("foo", k=1)
assert_documents_equals(actual=output, expected=[Document(page_content="foo")])
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("content_payload_key", [QdrantVectorStore.CONTENT_KEY, "foo"])
@pytest.mark.parametrize(
"metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"]
)
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
@pytest.mark.parametrize("batch_size", [1, 64])
def test_similarity_search_by_vector(
location: str,
content_payload_key: str,
metadata_payload_key: str,
vector_name: str,
batch_size: int,
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
location=location,
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
batch_size=batch_size,
vector_name=vector_name,
)
embeddings = ConsistentFakeEmbeddings().embed_query("foo")
output = docsearch.similarity_search_by_vector(embeddings, k=1)
assert_documents_equals(output, [Document(page_content="foo")])
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize(
"metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"]
)
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
def test_similarity_search_filters(
location: str,
metadata_payload_key: str,
retrieval_mode: RetrievalMode,
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=location,
metadata_payload_key=metadata_payload_key,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
qdrant_filter = models.Filter(
must=[
models.FieldCondition(
key=f"{metadata_payload_key}.page", match=models.MatchValue(value=1)
)
]
)
output = docsearch.similarity_search("foo", k=1, filter=qdrant_filter)
assert_documents_equals(
actual=output,
expected=[
Document(
page_content="bar",
metadata={"page": 1, "metadata": {"page": 2, "pages": [3, -1]}},
)
],
)
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
def test_similarity_relevance_search_no_threshold(
location: str,
vector_name: str,
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=location,
vector_name=vector_name,
)
output = docsearch.similarity_search_with_relevance_scores(
"foo", k=3, score_threshold=None
)
assert len(output) == 3
for i in range(len(output)):
assert round(output[i][1], 2) >= 0
assert round(output[i][1], 2) <= 1
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
def test_relevance_search_with_threshold(
location: str,
vector_name: str,
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=location,
vector_name=vector_name,
)
score_threshold = 0.99
kwargs = {"score_threshold": score_threshold}
output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs)
assert len(output) == 1
assert all([score >= score_threshold for _, score in output])
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("content_payload_key", [QdrantVectorStore.CONTENT_KEY, "foo"])
@pytest.mark.parametrize(
"metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"]
)
@pytest.mark.parametrize("vector_name", ["", "my-vector"])
def test_relevance_search_with_threshold_and_filter(
location: str,
content_payload_key: str,
metadata_payload_key: str,
vector_name: str,
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=location,
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
vector_name=vector_name,
)
score_threshold = 0.99 # for almost exact match
negative_filter = models.Filter(
must=[
models.FieldCondition(
key=f"{metadata_payload_key}.page", match=models.MatchValue(value=1)
)
]
)
kwargs = {"filter": negative_filter, "score_threshold": score_threshold}
output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs)
assert len(output) == 0
positive_filter = models.Filter(
must=[
models.FieldCondition(
key=f"{metadata_payload_key}.page", match=models.MatchValue(value=0)
)
]
)
kwargs = {"filter": positive_filter, "score_threshold": score_threshold}
output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs)
assert len(output) == 1
assert all([score >= score_threshold for _, score in output])
@pytest.mark.parametrize("location", qdrant_locations())
@pytest.mark.parametrize("content_payload_key", [QdrantVectorStore.CONTENT_KEY, "foo"])
@pytest.mark.parametrize(
"metadata_payload_key", [QdrantVectorStore.METADATA_KEY, "bar"]
)
@pytest.mark.parametrize("retrieval_mode", retrieval_modes())
def test_similarity_search_filters_with_qdrant_filters(
location: str,
content_payload_key: str,
metadata_payload_key: str,
retrieval_mode: RetrievalMode,
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "details": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = QdrantVectorStore.from_texts(
texts,
ConsistentFakeEmbeddings(),
location=location,
metadatas=metadatas,
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
retrieval_mode=retrieval_mode,
sparse_embedding=ConsistentFakeSparseEmbeddings(),
)
qdrant_filter = models.Filter(
must=[
models.FieldCondition(
key=content_payload_key, match=models.MatchValue(value="bar")
),
models.FieldCondition(
key=f"{metadata_payload_key}.page",
match=models.MatchValue(value=1),
),
models.FieldCondition(
key=f"{metadata_payload_key}.details.page",
match=models.MatchValue(value=2),
),
models.FieldCondition(
key=f"{metadata_payload_key}.details.pages",
match=models.MatchAny(any=[3]),
),
]
)
output = docsearch.similarity_search("foo", k=1, filter=qdrant_filter)
assert_documents_equals(
actual=output,
expected=[
Document(
page_content="bar",
metadata={"page": 1, "details": {"page": 2, "pages": [3, -1]}},
)
],
)