langchain/tests/integration_tests/vectorstores/test_qdrant.py
Kacper Łukawski 140ba682f1
Support named vectors in Qdrant (#6871)
# Description

This PR makes it possible to use named vectors from Qdrant in Langchain.
That was requested multiple times, as people want to reuse externally
created collections in Langchain. It doesn't change anything for the
existing applications. The changes were covered with some integration
tests and included in the docs.

## Example

```python
Qdrant.from_documents(
    docs,
    embeddings,
    location=":memory:",
    collection_name="my_documents",
    vector_name="custom_vector",
)
```

### Issue: #2594 

Tagging @rlancemartin & @eyurtsev. I'd appreciate your review.
2023-06-29 15:14:22 -07:00

516 lines
17 KiB
Python

"""Test Qdrant functionality."""
import tempfile
from typing import Callable, Optional
import pytest
from qdrant_client.http import models as rest
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Qdrant
from tests.integration_tests.vectorstores.fake_embeddings import (
ConsistentFakeEmbeddings,
)
@pytest.mark.parametrize("batch_size", [1, 64])
@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"])
@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"])
def test_qdrant_similarity_search(
batch_size: int, content_payload_key: str, metadata_payload_key: str
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
location=":memory:",
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
batch_size=batch_size,
)
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]
@pytest.mark.parametrize("batch_size", [1, 64])
@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"])
@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"])
def test_qdrant_similarity_search_by_vector(
batch_size: int, content_payload_key: str, metadata_payload_key: str
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
location=":memory:",
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
batch_size=batch_size,
)
embeddings = ConsistentFakeEmbeddings().embed_query("foo")
output = docsearch.similarity_search_by_vector(embeddings, k=1)
assert output == [Document(page_content="foo")]
@pytest.mark.parametrize("batch_size", [1, 64])
@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"])
@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"])
def test_qdrant_similarity_search_with_score_by_vector(
batch_size: int, content_payload_key: str, metadata_payload_key: str
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
location=":memory:",
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
batch_size=batch_size,
)
embeddings = ConsistentFakeEmbeddings().embed_query("foo")
output = docsearch.similarity_search_with_score_by_vector(embeddings, k=1)
assert len(output) == 1
document, score = output[0]
assert document == Document(page_content="foo")
assert score >= 0
@pytest.mark.parametrize("batch_size", [1, 64])
@pytest.mark.parametrize("vector_name", [None, "my-vector"])
def test_qdrant_add_documents(batch_size: int, vector_name: Optional[str]) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch: Qdrant = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
location=":memory:",
batch_size=batch_size,
vector_name=vector_name,
)
new_texts = ["foobar", "foobaz"]
docsearch.add_documents(
[Document(page_content=content) for content in new_texts], batch_size=batch_size
)
output = docsearch.similarity_search("foobar", k=1)
# StatefulFakeEmbeddings return the same query embedding as the first document
# embedding computed in `embedding.embed_documents`. Thus, "foo" embedding is the
# same as "foobar" embedding
assert output == [Document(page_content="foobar")] or output == [
Document(page_content="foo")
]
@pytest.mark.parametrize("batch_size", [1, 64])
def test_qdrant_add_texts_returns_all_ids(batch_size: int) -> None:
"""Test end to end Qdrant.add_texts returns unique ids."""
docsearch: Qdrant = Qdrant.from_texts(
["foobar"],
ConsistentFakeEmbeddings(),
location=":memory:",
batch_size=batch_size,
)
ids = docsearch.add_texts(["foo", "bar", "baz"])
assert 3 == len(ids)
assert 3 == len(set(ids))
@pytest.mark.parametrize("batch_size", [1, 64])
@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"])
@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"])
def test_qdrant_with_metadatas(
batch_size: int, content_payload_key: str, metadata_payload_key: str
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=":memory:",
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
batch_size=batch_size,
)
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo", metadata={"page": 0})]
@pytest.mark.parametrize("batch_size", [1, 64])
def test_qdrant_similarity_search_filters(batch_size: int) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=":memory:",
batch_size=batch_size,
)
output = docsearch.similarity_search(
"foo", k=1, filter={"page": 1, "metadata": {"page": 2, "pages": [3]}}
)
assert output == [
Document(
page_content="bar",
metadata={"page": 1, "metadata": {"page": 2, "pages": [3, -1]}},
)
]
@pytest.mark.parametrize("vector_name", [None, "my-vector"])
def test_qdrant_similarity_search_with_relevance_score_no_threshold(
vector_name: Optional[str],
) -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=":memory:",
vector_name=vector_name,
)
output = docsearch.similarity_search_with_relevance_scores(
"foo", k=3, score_threshold=None
)
assert len(output) == 3
for i in range(len(output)):
assert round(output[i][1], 2) >= 0
assert round(output[i][1], 2) <= 1
def test_qdrant_similarity_search_with_relevance_score_with_threshold() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=":memory:",
)
score_threshold = 0.98
kwargs = {"score_threshold": score_threshold}
output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs)
assert len(output) == 1
assert all([score >= score_threshold for _, score in output])
def test_qdrant_similarity_search_with_relevance_score_with_threshold_and_filter() -> (
None
):
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "metadata": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=":memory:",
)
score_threshold = 0.99 # for almost exact match
# test negative filter condition
negative_filter = {"page": 1, "metadata": {"page": 2, "pages": [3]}}
kwargs = {"filter": negative_filter, "score_threshold": score_threshold}
output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs)
assert len(output) == 0
# test positive filter condition
positive_filter = {"page": 0, "metadata": {"page": 1, "pages": [2]}}
kwargs = {"filter": positive_filter, "score_threshold": score_threshold}
output = docsearch.similarity_search_with_relevance_scores("foo", k=3, **kwargs)
assert len(output) == 1
assert all([score >= score_threshold for _, score in output])
def test_qdrant_similarity_search_filters_with_qdrant_filters() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
metadatas = [
{"page": i, "details": {"page": i + 1, "pages": [i + 2, -1]}}
for i in range(len(texts))
]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=":memory:",
)
qdrant_filter = rest.Filter(
must=[
rest.FieldCondition(
key="metadata.page",
match=rest.MatchValue(value=1),
),
rest.FieldCondition(
key="metadata.details.page",
match=rest.MatchValue(value=2),
),
rest.FieldCondition(
key="metadata.details.pages",
match=rest.MatchAny(any=[3]),
),
]
)
output = docsearch.similarity_search("foo", k=1, filter=qdrant_filter)
assert output == [
Document(
page_content="bar",
metadata={"page": 1, "details": {"page": 2, "pages": [3, -1]}},
)
]
@pytest.mark.parametrize("batch_size", [1, 64])
@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "test_content"])
@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "test_metadata"])
@pytest.mark.parametrize("vector_name", [None, "my-vector"])
def test_qdrant_max_marginal_relevance_search(
batch_size: int,
content_payload_key: str,
metadata_payload_key: str,
vector_name: Optional[str],
) -> None:
"""Test end to end construction and MRR search."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Qdrant.from_texts(
texts,
ConsistentFakeEmbeddings(),
metadatas=metadatas,
location=":memory:",
content_payload_key=content_payload_key,
metadata_payload_key=metadata_payload_key,
batch_size=batch_size,
vector_name=vector_name,
)
output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3)
assert output == [
Document(page_content="foo", metadata={"page": 0}),
Document(page_content="bar", metadata={"page": 1}),
]
@pytest.mark.parametrize(
["embeddings", "embedding_function"],
[
(ConsistentFakeEmbeddings(), None),
(ConsistentFakeEmbeddings().embed_query, None),
(None, ConsistentFakeEmbeddings().embed_query),
],
)
def test_qdrant_embedding_interface(
embeddings: Optional[Embeddings], embedding_function: Optional[Callable]
) -> None:
"""Test Qdrant may accept different types for embeddings."""
from qdrant_client import QdrantClient
client = QdrantClient(":memory:")
collection_name = "test"
Qdrant(
client,
collection_name,
embeddings=embeddings,
embedding_function=embedding_function,
)
@pytest.mark.parametrize(
["embeddings", "embedding_function"],
[
(ConsistentFakeEmbeddings(), ConsistentFakeEmbeddings().embed_query),
(None, None),
],
)
def test_qdrant_embedding_interface_raises_value_error(
embeddings: Optional[Embeddings], embedding_function: Optional[Callable]
) -> None:
"""Test Qdrant requires only one method for embeddings."""
from qdrant_client import QdrantClient
client = QdrantClient(":memory:")
collection_name = "test"
with pytest.raises(ValueError):
Qdrant(
client,
collection_name,
embeddings=embeddings,
embedding_function=embedding_function,
)
@pytest.mark.parametrize("vector_name", [None, "my-vector"])
def test_qdrant_add_texts_stores_duplicated_texts(vector_name: Optional[str]) -> None:
"""Test end to end Qdrant.add_texts stores duplicated texts separately."""
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
client = QdrantClient(":memory:")
collection_name = "test"
vectors_config = rest.VectorParams(size=10, distance=rest.Distance.COSINE)
if vector_name is not None:
vectors_config = {vector_name: vectors_config} # type: ignore[assignment]
client.recreate_collection(collection_name, vectors_config=vectors_config)
vec_store = Qdrant(
client,
collection_name,
embeddings=ConsistentFakeEmbeddings(),
vector_name=vector_name,
)
ids = vec_store.add_texts(["abc", "abc"], [{"a": 1}, {"a": 2}])
assert 2 == len(set(ids))
assert 2 == client.count(collection_name).count
def test_qdrant_from_texts_stores_duplicated_texts() -> None:
"""Test end to end Qdrant.from_texts stores duplicated texts separately."""
from qdrant_client import QdrantClient
with tempfile.TemporaryDirectory() as tmpdir:
vec_store = Qdrant.from_texts(
["abc", "abc"],
ConsistentFakeEmbeddings(),
collection_name="test",
path=str(tmpdir),
)
del vec_store
client = QdrantClient(path=str(tmpdir))
assert 2 == client.count("test").count
@pytest.mark.parametrize("batch_size", [1, 64])
@pytest.mark.parametrize("vector_name", [None, "my-vector"])
def test_qdrant_from_texts_stores_ids(
batch_size: int, vector_name: Optional[str]
) -> None:
"""Test end to end Qdrant.from_texts stores provided ids."""
from qdrant_client import QdrantClient
with tempfile.TemporaryDirectory() as tmpdir:
ids = [
"fa38d572-4c31-4579-aedc-1960d79df6df",
"cdc1aa36-d6ab-4fb2-8a94-56674fd27484",
]
vec_store = Qdrant.from_texts(
["abc", "def"],
ConsistentFakeEmbeddings(),
ids=ids,
collection_name="test",
path=str(tmpdir),
batch_size=batch_size,
vector_name=vector_name,
)
del vec_store
client = QdrantClient(path=str(tmpdir))
assert 2 == client.count("test").count
stored_ids = [point.id for point in client.scroll("test")[0]]
assert set(ids) == set(stored_ids)
@pytest.mark.parametrize("batch_size", [1, 64])
def test_qdrant_add_texts_stores_ids(batch_size: int) -> None:
"""Test end to end Qdrant.add_texts stores provided ids."""
from qdrant_client import QdrantClient
ids = [
"fa38d572-4c31-4579-aedc-1960d79df6df",
"cdc1aa36-d6ab-4fb2-8a94-56674fd27484",
]
client = QdrantClient(":memory:")
collection_name = "test"
client.recreate_collection(
collection_name,
vectors_config=rest.VectorParams(size=10, distance=rest.Distance.COSINE),
)
vec_store = Qdrant(client, "test", ConsistentFakeEmbeddings())
returned_ids = vec_store.add_texts(["abc", "def"], ids=ids, batch_size=batch_size)
assert all(first == second for first, second in zip(ids, returned_ids))
assert 2 == client.count("test").count
stored_ids = [point.id for point in client.scroll("test")[0]]
assert set(ids) == set(stored_ids)
@pytest.mark.parametrize("vector_name", ["custom-vector"])
def test_qdrant_from_texts_stores_embeddings_as_named_vectors(vector_name: str) -> None:
"""Test end to end Qdrant.from_texts stores named vectors if name is provided."""
from qdrant_client import QdrantClient
collection_name = "test"
with tempfile.TemporaryDirectory() as tmpdir:
vec_store = Qdrant.from_texts(
["lorem", "ipsum", "dolor", "sit", "amet"],
ConsistentFakeEmbeddings(),
collection_name=collection_name,
path=str(tmpdir),
vector_name=vector_name,
)
del vec_store
client = QdrantClient(path=str(tmpdir))
assert 5 == client.count("test").count
assert all(
vector_name in point.vector # type: ignore[operator]
for point in client.scroll(collection_name, with_vectors=True)[0]
)
@pytest.mark.parametrize("vector_name", ["custom-vector"])
def test_qdrant_add_texts_stores_embeddings_as_named_vectors(vector_name: str) -> None:
"""Test end to end Qdrant.add_texts stores named vectors if name is provided."""
from qdrant_client import QdrantClient
collection_name = "test"
client = QdrantClient(":memory:")
client.recreate_collection(
collection_name,
vectors_config={
vector_name: rest.VectorParams(size=10, distance=rest.Distance.COSINE)
},
)
vec_store = Qdrant(
client,
collection_name,
ConsistentFakeEmbeddings(),
vector_name=vector_name,
)
vec_store.add_texts(["lorem", "ipsum", "dolor", "sit", "amet"])
assert 5 == client.count("test").count
assert all(
vector_name in point.vector # type: ignore[operator]
for point in client.scroll(collection_name, with_vectors=True)[0]
)