langchain/tests/integration_tests/vectorstores/test_deeplake.py
Davit Buniatyan 2c0023393b
Deep Lake mini upgrades (#3375)
Improvements
* set default num_workers for ingestion to 0
* upgraded notebooks to avoid dataset creation ambiguity
* added `force_delete_dataset_by_path`
* bumped deeplake to 3.3.0
* added passing of the `creds` argument to the deeplake object, which allows custom S3

Notes
* please double-check that poetry is not messed up (thanks!)

Asks
* It would be great to create a shared Slack channel for quick questions

---------

Co-authored-by: Davit Buniatyan <d@activeloop.ai>
2023-04-23 21:23:54 -07:00

174 lines
5.6 KiB
Python

"""Test Deep Lake functionality."""
import deeplake
import pytest
from pytest import FixtureRequest
from langchain.docstore.document import Document
from langchain.vectorstores import DeepLake
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
@pytest.fixture
def deeplake_datastore() -> DeepLake:
    """Build a Deep Lake store at ``mem://test_path`` holding three documents.

    The texts "foo", "bar" and "baz" are stored with ``{"page": "<index>"}``
    metadata and embedded with ``FakeEmbeddings``.
    """
    sample_texts = ["foo", "bar", "baz"]
    # One metadata dict per text; the page number is the text's position.
    page_metadata = [{"page": str(idx)} for idx, _ in enumerate(sample_texts)]
    return DeepLake.from_texts(
        dataset_path="mem://test_path",
        texts=sample_texts,
        metadatas=page_metadata,
        embedding=FakeEmbeddings(),
    )
@pytest.fixture(params=["L1", "L2", "max", "cos"])
def distance_metric(request: FixtureRequest) -> str:
    """Return the distance metric name for the current parametrized run."""
    metric: str = request.param
    return metric
def test_deeplake() -> None:
    """Build a store from plain texts and check the top match for "foo"."""
    store = DeepLake.from_texts(
        dataset_path="mem://test_path",
        texts=["foo", "bar", "baz"],
        embedding=FakeEmbeddings(),
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
def test_deeplake_with_metadatas() -> None:
    """Build a store with per-text metadata and check it comes back on search."""
    sample_texts = ["foo", "bar", "baz"]
    # One metadata dict per text; the page number is the text's position.
    page_metadata = [{"page": str(idx)} for idx, _ in enumerate(sample_texts)]
    store = DeepLake.from_texts(
        dataset_path="mem://test_path",
        texts=sample_texts,
        embedding=FakeEmbeddings(),
        metadatas=page_metadata,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"page": "0"})]
    assert hits == expected
def test_deeplakewith_persistence() -> None:
    """Test end to end construction and search, with persistence.

    Builds a dataset on local disk, persists it, reopens it through a fresh
    ``DeepLake`` instance and checks that the reopened store returns the same
    search result. The on-disk dataset is removed even when an assertion fails.
    """
    dataset_path = "./tests/persist_dir"
    # Start from a clean slate in case an earlier run left the dataset behind.
    if deeplake.exists(dataset_path):
        deeplake.delete(dataset_path)

    texts = ["foo", "bar", "baz"]
    docsearch = DeepLake.from_texts(
        dataset_path=dataset_path,
        texts=texts,
        embedding=FakeEmbeddings(),
    )
    try:
        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

        docsearch.persist()

        # Get a new VectorStore from the persisted directory
        docsearch = DeepLake(
            dataset_path=dataset_path,
            embedding_function=FakeEmbeddings(),
        )
        output = docsearch.similarity_search("foo", k=1)
        # The reopened store must return the data written before persist();
        # without this check the reload step would verify nothing.
        assert output == [Document(page_content="foo")]
    finally:
        # Clean up, even if an assertion above failed, so the dataset
        # directory does not leak into later runs.
        docsearch.delete_dataset()

    # Persist doesn't need to be called again
    # Data will be automatically persisted on object deletion
    # Or on program exit
def test_similarity_search(deeplake_datastore: DeepLake, distance_metric: str) -> None:
    """Query "foo" with the given distance metric and expect the page-0 document."""
    search_options = {"k": 1, "distance_metric": distance_metric}
    hits = deeplake_datastore.similarity_search("foo", **search_options)
    expected = [Document(page_content="foo", metadata={"page": "0"})]
    assert hits == expected
    deeplake_datastore.delete_dataset()
def test_similarity_search_by_vector(
    deeplake_datastore: DeepLake, distance_metric: str
) -> None:
    """Search with the embedding of "bar" and expect the page-1 document."""
    # Index 1 is the vector produced for "bar", the second text.
    bar_vector = FakeEmbeddings().embed_documents(["foo", "bar", "baz"])[1]
    hits = deeplake_datastore.similarity_search_by_vector(
        bar_vector, k=1, distance_metric=distance_metric
    )
    expected = [Document(page_content="bar", metadata={"page": "1"})]
    assert hits == expected
    deeplake_datastore.delete_dataset()
def test_similarity_search_with_score(
    deeplake_datastore: DeepLake, distance_metric: str
) -> None:
    """Check both the top document and its score for the query "foo".

    The expected score is 1.0 for the "cos" metric and 0.0 for every other
    metric.
    """
    scored_hits = deeplake_datastore.similarity_search_with_score(
        "foo", k=1, distance_metric=distance_metric
    )
    top_document, top_score = scored_hits[0]
    assert top_document == Document(page_content="foo", metadata={"page": "0"})
    expected_score = 1.0 if distance_metric == "cos" else 0.0
    assert top_score == expected_score
    deeplake_datastore.delete_dataset()
def test_similarity_search_with_filter(
    deeplake_datastore: DeepLake, distance_metric: str
) -> None:
    """Query "foo" restricted to page "1" and expect the "bar" document."""
    page_filter = {"page": "1"}
    hits = deeplake_datastore.similarity_search(
        "foo", k=1, distance_metric=distance_metric, filter=page_filter
    )
    expected = [Document(page_content="bar", metadata={"page": "1"})]
    assert hits == expected
    deeplake_datastore.delete_dataset()
def test_max_marginal_relevance_search(deeplake_datastore: DeepLake) -> None:
    """Run max marginal relevance search by text, then by vector.

    Both variants are expected to return the page-0 "foo" document.
    """
    mmr_options = {"k": 1, "fetch_k": 2}

    hits_by_text = deeplake_datastore.max_marginal_relevance_search(
        "foo", **mmr_options
    )
    assert hits_by_text == [Document(page_content="foo", metadata={"page": "0"})]

    # Index 0 is the vector produced for "foo", the first text.
    foo_vector = FakeEmbeddings().embed_documents(["foo", "bar", "baz"])[0]
    hits_by_vector = deeplake_datastore.max_marginal_relevance_search_by_vector(
        foo_vector, **mmr_options
    )
    assert hits_by_vector == [Document(page_content="foo", metadata={"page": "0"})]
    deeplake_datastore.delete_dataset()
def test_delete_dataset_by_ids(deeplake_datastore: DeepLake) -> None:
    """Delete the first stored entry by id and check that two entries remain."""
    first_id = deeplake_datastore.ds.ids.data()["value"][0]
    deeplake_datastore.delete(ids=[first_id])
    # A search restricted to page "0" should now come back empty.
    leftover = deeplake_datastore.similarity_search(
        "foo", k=1, filter={"page": "0"}
    )
    assert leftover == []
    remaining_count = len(deeplake_datastore.ds)
    assert remaining_count == 2
    deeplake_datastore.delete_dataset()
def test_delete_dataset_by_filter(deeplake_datastore: DeepLake) -> None:
    """Delete entries matching page "1" and check that two entries remain."""
    page_filter = {"page": "1"}
    deeplake_datastore.delete(filter=page_filter)
    # A search restricted to the deleted page should now come back empty.
    leftover = deeplake_datastore.similarity_search(
        "bar", k=1, filter={"page": "1"}
    )
    assert leftover == []
    remaining_count = len(deeplake_datastore.ds)
    assert remaining_count == 2
    deeplake_datastore.delete_dataset()
def test_delete_by_path(deeplake_datastore: DeepLake) -> None:
    """Force-delete the fixture dataset by its path and confirm it is gone."""
    target_path = deeplake_datastore.dataset_path
    DeepLake.force_delete_by_path(target_path)
    still_exists = deeplake.exists(target_path)
    assert not still_exists