mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
03ac39368f
# Fix DeepLake Overwrite Flag Issue

Fixes Issue #4682: setting overwrite to False in the DeepLake constructor still triggers an overwrite, because the logic only checks for the presence of "overwrite" in kwargs. The fix is simple: check that "overwrite" is in kwargs AND that kwargs["overwrite"] == True.

Added a new test in tests/integration_tests/vectorstores/test_deeplake.py to reflect the desired behavior.

Co-authored-by: Anirudh Suresh <ani@Anirudhs-MBP.cable.rcn.com>
Co-authored-by: Anirudh Suresh <ani@Anirudhs-MacBook-Pro.local>
Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
221 lines
7.1 KiB
Python
221 lines
7.1 KiB
Python
"""Test Deep Lake functionality."""
|
|
import deeplake
|
|
import pytest
|
|
from pytest import FixtureRequest
|
|
|
|
from langchain.docstore.document import Document
|
|
from langchain.vectorstores import DeepLake
|
|
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
|
|
|
|
|
@pytest.fixture
def deeplake_datastore() -> DeepLake:
    """Provide an in-memory DeepLake store seeded with three documents.

    The texts "foo", "bar" and "baz" are stored with ``page`` metadata
    "0", "1" and "2" respectively and embedded with ``FakeEmbeddings``.
    """
    corpus = ["foo", "bar", "baz"]
    # One metadata dict per text, keyed by its position rendered as a string.
    page_tags = [{"page": str(position)} for position, _ in enumerate(corpus)]
    return DeepLake.from_texts(
        dataset_path="mem://test_path",
        texts=corpus,
        metadatas=page_tags,
        embedding=FakeEmbeddings(),
    )
|
|
|
|
|
|
@pytest.fixture(params=["L1", "L2", "max", "cos"])
def distance_metric(request: FixtureRequest) -> str:
    """Yield each distance metric name listed in ``params``, one per test run."""
    metric_name = request.param
    return metric_name
|
|
|
|
|
|
def test_deeplake() -> None:
    """Build a store from plain texts and check the top hit for "foo"."""
    store = DeepLake.from_texts(
        dataset_path="mem://test_path",
        texts=["foo", "bar", "baz"],
        embedding=FakeEmbeddings(),
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
|
|
|
|
|
|
def test_deeplake_with_metadatas() -> None:
    """Build a store with per-text metadata and check it comes back on search."""
    corpus = ["foo", "bar", "baz"]
    page_tags = [{"page": str(position)} for position, _ in enumerate(corpus)]
    store = DeepLake.from_texts(
        dataset_path="mem://test_path",
        texts=corpus,
        embedding=FakeEmbeddings(),
        metadatas=page_tags,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"page": "0"})]
    assert hits == expected
|
|
|
|
|
|
def test_deeplakewith_persistence() -> None:
    """Test end to end construction and search, with persistence.

    Builds a dataset at a local path, persists it, reopens it from the same
    path with a fresh ``DeepLake`` instance, and checks that the stored
    document is still returned by a search before cleaning up.
    """
    dataset_path = "./tests/persist_dir"
    # Start from a clean slate so leftovers from an earlier run cannot leak in.
    if deeplake.exists(dataset_path):
        deeplake.delete(dataset_path)

    texts = ["foo", "bar", "baz"]
    docsearch = DeepLake.from_texts(
        dataset_path=dataset_path,
        texts=texts,
        embedding=FakeEmbeddings(),
    )

    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

    docsearch.persist()

    # Get a new VectorStore from the persisted directory
    docsearch = DeepLake(
        dataset_path=dataset_path,
        embedding_function=FakeEmbeddings(),
    )
    output = docsearch.similarity_search("foo", k=1)
    # The reopened store must still serve the persisted document; without
    # this check the test would pass even if nothing had been persisted.
    assert output == [Document(page_content="foo")]

    # Clean up
    docsearch.delete_dataset()

    # Persist doesn't need to be called again
    # Data will be automatically persisted on object deletion
    # Or on program exit
|
|
|
|
|
|
def test_deeplake_overwrite_flag() -> None:
    """Test overwrite behavior.

    Persists a dataset, then reopens it three times from the same path:
    without an ``overwrite`` argument, with ``overwrite=False`` and with
    ``overwrite=True``. The first two must still find the stored document;
    the last must return no results.
    """
    dataset_path = "./tests/persist_dir"
    # Start from a clean slate so leftovers from an earlier run cannot leak in.
    if deeplake.exists(dataset_path):
        deeplake.delete(dataset_path)

    texts = ["foo", "bar", "baz"]
    docsearch = DeepLake.from_texts(
        dataset_path=dataset_path,
        texts=texts,
        embedding=FakeEmbeddings(),
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

    docsearch.persist()

    # Get a new VectorStore from the persisted directory, with no overwrite (implicit)
    docsearch = DeepLake(
        dataset_path=dataset_path,
        embedding_function=FakeEmbeddings(),
    )
    output = docsearch.similarity_search("foo", k=1)
    # assert page still present
    assert output == [Document(page_content="foo")]

    # Get a new VectorStore from the persisted directory, with no overwrite (explicit)
    docsearch = DeepLake(
        dataset_path=dataset_path,
        embedding_function=FakeEmbeddings(),
        overwrite=False,
    )
    output = docsearch.similarity_search("foo", k=1)
    # assert page still present
    assert output == [Document(page_content="foo")]

    # Get a new VectorStore from the persisted directory, with overwrite
    docsearch = DeepLake(
        dataset_path=dataset_path,
        embedding_function=FakeEmbeddings(),
        overwrite=True,
    )
    output = docsearch.similarity_search("foo", k=1)
    # assert page no longer present
    assert output == []

    # Clean up so the on-disk dataset does not outlive the test run.
    docsearch.delete_dataset()
|
|
|
|
|
|
def test_similarity_search(deeplake_datastore: DeepLake, distance_metric: str) -> None:
    """Check that the top hit for "foo" is the page-0 document for each metric."""
    hits = deeplake_datastore.similarity_search(
        "foo",
        k=1,
        distance_metric=distance_metric,
    )
    expected = [Document(page_content="foo", metadata={"page": "0"})]
    assert hits == expected
    # Drop the fixture's dataset once the check has passed.
    deeplake_datastore.delete_dataset()
|
|
|
|
|
|
def test_similarity_search_by_vector(
    deeplake_datastore: DeepLake, distance_metric: str
) -> None:
    """Search with the embedding of "bar" and expect the page-1 document."""
    doc_vectors = FakeEmbeddings().embed_documents(["foo", "bar", "baz"])
    bar_vector = doc_vectors[1]
    hits = deeplake_datastore.similarity_search_by_vector(
        bar_vector,
        k=1,
        distance_metric=distance_metric,
    )
    expected = [Document(page_content="bar", metadata={"page": "1"})]
    assert hits == expected
    # Drop the fixture's dataset once the check has passed.
    deeplake_datastore.delete_dataset()
|
|
|
|
|
|
def test_similarity_search_with_score(
    deeplake_datastore: DeepLake, distance_metric: str
) -> None:
    """Check both the top document and its score for each distance metric.

    The expected score is 1.0 for the "cos" metric and 0.0 for every other
    metric supplied by the ``distance_metric`` fixture.
    """
    scored_hits = deeplake_datastore.similarity_search_with_score(
        "foo",
        k=1,
        distance_metric=distance_metric,
    )
    top_doc, top_score = scored_hits[0]
    assert top_doc == Document(page_content="foo", metadata={"page": "0"})

    expected_score = 1.0 if distance_metric == "cos" else 0.0
    assert top_score == expected_score
    # Drop the fixture's dataset once the checks have passed.
    deeplake_datastore.delete_dataset()
|
|
|
|
|
|
def test_similarity_search_with_filter(
    deeplake_datastore: DeepLake, distance_metric: str
) -> None:
    """Search for "foo" restricted to page "1" and expect the "bar" document."""
    page_filter = {"page": "1"}
    hits = deeplake_datastore.similarity_search(
        "foo",
        k=1,
        distance_metric=distance_metric,
        filter=page_filter,
    )
    expected = [Document(page_content="bar", metadata={"page": "1"})]
    assert hits == expected
    # Drop the fixture's dataset once the check has passed.
    deeplake_datastore.delete_dataset()
|
|
|
|
|
|
def test_max_marginal_relevance_search(deeplake_datastore: DeepLake) -> None:
    """Run max marginal relevance search by text and by vector.

    Both the query "foo" and the embedding of "foo" are expected to return
    the page-0 document.
    """
    by_text = deeplake_datastore.max_marginal_relevance_search(
        "foo",
        k=1,
        fetch_k=2,
    )
    assert by_text == [Document(page_content="foo", metadata={"page": "0"})]

    doc_vectors = FakeEmbeddings().embed_documents(["foo", "bar", "baz"])
    foo_vector = doc_vectors[0]
    by_vector = deeplake_datastore.max_marginal_relevance_search_by_vector(
        foo_vector,
        k=1,
        fetch_k=2,
    )
    assert by_vector == [Document(page_content="foo", metadata={"page": "0"})]
    # Drop the fixture's dataset once the checks have passed.
    deeplake_datastore.delete_dataset()
|
|
|
|
|
|
def test_delete_dataset_by_ids(deeplake_datastore: DeepLake) -> None:
    """Delete the first stored entry by id and check it is gone.

    After the deletion a search filtered to page "0" returns nothing and two
    entries remain in the dataset.
    """
    stored_ids = deeplake_datastore.ds.ids.data()["value"]
    first_id = stored_ids[0]
    deeplake_datastore.delete(ids=[first_id])

    leftovers = deeplake_datastore.similarity_search(
        "foo", k=1, filter={"page": "0"}
    )
    assert leftovers == []
    assert len(deeplake_datastore.ds) == 2

    # Drop the fixture's dataset once the checks have passed.
    deeplake_datastore.delete_dataset()
|
|
|
|
|
|
def test_delete_dataset_by_filter(deeplake_datastore: DeepLake) -> None:
    """Delete entries matching a metadata filter and check they are gone.

    After deleting page "1", a search filtered to that page returns nothing
    and two entries remain in the dataset.
    """
    deeplake_datastore.delete(filter={"page": "1"})

    leftovers = deeplake_datastore.similarity_search(
        "bar", k=1, filter={"page": "1"}
    )
    assert leftovers == []
    assert len(deeplake_datastore.ds) == 2

    # Drop the fixture's dataset once the checks have passed.
    deeplake_datastore.delete_dataset()
|
|
|
|
|
|
def test_delete_by_path(deeplake_datastore: DeepLake) -> None:
    """Force-delete the dataset by its path and check it no longer exists."""
    target_path = deeplake_datastore.dataset_path
    DeepLake.force_delete_by_path(target_path)
    still_exists = deeplake.exists(target_path)
    assert not still_exists
|