From 03ac39368fe60201a3f071d7d360c39f59c77cbf Mon Sep 17 00:00:00 2001 From: Anirudh Suresh Date: Mon, 15 May 2023 19:39:16 -0500 Subject: [PATCH] Fixing DeepLake Overwrite Flag (#4683) # Fix DeepLake Overwrite Flag Issue Fixes Issue #4682: essentially, setting overwrite to False in the DeepLake constructor still triggers an overwrite, because the logic is just checking for the presence of "overwrite" in kwargs. The fix is simple--just add some checks to inspect if "overwrite" in kwargs AND kwargs["overwrite"]==True. Added a new test in tests/integration_tests/vectorstores/test_deeplake.py to reflect the desired behavior. Co-authored-by: Anirudh Suresh Co-authored-by: Anirudh Suresh Co-authored-by: Dev 2049 --- langchain/vectorstores/deeplake.py | 8 ++-- .../vectorstores/test_deeplake.py | 47 +++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/langchain/vectorstores/deeplake.py b/langchain/vectorstores/deeplake.py index dc9a2106..01ed62df 100644 --- a/langchain/vectorstores/deeplake.py +++ b/langchain/vectorstores/deeplake.py @@ -120,10 +120,12 @@ class DeepLake(VectorStore): self.dataset_path = dataset_path creds_args = {"creds": kwargs["creds"]} if "creds" in kwargs else {} - if ( - deeplake.exists(dataset_path, token=token, **creds_args) - and "overwrite" not in kwargs + if deeplake.exists(dataset_path, token=token, **creds_args) and not kwargs.get( + "overwrite", False ): + if "overwrite" in kwargs: + del kwargs["overwrite"] + self.ds = deeplake.load( dataset_path, token=token, diff --git a/tests/integration_tests/vectorstores/test_deeplake.py b/tests/integration_tests/vectorstores/test_deeplake.py index f858c904..634d5237 100644 --- a/tests/integration_tests/vectorstores/test_deeplake.py +++ b/tests/integration_tests/vectorstores/test_deeplake.py @@ -83,6 +83,53 @@ def test_deeplakewith_persistence() -> None: # Or on program exit +def test_deeplake_overwrite_flag() -> None: + """Test overwrite behavior""" + dataset_path = "./tests/persist_dir" + if deeplake.exists(dataset_path): + deeplake.delete(dataset_path) + + texts = ["foo", "bar", "baz"] + docsearch = DeepLake.from_texts( + dataset_path=dataset_path, + texts=texts, + embedding=FakeEmbeddings(), + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + docsearch.persist() + + # Get a new VectorStore from the persisted directory, with no overwrite (implicit) + docsearch = DeepLake( + dataset_path=dataset_path, + embedding_function=FakeEmbeddings(), + ) + output = docsearch.similarity_search("foo", k=1) + # assert page still present + assert output == [Document(page_content="foo")] + + # Get a new VectorStore from the persisted directory, with no overwrite (explicit) + docsearch = DeepLake( + dataset_path=dataset_path, + embedding_function=FakeEmbeddings(), + overwrite=False, + ) + output = docsearch.similarity_search("foo", k=1) + # assert page still present + assert output == [Document(page_content="foo")] + + # Get a new VectorStore from the persisted directory, with overwrite + docsearch = DeepLake( + dataset_path=dataset_path, + embedding_function=FakeEmbeddings(), + overwrite=True, + ) + output = docsearch.similarity_search("foo", k=1) + # assert page no longer present + assert output == [] + + def test_similarity_search(deeplake_datastore: DeepLake, distance_metric: str) -> None: """Test similarity search.""" output = deeplake_datastore.similarity_search(