From 3701b2901e76f2f97239c2152a6a7d01754fb666 Mon Sep 17 00:00:00 2001 From: LeoGrin <45738728+LeoGrin@users.noreply.github.com> Date: Sun, 19 Mar 2023 03:55:38 +0100 Subject: [PATCH] use namespace argument in Pinecone constructor (#1757) Fix #1756 Use the `namespace` argument of `Pinecone.from_existing_index` to set the default value of `namespace` for other methods. Leads to more expected behavior and easier integration in chains. For the test, I've added a line to delete and rebuild the `langchain-demo` index at the beginning of the test. I'm not 100% sure if it's a good idea but it makes the test reproducible. --- langchain/vectorstores/pinecone.py | 12 +++++- .../vectorstores/test_pinecone.py | 38 +++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/langchain/vectorstores/pinecone.py b/langchain/vectorstores/pinecone.py index 6dfef7b7..7983ba45 100644 --- a/langchain/vectorstores/pinecone.py +++ b/langchain/vectorstores/pinecone.py @@ -32,6 +32,7 @@ class Pinecone(VectorStore): index: Any, embedding_function: Callable, text_key: str, + namespace: Optional[str] = None, ): """Initialize with Pinecone client.""" try: @@ -49,6 +50,7 @@ class Pinecone(VectorStore): self._index = index self._embedding_function = embedding_function self._text_key = text_key + self._namespace = namespace def add_texts( self, @@ -71,6 +73,8 @@ class Pinecone(VectorStore): List of ids from adding the texts into the vectorstore. 
""" + if namespace is None: + namespace = self._namespace # Embed and create the documents docs = [] ids = ids or [str(uuid.uuid4()) for _ in texts] @@ -101,6 +105,8 @@ class Pinecone(VectorStore): Returns: List of Documents most similar to the query and score for each """ + if namespace is None: + namespace = self._namespace query_obj = self._embedding_function(query) docs = [] results = self._index.query( @@ -135,6 +141,8 @@ class Pinecone(VectorStore): Returns: List of Documents most similar to the query and score for each """ + if namespace is None: + namespace = self._namespace query_obj = self._embedding_function(query) docs = [] results = self._index.query( @@ -222,7 +230,7 @@ class Pinecone(VectorStore): index = pinecone.Index(_index_name) # upsert to Pinecone index.upsert(vectors=list(to_upsert), namespace=namespace) - return cls(index, embedding.embed_query, text_key) + return cls(index, embedding.embed_query, text_key, namespace) @classmethod def from_existing_index( @@ -242,5 +250,5 @@ class Pinecone(VectorStore): ) return cls( - pinecone.Index(index_name, namespace), embedding.embed_query, text_key + pinecone.Index(index_name), embedding.embed_query, text_key, namespace ) diff --git a/tests/integration_tests/vectorstores/test_pinecone.py b/tests/integration_tests/vectorstores/test_pinecone.py index 2c92c501..bcfe4104 100644 --- a/tests/integration_tests/vectorstores/test_pinecone.py +++ b/tests/integration_tests/vectorstores/test_pinecone.py @@ -7,6 +7,11 @@ from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings pinecone.init(api_key="YOUR_API_KEY", environment="YOUR_ENV") +# if the index already exists, delete it +try: + pinecone.delete_index("langchain-demo") +except Exception: + pass index = pinecone.Index("langchain-demo") @@ -57,3 +62,36 @@ def test_pinecone_with_scores() -> None: Document(page_content="baz", metadata={"page": 2}), ] assert scores[0] > scores[1] > scores[2] + + +def test_pinecone_with_namespaces() -> None: 
+ "Test that namespaces are properly handled." "" + # Create two indexes with the same name but different namespaces + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + Pinecone.from_texts( + texts, + FakeEmbeddings(), + index_name="langchain-demo", + metadatas=metadatas, + namespace="test-namespace", + ) + + texts = ["foo2", "bar2", "baz2"] + metadatas = [{"page": i} for i in range(len(texts))] + Pinecone.from_texts( + texts, + FakeEmbeddings(), + index_name="langchain-demo", + metadatas=metadatas, + namespace="test-namespace2", + ) + + # Search with namespace + docsearch = Pinecone.from_existing_index( + "langchain-demo", embedding=FakeEmbeddings(), namespace="test-namespace" + ) + output = docsearch.similarity_search("foo", k=6) + # check that we don't get results from the other namespace + page_contents = [o.page_content for o in output] + assert set(page_contents) == set(["foo", "bar", "baz"])