import importlib import os import uuid from typing import List import pinecone import pytest from langchain.docstore.document import Document from langchain.embeddings import OpenAIEmbeddings from langchain.vectorstores.pinecone import Pinecone index_name = "langchain-test-index" # name of the index namespace_name = "langchain-test-namespace" # name of the namespace dimension = 1536 # dimension of the embeddings def reset_pinecone() -> None: assert os.environ.get("PINECONE_API_KEY") is not None assert os.environ.get("PINECONE_ENVIRONMENT") is not None import pinecone importlib.reload(pinecone) pinecone.init( api_key=os.environ.get("PINECONE_API_KEY"), environment=os.environ.get("PINECONE_ENVIRONMENT"), ) class TestPinecone: index: pinecone.Index @classmethod def setup_class(cls) -> None: reset_pinecone() cls.index = pinecone.Index(index_name) if index_name in pinecone.list_indexes(): index_stats = cls.index.describe_index_stats() if index_stats["dimension"] == dimension: # delete all the vectors in the index if the dimension is the same # from all namespaces index_stats = cls.index.describe_index_stats() for _namespace_name in index_stats["namespaces"].keys(): cls.index.delete(delete_all=True, namespace=_namespace_name) else: pinecone.delete_index(index_name) pinecone.create_index(name=index_name, dimension=dimension) else: pinecone.create_index(name=index_name, dimension=dimension) # insure the index is empty index_stats = cls.index.describe_index_stats() assert index_stats["dimension"] == dimension if index_stats["namespaces"].get(namespace_name) is not None: assert index_stats["namespaces"][namespace_name]["vector_count"] == 0 @classmethod def teardown_class(cls) -> None: index_stats = cls.index.describe_index_stats() for _namespace_name in index_stats["namespaces"].keys(): cls.index.delete(delete_all=True, namespace=_namespace_name) reset_pinecone() @pytest.fixture(autouse=True) def setup(self) -> None: # delete all the vectors in the index index_stats = self.index.describe_index_stats() for _namespace_name in index_stats["namespaces"].keys(): self.index.delete(delete_all=True, namespace=_namespace_name) reset_pinecone() @pytest.mark.vcr() def test_from_texts( self, texts: List[str], embedding_openai: OpenAIEmbeddings ) -> None: """Test end to end construction and search.""" unique_id = uuid.uuid4().hex needs = f"foobuu {unique_id} booo" texts.insert(0, needs) docsearch = Pinecone.from_texts( texts=texts, embedding=embedding_openai, index_name=index_name, namespace=namespace_name, ) output = docsearch.similarity_search(unique_id, k=1, namespace=namespace_name) assert output == [Document(page_content=needs)] @pytest.mark.vcr() def test_from_texts_with_metadatas( self, texts: List[str], embedding_openai: OpenAIEmbeddings ) -> None: """Test end to end construction and search.""" unique_id = uuid.uuid4().hex needs = f"foobuu {unique_id} booo" texts.insert(0, needs) metadatas = [{"page": i} for i in range(len(texts))] docsearch = Pinecone.from_texts( texts, embedding_openai, index_name=index_name, metadatas=metadatas, namespace=namespace_name, ) output = docsearch.similarity_search(needs, k=1, namespace=namespace_name) # TODO: why metadata={"page": 0.0}) instead of {"page": 0}? assert output == [Document(page_content=needs, metadata={"page": 0.0})] @pytest.mark.vcr() def test_from_texts_with_scores(self, embedding_openai: OpenAIEmbeddings) -> None: """Test end to end construction and search with scores and IDs.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] docsearch = Pinecone.from_texts( texts, embedding_openai, index_name=index_name, metadatas=metadatas, namespace=namespace_name, ) output = docsearch.similarity_search_with_score( "foo", k=3, namespace=namespace_name ) docs = [o[0] for o in output] scores = [o[1] for o in output] sorted_documents = sorted(docs, key=lambda x: x.metadata["page"]) # TODO: why metadata={"page": 0.0}) instead of {"page": 0}, etc??? assert sorted_documents == [ Document(page_content="foo", metadata={"page": 0.0}), Document(page_content="bar", metadata={"page": 1.0}), Document(page_content="baz", metadata={"page": 2.0}), ] assert scores[0] > scores[1] > scores[2] def test_from_existing_index_with_namespaces( self, embedding_openai: OpenAIEmbeddings ) -> None: """Test that namespaces are properly handled.""" # Create two indexes with the same name but different namespaces texts_1 = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts_1))] Pinecone.from_texts( texts_1, embedding_openai, index_name=index_name, metadatas=metadatas, namespace=f"{index_name}-1", ) texts_2 = ["foo2", "bar2", "baz2"] metadatas = [{"page": i} for i in range(len(texts_2))] Pinecone.from_texts( texts_2, embedding_openai, index_name=index_name, metadatas=metadatas, namespace=f"{index_name}-2", ) # Search with namespace docsearch = Pinecone.from_existing_index( index_name=index_name, embedding=embedding_openai, namespace=f"{index_name}-1", ) output = docsearch.similarity_search("foo", k=20, namespace=f"{index_name}-1") # check that we don't get results from the other namespace page_contents = sorted(set([o.page_content for o in output])) assert all(content in ["foo", "bar", "baz"] for content in page_contents) assert all(content not in ["foo2", "bar2", "baz2"] for content in page_contents) def test_add_documents_with_ids( self, texts: List[str], embedding_openai: OpenAIEmbeddings ) -> None: ids = [uuid.uuid4().hex for _ in range(len(texts))] Pinecone.from_texts( texts=texts, ids=ids, embedding=embedding_openai, index_name=index_name, namespace=index_name, ) index_stats = self.index.describe_index_stats() assert index_stats["namespaces"][index_name]["vector_count"] == len(texts) ids_1 = [uuid.uuid4().hex for _ in range(len(texts))] Pinecone.from_texts( texts=texts, ids=ids_1, embedding=embedding_openai, index_name=index_name, namespace=index_name, ) index_stats = self.index.describe_index_stats() assert index_stats["namespaces"][index_name]["vector_count"] == len(texts) * 2