langchain/tests/integration_tests/vectorstores/test_pinecone.py

230 lines
8.1 KiB
Python
Raw Normal View History

import importlib
import os
2023-07-10 15:39:47 +00:00
import time
import uuid
from typing import List
import numpy as np
import pinecone
import pytest
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pinecone import Pinecone
index_name = "langchain-test-index" # name of the index
namespace_name = "langchain-test-namespace" # name of the namespace
dimension = 1536 # dimension of the embeddings
def reset_pinecone() -> None:
assert os.environ.get("PINECONE_API_KEY") is not None
assert os.environ.get("PINECONE_ENVIRONMENT") is not None
import pinecone
importlib.reload(pinecone)
pinecone.init(
api_key=os.environ.get("PINECONE_API_KEY"),
environment=os.environ.get("PINECONE_ENVIRONMENT"),
)
class TestPinecone:
index: pinecone.Index
@classmethod
def setup_class(cls) -> None:
reset_pinecone()
cls.index = pinecone.Index(index_name)
if index_name in pinecone.list_indexes():
index_stats = cls.index.describe_index_stats()
if index_stats["dimension"] == dimension:
# delete all the vectors in the index if the dimension is the same
# from all namespaces
index_stats = cls.index.describe_index_stats()
for _namespace_name in index_stats["namespaces"].keys():
cls.index.delete(delete_all=True, namespace=_namespace_name)
else:
pinecone.delete_index(index_name)
pinecone.create_index(name=index_name, dimension=dimension)
else:
pinecone.create_index(name=index_name, dimension=dimension)
# insure the index is empty
index_stats = cls.index.describe_index_stats()
assert index_stats["dimension"] == dimension
if index_stats["namespaces"].get(namespace_name) is not None:
assert index_stats["namespaces"][namespace_name]["vector_count"] == 0
@classmethod
def teardown_class(cls) -> None:
index_stats = cls.index.describe_index_stats()
for _namespace_name in index_stats["namespaces"].keys():
cls.index.delete(delete_all=True, namespace=_namespace_name)
reset_pinecone()
@pytest.fixture(autouse=True)
def setup(self) -> None:
# delete all the vectors in the index
index_stats = self.index.describe_index_stats()
for _namespace_name in index_stats["namespaces"].keys():
self.index.delete(delete_all=True, namespace=_namespace_name)
reset_pinecone()
@pytest.mark.vcr()
def test_from_texts(
self, texts: List[str], embedding_openai: OpenAIEmbeddings
) -> None:
"""Test end to end construction and search."""
unique_id = uuid.uuid4().hex
needs = f"foobuu {unique_id} booo"
texts.insert(0, needs)
docsearch = Pinecone.from_texts(
texts=texts,
embedding=embedding_openai,
index_name=index_name,
namespace=namespace_name,
)
output = docsearch.similarity_search(unique_id, k=1, namespace=namespace_name)
assert output == [Document(page_content=needs)]
@pytest.mark.vcr()
def test_from_texts_with_metadatas(
self, texts: List[str], embedding_openai: OpenAIEmbeddings
) -> None:
"""Test end to end construction and search."""
unique_id = uuid.uuid4().hex
needs = f"foobuu {unique_id} booo"
texts.insert(0, needs)
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Pinecone.from_texts(
texts,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
namespace=namespace_name,
)
output = docsearch.similarity_search(needs, k=1, namespace=namespace_name)
# TODO: why metadata={"page": 0.0}) instead of {"page": 0}?
assert output == [Document(page_content=needs, metadata={"page": 0.0})]
@pytest.mark.vcr()
def test_from_texts_with_scores(self, embedding_openai: OpenAIEmbeddings) -> None:
"""Test end to end construction and search with scores and IDs."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Pinecone.from_texts(
texts,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
namespace=namespace_name,
)
output = docsearch.similarity_search_with_score(
"foo", k=3, namespace=namespace_name
)
docs = [o[0] for o in output]
scores = [o[1] for o in output]
sorted_documents = sorted(docs, key=lambda x: x.metadata["page"])
# TODO: why metadata={"page": 0.0}) instead of {"page": 0}, etc???
assert sorted_documents == [
Document(page_content="foo", metadata={"page": 0.0}),
Document(page_content="bar", metadata={"page": 1.0}),
Document(page_content="baz", metadata={"page": 2.0}),
]
assert scores[0] > scores[1] > scores[2]
def test_from_existing_index_with_namespaces(
self, embedding_openai: OpenAIEmbeddings
) -> None:
"""Test that namespaces are properly handled."""
# Create two indexes with the same name but different namespaces
texts_1 = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts_1))]
Pinecone.from_texts(
texts_1,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
namespace=f"{index_name}-1",
)
texts_2 = ["foo2", "bar2", "baz2"]
metadatas = [{"page": i} for i in range(len(texts_2))]
Pinecone.from_texts(
texts_2,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
namespace=f"{index_name}-2",
)
# Search with namespace
docsearch = Pinecone.from_existing_index(
index_name=index_name,
embedding=embedding_openai,
namespace=f"{index_name}-1",
)
output = docsearch.similarity_search("foo", k=20, namespace=f"{index_name}-1")
# check that we don't get results from the other namespace
page_contents = sorted(set([o.page_content for o in output]))
assert all(content in ["foo", "bar", "baz"] for content in page_contents)
assert all(content not in ["foo2", "bar2", "baz2"] for content in page_contents)
def test_add_documents_with_ids(
self, texts: List[str], embedding_openai: OpenAIEmbeddings
) -> None:
ids = [uuid.uuid4().hex for _ in range(len(texts))]
Pinecone.from_texts(
texts=texts,
ids=ids,
embedding=embedding_openai,
index_name=index_name,
namespace=index_name,
)
index_stats = self.index.describe_index_stats()
assert index_stats["namespaces"][index_name]["vector_count"] == len(texts)
ids_1 = [uuid.uuid4().hex for _ in range(len(texts))]
Pinecone.from_texts(
texts=texts,
ids=ids_1,
embedding=embedding_openai,
index_name=index_name,
namespace=index_name,
)
index_stats = self.index.describe_index_stats()
assert index_stats["namespaces"][index_name]["vector_count"] == len(texts) * 2
2023-07-10 15:39:47 +00:00
assert index_stats["total_vector_count"] == len(texts) * 2
Refactor vector storage to correctly handle relevancy scores (#6570) Description: This pull request aims to support generating the correct generic relevancy scores for different vector stores by refactoring the relevance score functions and their selection in the base class and subclasses of VectorStore. This is especially relevant with VectorStores that require a distance metric upon initialization. Note many of the current implenetations of `_similarity_search_with_relevance_scores` are not technically correct, as they just return `self.similarity_search_with_score(query, k, **kwargs)` without applying the relevant score function Also includes changes associated with: https://github.com/hwchase17/langchain/pull/6564 and https://github.com/hwchase17/langchain/pull/6494 See more indepth discussion in thread in #6494 Issue: https://github.com/hwchase17/langchain/issues/6526 https://github.com/hwchase17/langchain/issues/6481 https://github.com/hwchase17/langchain/issues/6346 Dependencies: None The changes include: - Properly handling score thresholding in FAISS `similarity_search_with_score_by_vector` for the corresponding distance metric. - Refactoring the `_similarity_search_with_relevance_scores` method in the base class and removing it from the subclasses for incorrectly implemented subclasses. - Adding a `_select_relevance_score_fn` method in the base class and implementing it in the subclasses to select the appropriate relevance score function based on the distance strategy. - Updating the `__init__` methods of the subclasses to set the `relevance_score_fn` attribute. - Removing the `_default_relevance_score_fn` function from the FAISS class and using the base class's `_euclidean_relevance_score_fn` instead. - Adding the `DistanceStrategy` enum to the `utils.py` file and updating the imports in the vector store classes. - Updating the tests to import the `DistanceStrategy` enum from the `utils.py` file. --------- Co-authored-by: Hanit <37485638+hanit-com@users.noreply.github.com>
2023-07-11 03:37:03 +00:00
@pytest.mark.vcr()
def test_relevance_score_bound(self, embedding_openai: OpenAIEmbeddings) -> None:
"""Ensures all relevance scores are between 0 and 1."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Pinecone.from_texts(
texts,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
)
# wait for the index to be ready
time.sleep(20)
output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
assert all(
(1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output
)