langchain/tests/integration_tests/vectorstores/test_pinecone.py
Raymond Yuan 5171c3bcca
Refactor vector storage to correctly handle relevancy scores (#6570)
Description: This pull request aims to support generating the correct
generic relevancy scores for different vector stores by refactoring the
relevance score functions and their selection in the base class and
subclasses of VectorStore. This is especially relevant with VectorStores
that require a distance metric upon initialization. Note many of the
current implenetations of `_similarity_search_with_relevance_scores` are
not technically correct, as they just return
`self.similarity_search_with_score(query, k, **kwargs)` without applying
the relevant score function

Also includes changes associated with:
https://github.com/hwchase17/langchain/pull/6564 and
https://github.com/hwchase17/langchain/pull/6494

See more indepth discussion in thread in #6494 

Issue: 
https://github.com/hwchase17/langchain/issues/6526
https://github.com/hwchase17/langchain/issues/6481
https://github.com/hwchase17/langchain/issues/6346

Dependencies: None

The changes include:
- Properly handling score thresholding in FAISS
`similarity_search_with_score_by_vector` for the corresponding distance
metric.
- Refactoring the `_similarity_search_with_relevance_scores` method in
the base class and removing it from the subclasses for incorrectly
implemented subclasses.
- Adding a `_select_relevance_score_fn` method in the base class and
implementing it in the subclasses to select the appropriate relevance
score function based on the distance strategy.
- Updating the `__init__` methods of the subclasses to set the
`relevance_score_fn` attribute.
- Removing the `_default_relevance_score_fn` function from the FAISS
class and using the base class's `_euclidean_relevance_score_fn`
instead.
- Adding the `DistanceStrategy` enum to the `utils.py` file and updating
the imports in the vector store classes.
- Updating the tests to import the `DistanceStrategy` enum from the
`utils.py` file.

---------

Co-authored-by: Hanit <37485638+hanit-com@users.noreply.github.com>
2023-07-10 20:37:03 -07:00

176 lines
5.7 KiB
Python

import importlib
import os
import time
import uuid
import numpy as np
from typing import List
import pinecone
import pytest
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pinecone import Pinecone
index_name = "langchain-test-index" # name of the index
dimension = 1536 # dimension of the embeddings
def reset_pinecone() -> None:
assert os.environ.get("PINECONE_API_KEY") is not None
assert os.environ.get("PINECONE_ENVIRONMENT") is not None
import pinecone
importlib.reload(pinecone)
pinecone.init(
api_key=os.environ.get("PINECONE_API_KEY"),
environment=os.environ.get("PINECONE_ENVIRONMENT"),
)
class TestPinecone:
index: pinecone.Index
@classmethod
def setup_class(cls) -> None:
reset_pinecone()
cls.index = pinecone.Index(index_name)
if index_name in pinecone.list_indexes():
pinecone.delete_index(index_name)
pinecone.create_index(name=index_name, dimension=dimension)
# insure the index is empty
index_stats = cls.index.describe_index_stats()
assert index_stats["dimension"] == dimension
assert index_stats["total_vector_count"] == 0
@classmethod
def teardown_class(cls) -> None:
if index_name in pinecone.list_indexes():
pinecone.delete_index(index_name)
pinecone.create_index(index_name, dimension=dimension)
reset_pinecone()
@pytest.fixture(autouse=True)
def setup(self) -> None:
if index_name in pinecone.list_indexes():
pinecone.delete_index(index_name)
pinecone.create_index(index_name, dimension=dimension)
reset_pinecone()
@pytest.mark.vcr()
def test_from_texts(
self, texts: List[str], embedding_openai: OpenAIEmbeddings
) -> None:
"""Test end to end construction and search."""
unique_id = uuid.uuid4().hex
needs = f"foobuu {unique_id} booo"
texts.insert(0, needs)
docsearch = Pinecone.from_texts(
texts=texts, embedding=embedding_openai, index_name=index_name
)
# wait for the index to be ready
time.sleep(20)
output = docsearch.similarity_search(unique_id, k=1)
assert output == [Document(page_content=needs)]
@pytest.mark.vcr()
def test_from_texts_with_metadatas(
self, texts: List[str], embedding_openai: OpenAIEmbeddings
) -> None:
"""Test end to end construction and search."""
unique_id = uuid.uuid4().hex
needs = f"foobuu {unique_id} booo"
texts.insert(0, needs)
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Pinecone.from_texts(
texts,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
)
# wait for the index to be ready
time.sleep(20)
output = docsearch.similarity_search(needs, k=1)
# TODO: why metadata={"page": 0.0}) instead of {"page": 0}?
assert output == [Document(page_content=needs, metadata={"page": 0.0})]
@pytest.mark.vcr()
def test_from_texts_with_scores(self, embedding_openai: OpenAIEmbeddings) -> None:
"""Test end to end construction and search with scores and IDs."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Pinecone.from_texts(
texts,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
)
# wait for the index to be ready
time.sleep(20)
output = docsearch.similarity_search_with_score("foo", k=3)
docs = [o[0] for o in output]
scores = [o[1] for o in output]
sorted_documents = sorted(docs, key=lambda x: x.metadata["page"])
# TODO: why metadata={"page": 0.0}) instead of {"page": 0}, etc???
assert sorted_documents == [
Document(page_content="foo", metadata={"page": 0.0}),
Document(page_content="bar", metadata={"page": 1.0}),
Document(page_content="baz", metadata={"page": 2.0}),
]
assert scores[0] > scores[1] > scores[2]
def test_add_documents_with_ids(
self, texts: List[str], embedding_openai: OpenAIEmbeddings
) -> None:
ids = [uuid.uuid4().hex for _ in range(len(texts))]
Pinecone.from_texts(
texts=texts, ids=ids, embedding=embedding_openai, index_name=index_name
)
# wait for the index to be ready
time.sleep(20)
index_stats = self.index.describe_index_stats()
assert index_stats["total_vector_count"] == len(texts)
ids_1 = [uuid.uuid4().hex for _ in range(len(texts))]
Pinecone.from_texts(
texts=texts,
ids=ids_1,
embedding=embedding_openai,
index_name=index_name,
)
# wait for the index to be ready
time.sleep(20)
index_stats = self.index.describe_index_stats()
assert index_stats["total_vector_count"] == len(texts) * 2
@pytest.mark.vcr()
def test_relevance_score_bound(self, embedding_openai: OpenAIEmbeddings) -> None:
"""Ensures all relevance scores are between 0 and 1."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Pinecone.from_texts(
texts,
embedding_openai,
index_name=index_name,
metadatas=metadatas,
)
# wait for the index to be ready
time.sleep(20)
output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
assert all(
(1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output
)