mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
Pinecone: Add V4 support (#7473)
This commit is contained in:
parent
5debd5043e
commit
9d13dcd17c
@ -40,7 +40,6 @@ class Pinecone(VectorStore):
|
||||
index: Any,
|
||||
embedding_function: Callable,
|
||||
text_key: str,
|
||||
namespace: Optional[str] = None,
|
||||
):
|
||||
"""Initialize with Pinecone client."""
|
||||
try:
|
||||
@ -58,14 +57,12 @@ class Pinecone(VectorStore):
|
||||
self._index = index
|
||||
self._embedding_function = embedding_function
|
||||
self._text_key = text_key
|
||||
self._namespace = namespace
|
||||
|
||||
def add_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
namespace: Optional[str] = None,
|
||||
batch_size: int = 32,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
@ -75,14 +72,11 @@ class Pinecone(VectorStore):
|
||||
texts: Iterable of strings to add to the vectorstore.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
ids: Optional list of ids to associate with the texts.
|
||||
namespace: Optional pinecone namespace to add the texts to.
|
||||
|
||||
Returns:
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
|
||||
"""
|
||||
if namespace is None:
|
||||
namespace = self._namespace
|
||||
# Embed and create the documents
|
||||
docs = []
|
||||
ids = ids or [str(uuid.uuid4()) for _ in texts]
|
||||
@ -92,7 +86,7 @@ class Pinecone(VectorStore):
|
||||
metadata[self._text_key] = text
|
||||
docs.append((ids[i], embedding, metadata))
|
||||
# upsert to Pinecone
|
||||
self._index.upsert(vectors=docs, namespace=namespace, batch_size=batch_size)
|
||||
self._index.upsert(vectors=docs, batch_size=batch_size)
|
||||
return ids
|
||||
|
||||
def similarity_search_with_score(
|
||||
@ -100,7 +94,6 @@ class Pinecone(VectorStore):
|
||||
query: str,
|
||||
k: int = 4,
|
||||
filter: Optional[dict] = None,
|
||||
namespace: Optional[str] = None,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return pinecone documents most similar to query, along with scores.
|
||||
|
||||
@ -108,20 +101,16 @@ class Pinecone(VectorStore):
|
||||
query: Text to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
filter: Dictionary of argument(s) to filter on metadata
|
||||
namespace: Namespace to search in. Default will search in '' namespace.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query and score for each
|
||||
"""
|
||||
if namespace is None:
|
||||
namespace = self._namespace
|
||||
query_obj = self._embedding_function(query)
|
||||
docs = []
|
||||
results = self._index.query(
|
||||
[query_obj],
|
||||
top_k=k,
|
||||
include_metadata=True,
|
||||
namespace=namespace,
|
||||
filter=filter,
|
||||
)
|
||||
for res in results["matches"]:
|
||||
@ -141,7 +130,6 @@ class Pinecone(VectorStore):
|
||||
query: str,
|
||||
k: int = 4,
|
||||
filter: Optional[dict] = None,
|
||||
namespace: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return pinecone documents most similar to query.
|
||||
@ -150,13 +138,12 @@ class Pinecone(VectorStore):
|
||||
query: Text to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
filter: Dictionary of argument(s) to filter on metadata
|
||||
namespace: Namespace to search in. Default will search in '' namespace.
|
||||
|
||||
Returns:
|
||||
List of Documents most similar to the query
|
||||
"""
|
||||
docs_and_scores = self.similarity_search_with_score(
|
||||
query, k=k, filter=filter, namespace=namespace, **kwargs
|
||||
query, k=k, filter=filter, **kwargs
|
||||
)
|
||||
return [doc for doc, _ in docs_and_scores]
|
||||
|
||||
@ -176,7 +163,6 @@ class Pinecone(VectorStore):
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
filter: Optional[dict] = None,
|
||||
namespace: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
@ -195,14 +181,11 @@ class Pinecone(VectorStore):
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
if namespace is None:
|
||||
namespace = self._namespace
|
||||
results = self._index.query(
|
||||
[embedding],
|
||||
top_k=fetch_k,
|
||||
include_values=True,
|
||||
include_metadata=True,
|
||||
namespace=namespace,
|
||||
filter=filter,
|
||||
)
|
||||
mmr_selected = maximal_marginal_relevance(
|
||||
@ -224,7 +207,6 @@ class Pinecone(VectorStore):
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
filter: Optional[dict] = None,
|
||||
namespace: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
@ -245,7 +227,7 @@ class Pinecone(VectorStore):
|
||||
"""
|
||||
embedding = self._embedding_function(query)
|
||||
return self.max_marginal_relevance_search_by_vector(
|
||||
embedding, k, fetch_k, lambda_mult, filter, namespace
|
||||
embedding, k, fetch_k, lambda_mult, filter
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -258,7 +240,6 @@ class Pinecone(VectorStore):
|
||||
batch_size: int = 32,
|
||||
text_key: str = "text",
|
||||
index_name: Optional[str] = None,
|
||||
namespace: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> Pinecone:
|
||||
"""Construct Pinecone wrapper from raw documents.
|
||||
@ -331,8 +312,8 @@ class Pinecone(VectorStore):
|
||||
to_upsert = zip(ids_batch, embeds, metadata)
|
||||
|
||||
# upsert to Pinecone
|
||||
index.upsert(vectors=list(to_upsert), namespace=namespace)
|
||||
return cls(index, embedding.embed_query, text_key, namespace)
|
||||
index.upsert(vectors=list(to_upsert))
|
||||
return cls(index, embedding.embed_query, text_key)
|
||||
|
||||
@classmethod
|
||||
def from_existing_index(
|
||||
@ -340,7 +321,6 @@ class Pinecone(VectorStore):
|
||||
index_name: str,
|
||||
embedding: Embeddings,
|
||||
text_key: str = "text",
|
||||
namespace: Optional[str] = None,
|
||||
) -> Pinecone:
|
||||
"""Load pinecone vectorstore from index name."""
|
||||
try:
|
||||
@ -350,38 +330,21 @@ class Pinecone(VectorStore):
|
||||
"Could not import pinecone python package. "
|
||||
"Please install it with `pip install pinecone-client`."
|
||||
)
|
||||
|
||||
return cls(
|
||||
pinecone.Index(index_name), embedding.embed_query, text_key, namespace
|
||||
)
|
||||
return cls(pinecone.Index(index_name), embedding.embed_query, text_key)
|
||||
|
||||
def delete(
|
||||
self,
|
||||
ids: Optional[List[str]] = None,
|
||||
delete_all: Optional[bool] = None,
|
||||
namespace: Optional[str] = None,
|
||||
filter: Optional[dict] = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Delete by vector IDs or filter.
|
||||
"""Delete by vector IDs
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
filter: Dictionary of conditions to filter vectors to delete.
|
||||
"""
|
||||
if ids is None:
|
||||
raise ValueError("Ids must be provided.")
|
||||
|
||||
if namespace is None:
|
||||
namespace = self._namespace
|
||||
|
||||
if delete_all:
|
||||
self._index.delete(delete_all=True, namespace=namespace, **kwargs)
|
||||
elif ids is not None:
|
||||
chunk_size = 1000
|
||||
for i in range(0, len(ids), chunk_size):
|
||||
chunk = ids[i : i + chunk_size]
|
||||
self._index.delete(ids=chunk, namespace=namespace, **kwargs)
|
||||
elif filter is not None:
|
||||
self._index.delete(filter=filter, namespace=namespace, **kwargs)
|
||||
else:
|
||||
raise ValueError("Either ids, delete_all, or filter must be provided.")
|
||||
|
||||
return None
|
||||
chunk_size = 1000
|
||||
for i in range(0, len(ids), chunk_size):
|
||||
chunk = ids[i : i + chunk_size]
|
||||
self._index.delete(ids=chunk, **kwargs)
|
||||
|
@ -1,5 +1,6 @@
|
||||
import importlib
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from typing import List
|
||||
|
||||
@ -11,7 +12,6 @@ from langchain.embeddings import OpenAIEmbeddings
|
||||
from langchain.vectorstores.pinecone import Pinecone
|
||||
|
||||
index_name = "langchain-test-index" # name of the index
|
||||
namespace_name = "langchain-test-namespace" # name of the namespace
|
||||
dimension = 1536 # dimension of the embeddings
|
||||
|
||||
|
||||
@ -39,40 +39,28 @@ class TestPinecone:
|
||||
cls.index = pinecone.Index(index_name)
|
||||
|
||||
if index_name in pinecone.list_indexes():
|
||||
index_stats = cls.index.describe_index_stats()
|
||||
if index_stats["dimension"] == dimension:
|
||||
# delete all the vectors in the index if the dimension is the same
|
||||
# from all namespaces
|
||||
index_stats = cls.index.describe_index_stats()
|
||||
for _namespace_name in index_stats["namespaces"].keys():
|
||||
cls.index.delete(delete_all=True, namespace=_namespace_name)
|
||||
pinecone.delete_index(index_name)
|
||||
|
||||
else:
|
||||
pinecone.delete_index(index_name)
|
||||
pinecone.create_index(name=index_name, dimension=dimension)
|
||||
else:
|
||||
pinecone.create_index(name=index_name, dimension=dimension)
|
||||
pinecone.create_index(name=index_name, dimension=dimension)
|
||||
|
||||
# ensure the index is empty
|
||||
index_stats = cls.index.describe_index_stats()
|
||||
assert index_stats["dimension"] == dimension
|
||||
if index_stats["namespaces"].get(namespace_name) is not None:
|
||||
assert index_stats["namespaces"][namespace_name]["vector_count"] == 0
|
||||
assert index_stats["total_vector_count"] == 0
|
||||
|
||||
@classmethod
|
||||
def teardown_class(cls) -> None:
|
||||
index_stats = cls.index.describe_index_stats()
|
||||
for _namespace_name in index_stats["namespaces"].keys():
|
||||
cls.index.delete(delete_all=True, namespace=_namespace_name)
|
||||
if index_name in pinecone.list_indexes():
|
||||
pinecone.delete_index(index_name)
|
||||
pinecone.create_index(index_name, dimension=dimension)
|
||||
|
||||
reset_pinecone()
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup(self) -> None:
|
||||
# delete all the vectors in the index
|
||||
index_stats = self.index.describe_index_stats()
|
||||
for _namespace_name in index_stats["namespaces"].keys():
|
||||
self.index.delete(delete_all=True, namespace=_namespace_name)
|
||||
if index_name in pinecone.list_indexes():
|
||||
pinecone.delete_index(index_name)
|
||||
pinecone.create_index(index_name, dimension=dimension)
|
||||
|
||||
reset_pinecone()
|
||||
|
||||
@ -86,12 +74,11 @@ class TestPinecone:
|
||||
texts.insert(0, needs)
|
||||
|
||||
docsearch = Pinecone.from_texts(
|
||||
texts=texts,
|
||||
embedding=embedding_openai,
|
||||
index_name=index_name,
|
||||
namespace=namespace_name,
|
||||
texts=texts, embedding=embedding_openai, index_name=index_name
|
||||
)
|
||||
output = docsearch.similarity_search(unique_id, k=1, namespace=namespace_name)
|
||||
# wait for the index to be ready
|
||||
time.sleep(20)
|
||||
output = docsearch.similarity_search(unique_id, k=1)
|
||||
assert output == [Document(page_content=needs)]
|
||||
|
||||
@pytest.mark.vcr()
|
||||
@ -110,9 +97,10 @@ class TestPinecone:
|
||||
embedding_openai,
|
||||
index_name=index_name,
|
||||
metadatas=metadatas,
|
||||
namespace=namespace_name,
|
||||
)
|
||||
output = docsearch.similarity_search(needs, k=1, namespace=namespace_name)
|
||||
# wait for the index to be ready
|
||||
time.sleep(20)
|
||||
output = docsearch.similarity_search(needs, k=1)
|
||||
|
||||
# TODO: why metadata={"page": 0.0}) instead of {"page": 0}?
|
||||
assert output == [Document(page_content=needs, metadata={"page": 0.0})]
|
||||
@ -127,11 +115,10 @@ class TestPinecone:
|
||||
embedding_openai,
|
||||
index_name=index_name,
|
||||
metadatas=metadatas,
|
||||
namespace=namespace_name,
|
||||
)
|
||||
output = docsearch.similarity_search_with_score(
|
||||
"foo", k=3, namespace=namespace_name
|
||||
)
|
||||
# wait for the index to be ready
|
||||
time.sleep(20)
|
||||
output = docsearch.similarity_search_with_score("foo", k=3)
|
||||
docs = [o[0] for o in output]
|
||||
scores = [o[1] for o in output]
|
||||
sorted_documents = sorted(docs, key=lambda x: x.metadata["page"])
|
||||
@ -144,57 +131,17 @@ class TestPinecone:
|
||||
]
|
||||
assert scores[0] > scores[1] > scores[2]
|
||||
|
||||
def test_from_existing_index_with_namespaces(
|
||||
self, embedding_openai: OpenAIEmbeddings
|
||||
) -> None:
|
||||
"""Test that namespaces are properly handled."""
|
||||
# Populate two different namespaces within the same index
|
||||
texts_1 = ["foo", "bar", "baz"]
|
||||
metadatas = [{"page": i} for i in range(len(texts_1))]
|
||||
Pinecone.from_texts(
|
||||
texts_1,
|
||||
embedding_openai,
|
||||
index_name=index_name,
|
||||
metadatas=metadatas,
|
||||
namespace=f"{index_name}-1",
|
||||
)
|
||||
|
||||
texts_2 = ["foo2", "bar2", "baz2"]
|
||||
metadatas = [{"page": i} for i in range(len(texts_2))]
|
||||
|
||||
Pinecone.from_texts(
|
||||
texts_2,
|
||||
embedding_openai,
|
||||
index_name=index_name,
|
||||
metadatas=metadatas,
|
||||
namespace=f"{index_name}-2",
|
||||
)
|
||||
|
||||
# Search with namespace
|
||||
docsearch = Pinecone.from_existing_index(
|
||||
index_name=index_name,
|
||||
embedding=embedding_openai,
|
||||
namespace=f"{index_name}-1",
|
||||
)
|
||||
output = docsearch.similarity_search("foo", k=20, namespace=f"{index_name}-1")
|
||||
# check that we don't get results from the other namespace
|
||||
page_contents = sorted(set([o.page_content for o in output]))
|
||||
assert all(content in ["foo", "bar", "baz"] for content in page_contents)
|
||||
assert all(content not in ["foo2", "bar2", "baz2"] for content in page_contents)
|
||||
|
||||
def test_add_documents_with_ids(
|
||||
self, texts: List[str], embedding_openai: OpenAIEmbeddings
|
||||
) -> None:
|
||||
ids = [uuid.uuid4().hex for _ in range(len(texts))]
|
||||
Pinecone.from_texts(
|
||||
texts=texts,
|
||||
ids=ids,
|
||||
embedding=embedding_openai,
|
||||
index_name=index_name,
|
||||
namespace=index_name,
|
||||
texts=texts, ids=ids, embedding=embedding_openai, index_name=index_name
|
||||
)
|
||||
# wait for the index to be ready
|
||||
time.sleep(20)
|
||||
index_stats = self.index.describe_index_stats()
|
||||
assert index_stats["namespaces"][index_name]["vector_count"] == len(texts)
|
||||
assert index_stats["total_vector_count"] == len(texts)
|
||||
|
||||
ids_1 = [uuid.uuid4().hex for _ in range(len(texts))]
|
||||
Pinecone.from_texts(
|
||||
@ -202,7 +149,8 @@ class TestPinecone:
|
||||
ids=ids_1,
|
||||
embedding=embedding_openai,
|
||||
index_name=index_name,
|
||||
namespace=index_name,
|
||||
)
|
||||
# wait for the index to be ready
|
||||
time.sleep(20)
|
||||
index_stats = self.index.describe_index_stats()
|
||||
assert index_stats["namespaces"][index_name]["vector_count"] == len(texts) * 2
|
||||
assert index_stats["total_vector_count"] == len(texts) * 2
|
||||
|
Loading…
Reference in New Issue
Block a user