Add ids parameter for pinecone from_texts / add_texts (#659)

Allow callers to optionally specify a list of IDs for Pinecone entries
rather than having them randomly generated.
This also permits updating the embedding/metadata of existing Pinecone
entries by ID.
harrison/document-split
iocuydi 1 year ago committed by GitHub
parent 54d7f1c933
commit 69998b5fad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -54,6 +54,7 @@ class Pinecone(VectorStore):
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
namespace: Optional[str] = None,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
@ -61,6 +62,7 @@ class Pinecone(VectorStore):
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
ids: Optional list of ids to associate with the texts.
namespace: Optional pinecone namespace to add the texts to.
Returns:
@ -69,14 +71,12 @@ class Pinecone(VectorStore):
"""
# Embed and create the documents
docs = []
ids = []
ids = ids or [str(uuid.uuid4()) for _ in texts]
for i, text in enumerate(texts):
id = str(uuid.uuid4())
embedding = self._embedding_function(text)
metadata = metadatas[i] if metadatas else {}
metadata[self._text_key] = text
docs.append((id, embedding, metadata))
ids.append(id)
docs.append((ids[i], embedding, metadata))
# upsert to Pinecone
self._index.upsert(vectors=docs, namespace=namespace)
return ids
@ -153,6 +153,7 @@ class Pinecone(VectorStore):
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
batch_size: int = 32,
text_key: str = "text",
index_name: Optional[str] = None,
@ -197,7 +198,11 @@ class Pinecone(VectorStore):
i_end = min(i + batch_size, len(texts))
# get batch of texts and ids
lines_batch = texts[i : i + batch_size]
ids_batch = [str(uuid.uuid4()) for n in range(i, i_end)]
# create ids if not provided
if ids:
ids_batch = ids[i : i + batch_size]
else:
ids_batch = [str(uuid.uuid4()) for n in range(i, i_end)]
# create embeddings
embeds = embedding.embed_documents(lines_batch)
# prep metadata and upsert batch

Loading…
Cancel
Save