pinecone: init pkg (#16556)

<!-- Thank you for contributing to LangChain!

Please title your PR "<package>: <description>", where <package> is
whichever of langchain, community, core, experimental, etc. is being
modified.

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes if applicable,
  - **Dependencies:** any dependencies required for this change,
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->
This commit is contained in:
Erick Friis 2024-02-05 11:55:01 -08:00 committed by GitHub
parent 1183769cf7
commit 6ffd5b15bc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 2560 additions and 104 deletions

View File

@ -13,7 +13,16 @@
"This notebook shows how to use functionality related to the `Pinecone` vector database.\n", "This notebook shows how to use functionality related to the `Pinecone` vector database.\n",
"\n", "\n",
"To use Pinecone, you must have an API key. \n", "To use Pinecone, you must have an API key. \n",
"Here are the [installation instructions](https://docs.pinecone.io/docs/quickstart)." "Here are the [installation instructions](https://docs.pinecone.io/docs/quickstart).\n",
"\n",
"Set the following environment variables to make using the `Pinecone` integration easier:\n",
"\n",
"- `PINECONE_API_KEY`: Your Pinecone API key.\n",
"- `PINECONE_INDEX_NAME`: The name of the index you want to use.\n",
"\n",
"And to follow along in this doc, you should also set\n",
"\n",
"- `OPENAI_API_KEY`: Your OpenAI API key, for using `OpenAIEmbeddings`"
] ]
}, },
{ {
@ -25,74 +34,27 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"%pip install --upgrade --quiet pinecone-client langchain-openai tiktoken langchain" "%pip install --upgrade --quiet langchain-pinecone langchain-openai langchain"
] ]
}, },
{ {
"cell_type": "code",
"execution_count": null,
"id": "c1e38361-c1fe-4ac6-86e9-c90ebaf7ae87",
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Pinecone API Key:\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02a536e0-d603-4d79-b18b-1ed562977b40",
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"PINECONE_ENV\"] = getpass.getpass(\"Pinecone Environment:\")"
]
},
{
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "320af802-9271-46ee-948f-d2453933d44b", "id": "42f2ea67",
"metadata": {}, "metadata": {},
"source": [ "source": [
"We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key." "First, let's split our state of the union document into chunked `docs`."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"id": "ffea66e4-bc23-46a9-9580-b348dfe7b7a7",
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aac9563e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain_community.document_loaders import TextLoader\n",
"from langchain_community.vectorstores import Pinecone\n",
"from langchain_openai import OpenAIEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3c3999a", "id": "a3c3999a",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain_community.document_loaders import TextLoader\n", "from langchain_community.document_loaders import TextLoader\n",
"from langchain_openai import OpenAIEmbeddings\n",
"\n", "\n",
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n", "loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
"documents = loader.load()\n", "documents = loader.load()\n",
@ -103,43 +65,52 @@
] ]
}, },
{ {
"cell_type": "code", "cell_type": "markdown",
"execution_count": null, "id": "3a4d377f",
"id": "6e104aee",
"metadata": {}, "metadata": {},
"outputs": [],
"source": [ "source": [
"import pinecone\n", "Now let's assume you have your Pinecone index set up with `dimension=1536`.\n",
"\n", "\n",
"# initialize pinecone\n", "We can connect to our Pinecone index and insert those chunked docs as contents with `Pinecone.from_documents`."
"pinecone.init(\n",
" api_key=os.getenv(\"PINECONE_API_KEY\"), # find at app.pinecone.io\n",
" environment=os.getenv(\"PINECONE_ENV\"), # next to api key in console\n",
")\n",
"\n",
"index_name = \"langchain-demo\"\n",
"\n",
"# First, check if our index already exists. If it doesn't, we create it\n",
"if index_name not in pinecone.list_indexes():\n",
" # we create a new index\n",
" pinecone.create_index(name=index_name, metric=\"cosine\", dimension=1536)\n",
"# The OpenAI embedding model `text-embedding-ada-002 uses 1536 dimensions`\n",
"docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)\n",
"\n",
"# if you already have an index, you can load it like this\n",
"# docsearch = Pinecone.from_existing_index(index_name, embeddings)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"id": "9c608226", "id": "6e104aee",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from langchain_pinecone import Pinecone\n",
"\n",
"index_name = \"langchain-test-index\"\n",
"\n",
"docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ffbcb3fb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while youre at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n"
]
}
],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)\n",
"print(docs[0].page_content)" "print(docs[0].page_content)"
] ]
}, },
@ -156,15 +127,25 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"id": "38a7a60e", "id": "38a7a60e",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"['24631802-4bad-44a7-a4ba-fd71f00cc160']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"index = pinecone.Index(\"langchain-demo\")\n", "vectorstore = Pinecone(index_name=index_name, embedding=embeddings)\n",
"vectorstore = Pinecone(index, embeddings.embed_query, \"text\")\n",
"\n", "\n",
"vectorstore.add_texts(\"More text!\")" "vectorstore.add_texts([\"More text!\"])"
] ]
}, },
{ {
@ -180,10 +161,91 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"id": "a359ed74", "id": "a359ed74",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"## Document 0\n",
"\n",
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while youre at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n",
"\n",
"## Document 1\n",
"\n",
"And Im taking robust action to make sure the pain of our sanctions is targeted at Russias economy. And I will use every tool at our disposal to protect American businesses and consumers. \n",
"\n",
"Tonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world. \n",
"\n",
"America will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our allies. \n",
"\n",
"These steps will help blunt gas prices here at home. And I know the news about whats happening can seem alarming. \n",
"\n",
"But I want you to know that we are going to be okay. \n",
"\n",
"When the history of this era is written Putins war on Ukraine will have left Russia weaker and the rest of the world stronger. \n",
"\n",
"While it shouldnt have taken something so terrible for people around the world to see whats at stake now everyone sees it clearly.\n",
"\n",
"## Document 2\n",
"\n",
"We cant change how divided weve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n",
"\n",
"I recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n",
"\n",
"They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n",
"\n",
"Officer Mora was 27 years old. \n",
"\n",
"Officer Rivera was 22. \n",
"\n",
"Both Dominican Americans whod grown up on the same streets they later chose to patrol as police officers. \n",
"\n",
"I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n",
"\n",
"Ive worked on these issues a long time. \n",
"\n",
"I know what works: Investing in crime prevention and community police officers wholl walk the beat, wholl know the neighborhood, and who can restore trust and safety.\n",
"\n",
"## Document 3\n",
"\n",
"One was stationed at bases and breathing in toxic smoke from “burn pits” that incinerated wastes of war—medical and hazard material, jet fuel, and more. \n",
"\n",
"When they came home, many of the worlds fittest and best trained warriors were never the same. \n",
"\n",
"Headaches. Numbness. Dizziness. \n",
"\n",
"A cancer that would put them in a flag-draped coffin. \n",
"\n",
"I know. \n",
"\n",
"One of those soldiers was my son Major Beau Biden. \n",
"\n",
"We dont know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. \n",
"\n",
"But Im committed to finding out everything we can. \n",
"\n",
"Committed to military families like Danielle Robinson from Ohio. \n",
"\n",
"The widow of Sergeant First Class Heath Robinson. \n",
"\n",
"He was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. \n",
"\n",
"Stationed near Baghdad, just yards from burn pits the size of football fields. \n",
"\n",
"Heaths widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter.\n"
]
}
],
"source": [ "source": [
"retriever = docsearch.as_retriever(search_type=\"mmr\")\n", "retriever = docsearch.as_retriever(search_type=\"mmr\")\n",
"matched_docs = retriever.get_relevant_documents(query)\n", "matched_docs = retriever.get_relevant_documents(query)\n",
@ -203,15 +265,56 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 10,
"id": "9ca82740", "id": "9ca82740",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1. Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while youre at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence. \n",
"\n",
"2. We cant change how divided weve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n",
"\n",
"I recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n",
"\n",
"They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n",
"\n",
"Officer Mora was 27 years old. \n",
"\n",
"Officer Rivera was 22. \n",
"\n",
"Both Dominican Americans whod grown up on the same streets they later chose to patrol as police officers. \n",
"\n",
"I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n",
"\n",
"Ive worked on these issues a long time. \n",
"\n",
"I know what works: Investing in crime prevention and community police officers wholl walk the beat, wholl know the neighborhood, and who can restore trust and safety. \n",
"\n"
]
}
],
"source": [ "source": [
"found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)\n", "found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)\n",
"for i, doc in enumerate(found_docs):\n", "for i, doc in enumerate(found_docs):\n",
" print(f\"{i + 1}.\", doc.page_content, \"\\n\")" " print(f\"{i + 1}.\", doc.page_content, \"\\n\")"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0fd750b",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -230,7 +333,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.6" "version": "3.11.4"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -7,6 +7,7 @@ import warnings
from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Tuple, Union from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Tuple, Union
import numpy as np import numpy as np
from langchain_core._api.deprecation import deprecated
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings from langchain_core.embeddings import Embeddings
from langchain_core.utils.iter import batch_iterate from langchain_core.utils.iter import batch_iterate
@ -41,24 +42,16 @@ def _is_pinecone_v3() -> bool:
return version.parse(pinecone_client_version) >= version.parse("3.0.0.dev") return version.parse(pinecone_client_version) >= version.parse("3.0.0.dev")
@deprecated(
since="0.0.18", removal="0.2.0", alternative_import="langchain_pinecone.Pinecone"
)
class Pinecone(VectorStore): class Pinecone(VectorStore):
"""`Pinecone` vector store. """`Pinecone` vector store.
To use, you should have the ``pinecone-client`` python package installed. To use, you should have the ``pinecone-client`` python package installed.
Example: This version of Pinecone is deprecated. Please use `langchain_pinecone.Pinecone`
.. code-block:: python instead.
from langchain_community.vectorstores import Pinecone
from langchain_community.embeddings.openai import OpenAIEmbeddings
import pinecone
# The environment should be the one specified next to the API key
# in your Pinecone console
pinecone.init(api_key="***", environment="...")
index = pinecone.Index("langchain-demo")
embeddings = OpenAIEmbeddings()
vectorstore = Pinecone(index, embeddings, "text")
""" """
def __init__( def __init__(

1
libs/partners/pinecone/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
__pycache__

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 LangChain, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,61 @@
.PHONY: all format lint test tests integration_tests docker_tests help extended_tests
# Default target executed when no arguments are given to make.
all: help
# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/
integration_tests: TEST_FILE = tests/integration_tests/
test integration_tests:
poetry run pytest $(TEST_FILE)
tests:
poetry run pytest $(TEST_FILE)
######################
# LINTING AND FORMATTING
######################
# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/pinecone --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_pinecone
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test
# Run the linters, import-order check and mypy over $(PYTHON_FILES).
# `mkdir -p` is used so that re-running the target does not print a
# "File exists" error once the mypy cache directory has been created.
lint lint_diff lint_package lint_tests:
	poetry run ruff .
	poetry run ruff format $(PYTHON_FILES) --diff
	poetry run ruff --select I $(PYTHON_FILES)
	mkdir -p $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)
format format_diff:
poetry run ruff format $(PYTHON_FILES)
poetry run ruff --select I --fix $(PYTHON_FILES)
spell_check:
poetry run codespell --toml pyproject.toml
spell_fix:
poetry run codespell --toml pyproject.toml -w
check_imports: $(shell find langchain_pinecone -name '*.py')
poetry run python ./scripts/check_imports.py $^
######################
# HELP
######################
help:
@echo '----'
@echo 'check_imports - check imports'
@echo 'format - run code formatters'
@echo 'lint - run linters'
@echo 'test - run unit tests'
@echo 'tests - run unit tests'
@echo 'test TEST_FILE=<test_file> - run all tests in file'

View File

@ -0,0 +1,27 @@
# langchain-pinecone
This package contains the LangChain integration with Pinecone.
## Installation
```bash
pip install -U langchain-pinecone
```
And you should configure credentials by setting the following environment variables:
- `PINECONE_API_KEY`
- `PINECONE_INDEX_NAME`
- `PINECONE_ENVIRONMENT`
## Usage
The `Pinecone` class exposes the connection to the Pinecone vector store.
```python
from langchain_pinecone import Pinecone
embeddings = ... # use a LangChain Embeddings class
vectorstore = Pinecone(embedding=embeddings)
```

View File

@ -0,0 +1,5 @@
from langchain_pinecone.vectorstores import Pinecone
__all__ = [
"Pinecone",
]

View File

@ -0,0 +1,71 @@
from enum import Enum
from typing import List, Union
import numpy as np
import simsimd # type: ignore
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
class DistanceStrategy(str, Enum):
    """Names of the supported strategies for measuring distance between vectors.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
    MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
    COSINE = "COSINE"
def maximal_marginal_relevance(
    query_embedding: np.ndarray,
    embedding_list: list,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[int]:
    """Select candidate indices by maximal marginal relevance (MMR).

    The first index chosen is the candidate with the highest cosine
    similarity to the query. Every later pick maximises
    ``lambda_mult * sim_to_query - (1 - lambda_mult) * max_sim_to_chosen``.

    Args:
        query_embedding: Query vector; a 1-D array is expanded to one row.
        embedding_list: Candidate embeddings to choose from.
        lambda_mult: Weight of query similarity versus redundancy.
        k: Maximum number of indices to return.

    Returns:
        Indices into ``embedding_list`` in selection order; empty when
        ``k`` or the number of candidates is not positive.
    """
    target_count = min(k, len(embedding_list))
    if target_count <= 0:
        return []

    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)
    query_scores = cosine_similarity(query_embedding, embedding_list)[0]

    first_pick = int(np.argmax(query_scores))
    chosen = [first_pick]
    chosen_vectors = np.array([embedding_list[first_pick]])

    while len(chosen) < target_count:
        # Similarity of every candidate to everything already chosen.
        pairwise = cosine_similarity(embedding_list, chosen_vectors)
        top_score, top_index = -np.inf, -1
        for candidate, relevance in enumerate(query_scores):
            if candidate not in chosen:
                redundancy = max(pairwise[candidate])
                mmr_score = (
                    lambda_mult * relevance - (1 - lambda_mult) * redundancy
                )
                if mmr_score > top_score:
                    top_score, top_index = mmr_score, candidate
        chosen.append(top_index)
        chosen_vectors = np.append(
            chosen_vectors, [embedding_list[top_index]], axis=0
        )
    return chosen
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Compute cosine similarity between the rows of ``X`` and the rows of ``Y``.

    Args:
        X: Matrix-like input whose second dimension is the vector width.
        Y: Matrix-like input with the same width as ``X``.

    Returns:
        An empty array when either input has length 0; otherwise
        ``1 - simsimd.cdist(X, Y, metric="cosine")``. A bare float result
        is wrapped in a one-element array.

    Raises:
        ValueError: If ``X`` and ``Y`` have a different number of columns.
    """
    if len(X) == 0 or len(Y) == 0:
        return np.array([])

    # Convert once, straight to float32, instead of building an intermediate
    # array of the default dtype and copying it a second time.
    X = np.asarray(X, dtype=np.float32)
    Y = np.asarray(Y, dtype=np.float32)
    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
            f"and Y has shape {Y.shape}."
        )

    # NOTE(review): the cdist result is passed through np.asarray on the
    # assumption that some simsimd releases return their own tensor type
    # rather than an ndarray - confirm against the pinned simsimd version.
    Z = 1 - np.asarray(simsimd.cdist(X, Y, metric="cosine"))
    if isinstance(Z, float):
        return np.array([Z])
    return Z

View File

@ -0,0 +1,487 @@
from __future__ import annotations
import logging
import os
import uuid
from typing import (
TYPE_CHECKING,
Any,
Callable,
Iterable,
List,
Optional,
Tuple,
TypeVar,
)
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils.iter import batch_iterate
from langchain_core.vectorstores import VectorStore
from pinecone import Pinecone as PineconeClient # type: ignore
from langchain_pinecone._utilities import DistanceStrategy, maximal_marginal_relevance
if TYPE_CHECKING:
from pinecone import Index
logger = logging.getLogger(__name__)
VST = TypeVar("VST", bound=VectorStore)
class Pinecone(VectorStore):
"""`Pinecone` vector store.
Example:
.. code-block:: python
from langchain_pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
index_name = "my-index"
namespace = "my-namespace"
vectorstore = Pinecone(
index_name=index_name,
embedding=embeddings,
namespace=namespace,
)
"""
def __init__(
    self,
    # `index` and `embedding` default to None so callers may rely on
    # environment configuration; missing values are rejected explicitly
    # below. The positional `index` parameter is retained for backwards
    # compatibility.
    index: Optional[Any] = None,
    embedding: Optional[Embeddings] = None,
    text_key: Optional[str] = "text",
    namespace: Optional[str] = None,
    distance_strategy: Optional[DistanceStrategy] = DistanceStrategy.COSINE,
    *,
    pinecone_api_key: Optional[str] = None,
    index_name: Optional[str] = None,
):
    """Set up the vector store around a Pinecone index.

    Args:
        index: Pre-built index object. When truthy it is used as-is and
            ``pinecone_api_key`` / ``index_name`` are not consulted.
        embedding: Embeddings object used to embed texts and queries.
        text_key: Metadata key under which the document text is stored.
        namespace: Default namespace for operations on this store.
        distance_strategy: Strategy used to pick the relevance score function.
        pinecone_api_key: API key; falls back to ``PINECONE_API_KEY``.
        index_name: Index name; falls back to ``PINECONE_INDEX_NAME``.

    Raises:
        ValueError: If ``embedding`` or ``text_key`` is None, or if no
            ``index`` is given and the API key or index name cannot be
            resolved from the arguments or the environment.
    """
    if embedding is None:
        raise ValueError("Embedding must be provided")
    self._embedding = embedding

    if text_key is None:
        raise ValueError("Text key must be provided")
    self._text_key = text_key

    self._namespace = namespace
    self.distance_strategy = distance_strategy

    if index:
        # Legacy path: the caller built the index object externally.
        self._index = index
        return

    # Otherwise build the client and index handle here.
    resolved_api_key = pinecone_api_key or os.getenv("PINECONE_API_KEY") or ""
    if not resolved_api_key:
        raise ValueError(
            "Pinecone API key must be provided in either `pinecone_api_key` "
            "or `PINECONE_API_KEY` environment variable"
        )

    resolved_index_name = index_name or os.getenv("PINECONE_INDEX_NAME") or ""
    if not resolved_index_name:
        raise ValueError(
            "Pinecone index name must be provided in either `index_name` "
            "or `PINECONE_INDEX_NAME` environment variable"
        )

    self._index = PineconeClient(api_key=resolved_api_key).Index(
        resolved_index_name
    )
@property
def embeddings(self) -> Optional[Embeddings]:
"""Access the query embedding object if available."""
return self._embedding
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
namespace: Optional[str] = None,
batch_size: int = 32,
embedding_chunk_size: int = 1000,
*,
async_req: bool = True,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
Upsert optimization is done by chunking the embeddings and upserting them.
This is done to avoid memory issues and optimize using HTTP based embeddings.
For OpenAI embeddings, use pool_threads>4 when constructing the pinecone.Index,
embedding_chunk_size>1000 and batch_size~64 for best performance.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
ids: Optional list of ids to associate with the texts.
namespace: Optional pinecone namespace to add the texts to.
batch_size: Batch size to use when adding the texts to the vectorstore.
embedding_chunk_size: Chunk size to use when embedding the texts.
Returns:
List of ids from adding the texts into the vectorstore.
"""
if namespace is None:
namespace = self._namespace
texts = list(texts)
ids = ids or [str(uuid.uuid4()) for _ in texts]
metadatas = metadatas or [{} for _ in texts]
for metadata, text in zip(metadatas, texts):
metadata[self._text_key] = text
# For loops to avoid memory issues and optimize when using HTTP based embeddings
# The first loop runs the embeddings, it benefits when using OpenAI embeddings
# The second loops runs the pinecone upsert asynchronously.
for i in range(0, len(texts), embedding_chunk_size):
chunk_texts = texts[i : i + embedding_chunk_size]
chunk_ids = ids[i : i + embedding_chunk_size]
chunk_metadatas = metadatas[i : i + embedding_chunk_size]
embeddings = self._embedding.embed_documents(chunk_texts)
async_res = [
self._index.upsert(
vectors=batch,
namespace=namespace,
async_req=async_req,
**kwargs,
)
for batch in batch_iterate(
batch_size, zip(chunk_ids, embeddings, chunk_metadatas)
)
]
[res.get() for res in async_res]
return ids
def similarity_search_with_score(
self,
query: str,
k: int = 4,
filter: Optional[dict] = None,
namespace: Optional[str] = None,
) -> List[Tuple[Document, float]]:
"""Return pinecone documents most similar to query, along with scores.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Dictionary of argument(s) to filter on metadata
namespace: Namespace to search in. Default will search in '' namespace.
Returns:
List of Documents most similar to the query and score for each
"""
return self.similarity_search_by_vector_with_score(
self._embedding.embed_query(query), k=k, filter=filter, namespace=namespace
)
def similarity_search_by_vector_with_score(
self,
embedding: List[float],
*,
k: int = 4,
filter: Optional[dict] = None,
namespace: Optional[str] = None,
) -> List[Tuple[Document, float]]:
"""Return pinecone documents most similar to embedding, along with scores."""
if namespace is None:
namespace = self._namespace
docs = []
results = self._index.query(
vector=embedding,
top_k=k,
include_metadata=True,
namespace=namespace,
filter=filter,
)
for res in results["matches"]:
metadata = res["metadata"]
if self._text_key in metadata:
text = metadata.pop(self._text_key)
score = res["score"]
docs.append((Document(page_content=text, metadata=metadata), score))
else:
logger.warning(
f"Found document with no `{self._text_key}` key. Skipping."
)
return docs
def similarity_search(
self,
query: str,
k: int = 4,
filter: Optional[dict] = None,
namespace: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Return pinecone documents most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Dictionary of argument(s) to filter on metadata
namespace: Namespace to search in. Default will search in '' namespace.
Returns:
List of Documents most similar to the query and score for each
"""
docs_and_scores = self.similarity_search_with_score(
query, k=k, filter=filter, namespace=namespace, **kwargs
)
return [doc for doc, _ in docs_and_scores]
def _select_relevance_score_fn(self) -> Callable[[float], float]:
    """Return the relevance score function matching ``self.distance_strategy``.

    The 'correct' relevance function may differ depending on a few things,
    including:
    - the distance / similarity metric used by the VectorStore
    - the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
    - embedding dimensionality
    - etc.

    Raises:
        ValueError: If the distance strategy is not one of the known members.
    """
    # Pairs are checked in order with ``==`` and the method is only looked
    # up on a match.
    strategy_to_method_name = (
        (DistanceStrategy.COSINE, "_cosine_relevance_score_fn"),
        (
            DistanceStrategy.MAX_INNER_PRODUCT,
            "_max_inner_product_relevance_score_fn",
        ),
        (DistanceStrategy.EUCLIDEAN_DISTANCE, "_euclidean_relevance_score_fn"),
    )
    for strategy, method_name in strategy_to_method_name:
        if self.distance_strategy == strategy:
            return getattr(self, method_name)

    raise ValueError(
        "Unknown distance strategy, must be cosine, max_inner_product "
        "(dot product), or euclidean"
    )
@staticmethod
def _cosine_relevance_score_fn(score: float) -> float:
"""Pinecone returns cosine similarity scores between [-1,1]"""
return (score + 1) / 2
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    filter: Optional[dict] = None,
    namespace: Optional[str] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        filter: Metadata filter forwarded to the index query.
        namespace: Namespace to query; defaults to the instance namespace.
        **kwargs: Accepted but not used by this method.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    if namespace is None:
        namespace = self._namespace

    # Fetch candidates together with their vectors for the MMR re-ranking.
    response = self._index.query(
        vector=[embedding],
        top_k=fetch_k,
        include_values=True,
        include_metadata=True,
        namespace=namespace,
        filter=filter,
    )
    matches = response["matches"]
    candidate_vectors = [match["values"] for match in matches]

    picked_positions = maximal_marginal_relevance(
        np.array([embedding], dtype=np.float32),
        candidate_vectors,
        k=k,
        lambda_mult=lambda_mult,
    )

    documents = []
    for position in picked_positions:
        metadata = response["matches"][position]["metadata"]
        # The text is removed from the metadata and becomes the page content.
        page_content = metadata.pop(self._text_key)
        documents.append(Document(page_content=page_content, metadata=metadata))
    return documents
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    filter: Optional[dict] = None,
    namespace: Optional[str] = None,
    **kwargs: Any,
) -> List[Document]:
    """Embed ``query`` and return documents chosen by maximal marginal relevance.

    The query text is embedded with the store's embedding model and the
    result is passed to ``max_marginal_relevance_search_by_vector``.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to the MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results, with 0 corresponding to maximum
            diversity and 1 to minimum diversity. Defaults to 0.5.
        filter: Optional metadata filter, passed through unchanged.
        namespace: Optional namespace, passed through unchanged.
        **kwargs: Accepted for interface compatibility; not forwarded.

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    query_vector = self._embedding.embed_query(query)
    documents = self.max_marginal_relevance_search_by_vector(
        query_vector, k, fetch_k, lambda_mult, filter, namespace
    )
    return documents
@classmethod
def get_pinecone_index(
    cls,
    index_name: Optional[str],
    pool_threads: int = 4,
    *,
    pinecone_api_key: Optional[str] = None,
) -> Index:
    """Return a Pinecone Index instance.

    Args:
        index_name: Name of the index to use.
        pool_threads: Number of threads to use for index upsert.
        pinecone_api_key: API key for the Pinecone client. When not given,
            the ``PINECONE_API_KEY`` environment variable is used, and an
            empty string if that is unset as well.

    Returns:
        Pinecone Index instance.

    Raises:
        ValueError: If the project has no indexes, or if ``index_name`` is
            not among the indexes the client lists.
    """
    api_key = pinecone_api_key or os.environ.get("PINECONE_API_KEY") or ""
    client = PineconeClient(api_key=api_key, pool_threads=pool_threads)
    listing = client.list_indexes()
    available = [entry.name for entry in listing.index_list["indexes"]]

    if index_name in available:
        return client.Index(index_name)

    # Distinguish "nothing exists at all" from "wrong name" for a clearer error.
    if not available:
        raise ValueError(
            "No active indexes found in your Pinecone project, "
            "are you sure you're using the right Pinecone API key and Environment? "
            "Please double check your Pinecone dashboard."
        )
    raise ValueError(
        f"Index '{index_name}' not found in your Pinecone project. "
        f"Did you mean one of the following indexes: {', '.join(available)}"
    )
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    batch_size: int = 32,
    text_key: str = "text",
    namespace: Optional[str] = None,
    index_name: Optional[str] = None,
    upsert_kwargs: Optional[dict] = None,
    pool_threads: int = 4,
    embeddings_chunk_size: int = 1000,
    **kwargs: Any,
) -> Pinecone:
    """Construct Pinecone wrapper from raw documents.

    This is a user friendly interface that:
        1. Embeds documents.
        2. Adds the documents to a provided Pinecone index

    This is intended to be a quick way to get started.

    The `pool_threads` affects the speed of the upsert operations.

    Args:
        texts: Texts to embed and add to the index.
        embedding: Embedding model handed to the created store.
        metadatas: Optional metadata dicts, forwarded to ``add_texts``.
        ids: Optional vector ids, forwarded to ``add_texts``.
        batch_size: Forwarded to ``add_texts`` as ``batch_size``.
        text_key: Metadata key under which the text is stored.
        namespace: Namespace used for the store and for the upsert.
        index_name: Name of the existing Pinecone index to use.
        upsert_kwargs: Extra keyword arguments forwarded to ``add_texts``.
        pool_threads: Forwarded to ``get_pinecone_index``.
        embeddings_chunk_size: Forwarded to ``add_texts`` as
            ``embedding_chunk_size``.
        **kwargs: Extra keyword arguments for the class constructor.

    Returns:
        The store, after the texts have been added.

    Example:
        .. code-block:: python

            from langchain_openai import OpenAIEmbeddings
            from langchain_pinecone import Pinecone

            # The API key is read from the PINECONE_API_KEY
            # environment variable.
            embeddings = OpenAIEmbeddings()
            vectorstore = Pinecone.from_texts(
                texts,
                embeddings,
                index_name="langchain-demo",
            )
    """
    extra_upsert_options = upsert_kwargs or {}
    index = cls.get_pinecone_index(index_name, pool_threads)
    store = cls(index, embedding, text_key, namespace, **kwargs)
    store.add_texts(
        texts,
        metadatas=metadatas,
        ids=ids,
        namespace=namespace,
        batch_size=batch_size,
        embedding_chunk_size=embeddings_chunk_size,
        **extra_upsert_options,
    )
    return store
@classmethod
def from_existing_index(
    cls,
    index_name: str,
    embedding: Embeddings,
    text_key: str = "text",
    namespace: Optional[str] = None,
    pool_threads: int = 4,
) -> Pinecone:
    """Build a vector store on top of an already existing Pinecone index.

    Args:
        index_name: Name of the index to look up.
        embedding: Embedding model handed to the created store.
        text_key: Metadata key under which the text is stored.
        namespace: Default namespace for the created store.
        pool_threads: Forwarded to ``get_pinecone_index``.

    Returns:
        A store wrapping the index; no texts are added.
    """
    index = cls.get_pinecone_index(index_name, pool_threads)
    store = cls(index, embedding, text_key, namespace)
    return store
def delete(
    self,
    ids: Optional[List[str]] = None,
    delete_all: Optional[bool] = None,
    namespace: Optional[str] = None,
    filter: Optional[dict] = None,
    **kwargs: Any,
) -> None:
    """Delete vectors by ID, by metadata filter, or all at once.

    The first applicable mode wins, checked in this order: ``delete_all``,
    ``ids``, ``filter``.

    Args:
        ids: List of ids to delete. Sent to the index in chunks of 1000.
        delete_all: When truthy, delete every vector in the namespace.
        namespace: Namespace to delete from; falls back to the store's own
            namespace when None.
        filter: Dictionary of conditions to filter vectors to delete.
        **kwargs: Extra keyword arguments forwarded to the index's ``delete``.

    Raises:
        ValueError: If none of ``ids``, ``delete_all`` or ``filter`` is given.
    """
    target_namespace = self._namespace if namespace is None else namespace

    if delete_all:
        self._index.delete(delete_all=True, namespace=target_namespace, **kwargs)
        return None

    if ids is not None:
        # IDs are sent in fixed-size batches rather than in a single request.
        batch_limit = 1000
        for start in range(0, len(ids), batch_limit):
            batch = ids[start : start + batch_limit]
            self._index.delete(ids=batch, namespace=target_namespace, **kwargs)
        return None

    if filter is None:
        raise ValueError("Either ids, delete_all, or filter must be provided.")

    self._index.delete(filter=filter, namespace=target_namespace, **kwargs)
    return None

1213
libs/partners/pinecone/poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,97 @@
[tool.poetry]
name = "langchain-pinecone"
version = "0.0.1"
description = "An integration package connecting Pinecone and LangChain"
authors = []
readme = "README.md"
repository = "https://github.com/langchain-ai/langchain"
license = "MIT"
[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/pinecone"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langchain-core = ">=0.0.12"
pinecone-client = {version = "^3", python = ">=3.8,<3.13"}
simsimd = "^3.6.3"
numpy = "^1"
[tool.poetry.group.test]
optional = true
[tool.poetry.group.test.dependencies]
pytest = "^7.3.0"
freezegun = "^1.2.2"
pytest-mock = "^3.10.0"
syrupy = "^4.0.2"
pytest-watcher = "^0.3.4"
pytest-asyncio = "^0.21.1"
langchain-core = {path = "../../core", develop = true}
[tool.poetry.group.codespell]
optional = true
[tool.poetry.group.codespell.dependencies]
codespell = "^2.2.0"
[tool.poetry.group.test_integration]
optional = true
[tool.poetry.group.test_integration.dependencies]
langchain-openai = ">=0.0.3,<0.1"
[tool.poetry.group.lint]
optional = true
[tool.poetry.group.lint.dependencies]
ruff = "^0.1.5"
[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
langchain-core = {path = "../../core", develop = true}
[tool.poetry.group.dev]
optional = true
[tool.poetry.group.dev.dependencies]
langchain-core = {path = "../../core", develop = true}
[tool.ruff]
select = [
"E", # pycodestyle
"F", # pyflakes
"I", # isort
]
[tool.mypy]
disallow_untyped_defs = "True"
[tool.coverage.run]
omit = [
"tests/*",
]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.pytest.ini_options]
# --strict-markers will raise errors on unknown marks.
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
#
# https://docs.pytest.org/en/7.1.x/reference/reference.html
# --strict-config any warnings encountered while parsing the `pytest`
# section of the configuration file raise errors.
#
# https://github.com/tophat/syrupy
# --snapshot-warn-unused    Prints a warning on unused snapshots rather than failing the test suite.
addopts = "--snapshot-warn-unused --strict-markers --strict-config --durations=5"
# Registering custom markers.
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
markers = [
"requires: mark tests as requiring a specific library",
"asyncio: mark tests as requiring asyncio",
"compile: mark placeholder test used to compile integration tests without running them",
]
asyncio_mode = "auto"

View File

@ -0,0 +1,17 @@
import sys
import traceback
from importlib.machinery import SourceFileLoader
def has_import_failures(files) -> bool:
    """Load each Python source file and report whether any of them failed.

    Every path is loaded as a module, which executes its top-level code and
    therefore surfaces broken imports. For each failing file the path and the
    traceback are printed, followed by a blank line. All files are checked
    even after a failure, so every problem is reported in one run.

    Args:
        files: Iterable of paths to Python source files.

    Returns:
        True if loading at least one file raised an exception, else False.
    """
    failed = False
    for file in files:
        try:
            SourceFileLoader("x", file).load_module()
        except Exception:
            # Record the failure; the original misspelled this flag, so the
            # script always exited with status 0.
            failed = True
            print(file)
            traceback.print_exc()
            print()
    return failed


if __name__ == "__main__":
    # Paths come from the command line; exit non-zero if any file failed.
    sys.exit(1 if has_import_failures(sys.argv[1:]) else 0)

View File

@ -0,0 +1,27 @@
#!/bin/bash
#
# Reports every line in the tracked files of a Git repository that starts
# with "import pydantic" or "from pydantic", and fails if there is any.
#
# Usage: ./scripts/check_pydantic.sh /path/to/repository

# Exactly one argument (the repository path) is required.
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 /path/to/repository"
  exit 1
fi

repository_path="$1"

# Collect the offending lines from the repository's tracked files.
result=$(git -C "$repository_path" grep -E '^import pydantic|^from pydantic')

# Nothing captured means there is nothing to report.
if [ -z "$result" ]; then
  exit 0
fi

echo "ERROR: The following lines need to be updated:"
echo "$result"
echo "Please replace the code with an import from langchain_core.pydantic_v1."
echo "For example, replace 'from pydantic import BaseModel'"
echo "with 'from langchain_core.pydantic_v1 import BaseModel'"
exit 1

View File

@ -0,0 +1,17 @@
#!/bin/bash

set -eu

# Number of forbidden import patterns that matched at least one line.
errors=0

# This package must not import from langchain or langchain_experimental.
# git grep prints the offending lines itself; it runs as an `if` condition
# so that a non-match does not abort the script under `set -e`.
for pattern in '^from langchain\.' '^from langchain_experimental\.'; do
  if git --no-pager grep "$pattern" .; then
    errors=$((errors+1))
  fi
done

# Fail when any forbidden pattern matched.
if [ "$errors" -gt 0 ]; then
  exit 1
fi
exit 0

View File

View File

@ -0,0 +1,7 @@
import pytest
@pytest.mark.compile
def test_placeholder() -> None:
    """Do nothing; exists so integration tests can be compiled without running real tests."""
    return None

View File

@ -0,0 +1,288 @@
import os
import time
import uuid
from typing import List
import numpy as np
import pinecone # type: ignore
import pytest
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from pinecone import PodSpec
from langchain_pinecone import Pinecone
INDEX_NAME = "langchain-test-index" # name of the index
NAMESPACE_NAME = "langchain-test-namespace" # name of the namespace
DIMENSION = 1536 # dimension of the embeddings
DEFAULT_SLEEP = 20
class TestPinecone:
    """Integration tests that run the ``Pinecone`` vector store against a live index.

    ``setup_class`` reads ``PINECONE_API_KEY`` and ``PINECONE_ENVIRONMENT`` from
    the environment and recreates the ``INDEX_NAME`` index. Tests sleep for
    ``DEFAULT_SLEEP`` seconds after writing to avoid racing with the index.
    """

    index: "pinecone.Index"

    @classmethod
    def setup_class(cls) -> None:
        """Recreate the test index and check that the test namespace is empty."""
        client = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])
        # Drop an index left over from an earlier run before creating a new one.
        if any(i["name"] == INDEX_NAME for i in client.list_indexes()):
            client.delete_index(INDEX_NAME)
        client.create_index(
            name=INDEX_NAME,
            dimension=DIMENSION,
            metric="cosine",
            spec=PodSpec(environment=os.environ["PINECONE_ENVIRONMENT"]),
        )

        cls.index = client.Index(INDEX_NAME)

        # ensure the index is empty
        index_stats = cls.index.describe_index_stats()
        assert index_stats["dimension"] == DIMENSION
        if index_stats["namespaces"].get(NAMESPACE_NAME) is not None:
            assert index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == 0

    @classmethod
    def teardown_class(cls) -> None:
        """Delete all vectors from every namespace the index reports."""
        index_stats = cls.index.describe_index_stats()
        for _namespace_name in index_stats["namespaces"].keys():
            cls.index.delete(delete_all=True, namespace=_namespace_name)

    @pytest.fixture(autouse=True)
    def setup(self) -> None:
        """Empty the shared test namespace before every test."""
        # Only NAMESPACE_NAME is cleared here; teardown_class clears the rest.
        self.index.delete(delete_all=True, namespace=NAMESPACE_NAME)
        time.sleep(DEFAULT_SLEEP)  # prevent race condition with previous step

    @pytest.fixture
    def embedding_openai(self) -> OpenAIEmbeddings:
        """Return an OpenAI embedding model with default settings."""
        return OpenAIEmbeddings()

    @pytest.fixture
    def texts(self) -> List[str]:
        """Return a fresh list of sample texts."""
        return ["foo", "bar", "baz"]

    def test_from_texts(
        self, texts: List[str], embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and search."""
        unique_id = uuid.uuid4().hex
        needs = f"foobuu {unique_id} booo"
        texts.insert(0, needs)

        docsearch = Pinecone.from_texts(
            texts=texts,
            embedding=embedding_openai,
            index_name=INDEX_NAME,
            namespace=NAMESPACE_NAME,
        )
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        output = docsearch.similarity_search(unique_id, k=1, namespace=NAMESPACE_NAME)
        assert output == [Document(page_content=needs)]

    def test_from_texts_with_metadatas(
        self, texts: List[str], embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and search with metadata."""
        unique_id = uuid.uuid4().hex
        needs = f"foobuu {unique_id} booo"
        texts = [needs] + texts

        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Pinecone.from_texts(
            texts,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
            namespace=NAMESPACE_NAME,
        )
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        output = docsearch.similarity_search(needs, k=1, namespace=NAMESPACE_NAME)

        # TODO: why metadata={"page": 0.0}) instead of {"page": 0}?
        assert output == [Document(page_content=needs, metadata={"page": 0.0})]

    def test_from_texts_with_scores(self, embedding_openai: OpenAIEmbeddings) -> None:
        """Test end to end construction and search with scores and IDs."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Pinecone.from_texts(
            texts,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
            namespace=NAMESPACE_NAME,
        )
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        output = docsearch.similarity_search_with_score(
            "foo", k=3, namespace=NAMESPACE_NAME
        )
        docs = [o[0] for o in output]
        scores = [o[1] for o in output]
        sorted_documents = sorted(docs, key=lambda x: x.metadata["page"])

        # TODO: why metadata={"page": 0.0}) instead of {"page": 0}, etc???
        assert sorted_documents == [
            Document(page_content="foo", metadata={"page": 0.0}),
            Document(page_content="bar", metadata={"page": 1.0}),
            Document(page_content="baz", metadata={"page": 2.0}),
        ]
        assert scores[0] > scores[1] > scores[2]

    def test_from_existing_index_with_namespaces(
        self, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test that namespaces are properly handled."""
        # Populate two namespaces of the same index with different texts
        texts_1 = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts_1))]
        Pinecone.from_texts(
            texts_1,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
            namespace=f"{INDEX_NAME}-1",
        )

        texts_2 = ["foo2", "bar2", "baz2"]
        metadatas = [{"page": i} for i in range(len(texts_2))]

        Pinecone.from_texts(
            texts_2,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
            namespace=f"{INDEX_NAME}-2",
        )

        time.sleep(DEFAULT_SLEEP)  # prevent race condition

        # Search with namespace
        docsearch = Pinecone.from_existing_index(
            index_name=INDEX_NAME,
            embedding=embedding_openai,
            namespace=f"{INDEX_NAME}-1",
        )
        output = docsearch.similarity_search("foo", k=20, namespace=f"{INDEX_NAME}-1")
        # check that we don't get results from the other namespace
        page_contents = sorted({o.page_content for o in output})
        assert all(content in ["foo", "bar", "baz"] for content in page_contents)
        assert all(content not in ["foo2", "bar2", "baz2"] for content in page_contents)

    def test_add_documents_with_ids(
        self, texts: List[str], embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test that adding texts under new ids grows the namespace's vector count."""
        ids = [uuid.uuid4().hex for _ in range(len(texts))]
        Pinecone.from_texts(
            texts=texts,
            ids=ids,
            embedding=embedding_openai,
            index_name=INDEX_NAME,
            namespace=NAMESPACE_NAME,
        )
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        index_stats = self.index.describe_index_stats()
        assert index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == len(texts)

        ids_1 = [uuid.uuid4().hex for _ in range(len(texts))]
        Pinecone.from_texts(
            texts=[t + "-1" for t in texts],
            ids=ids_1,
            embedding=embedding_openai,
            index_name=INDEX_NAME,
            namespace=NAMESPACE_NAME,
        )
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        index_stats = self.index.describe_index_stats()
        # Only this namespace is checked; the index-wide total is not asserted.
        assert (
            index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == len(texts) * 2
        )

    @pytest.mark.xfail(reason="relevance score just over 1")
    def test_relevance_score_bound(self, embedding_openai: OpenAIEmbeddings) -> None:
        """Ensures all relevance scores are between 0 and 1."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Pinecone.from_texts(
            texts,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
        )
        # wait for the index to be ready
        time.sleep(DEFAULT_SLEEP)
        output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
        assert all(
            (1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output
        )

    # Unconditional skip: ``skip`` is the documented marker for this; ``skipif``
    # is meant to be given a condition.
    @pytest.mark.skip(reason="slow to run for benchmark")
    @pytest.mark.parametrize(
        "pool_threads,batch_size,embeddings_chunk_size,data_multiplier",
        [
            # simulate single threaded with embeddings_chunk_size = batch_size = 32
            (1, 32, 32, 1000),
            # simulate single threaded with embeddings_chunk_size = 1000
            (1, 32, 1000, 1000),
            # simulate 4 threaded with embeddings_chunk_size = 1000
            (4, 32, 1000, 1000),
            # simulate 20 threaded with embeddings_chunk_size = 5000
            (20, 64, 5000, 1000),
        ],
    )
    def test_from_texts_with_metadatas_benchmark(
        self,
        pool_threads: int,
        batch_size: int,
        embeddings_chunk_size: int,
        data_multiplier: int,
        documents: List[Document],
        embedding_openai: OpenAIEmbeddings,
    ) -> None:
        """Benchmark end to end construction and search over a larger data set."""
        # NOTE(review): no ``documents`` fixture is defined in this module; it
        # has to be provided elsewhere before the skip marker can be lifted.
        texts = [document.page_content for document in documents] * data_multiplier
        uuids = [uuid.uuid4().hex for _ in range(len(texts))]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Pinecone.from_texts(
            texts,
            embedding_openai,
            ids=uuids,
            metadatas=metadatas,
            index_name=INDEX_NAME,
            namespace=NAMESPACE_NAME,
            pool_threads=pool_threads,
            batch_size=batch_size,
            embeddings_chunk_size=embeddings_chunk_size,
        )

        query = "What did the president say about Ketanji Brown Jackson"
        _ = docsearch.similarity_search(query, k=1, namespace=NAMESPACE_NAME)

View File

@ -0,0 +1,9 @@
from langchain_pinecone import __all__
# Names the package is expected to export through ``__all__``.
EXPECTED_ALL = ["Pinecone"]


def test_all_imports() -> None:
    """Check that the package's ``__all__`` holds exactly the expected names."""
    expected = sorted(EXPECTED_ALL)
    exported = sorted(__all__)
    assert expected == exported

View File

@ -0,0 +1,12 @@
from unittest.mock import Mock
from langchain_pinecone.vectorstores import Pinecone
def test_initialization() -> None:
    """Check that the vector store can be constructed from stand-in objects."""
    # Mocks stand in for the index and the embedding model, so constructing
    # the store needs no network access.
    fake_index, fake_embedding = Mock(), Mock()
    Pinecone(fake_index, fake_embedding, "xyz")