pinecone: init pkg (#16556)
Commit 6ffd5b15bc (parent 1183769cf7)
@@ -13,7 +13,16 @@
"This notebook shows how to use functionality related to the `Pinecone` vector database.\n",
"\n",
"To use Pinecone, you must have an API key. \n",
"Here are the [installation instructions](https://docs.pinecone.io/docs/quickstart)."
"Here are the [installation instructions](https://docs.pinecone.io/docs/quickstart).\n",
"\n",
"Set the following environment variables to make using the `Pinecone` integration easier:\n",
"\n",
"- `PINECONE_API_KEY`: Your Pinecone API key.\n",
"- `PINECONE_INDEX_NAME`: The name of the index you want to use.\n",
"\n",
"And to follow along in this doc, you should also set\n",
"\n",
"- `OPENAI_API_KEY`: Your OpenAI API key, for using `OpenAIEmbeddings`"
]
},
{
@@ -25,74 +34,27 @@
},
"outputs": [],
"source": [
"%pip install --upgrade --quiet pinecone-client langchain-openai tiktoken langchain"
"%pip install --upgrade --quiet langchain-pinecone langchain-openai langchain"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1e38361-c1fe-4ac6-86e9-c90ebaf7ae87",
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"PINECONE_API_KEY\"] = getpass.getpass(\"Pinecone API Key:\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02a536e0-d603-4d79-b18b-1ed562977b40",
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"PINECONE_ENV\"] = getpass.getpass(\"Pinecone Environment:\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "320af802-9271-46ee-948f-d2453933d44b",
"id": "42f2ea67",
"metadata": {},
"source": [
"We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key."
"First, let's split our state of the union document into chunked `docs`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ffea66e4-bc23-46a9-9580-b348dfe7b7a7",
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aac9563e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain_community.document_loaders import TextLoader\n",
"from langchain_community.vectorstores import Pinecone\n",
"from langchain_openai import OpenAIEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "a3c3999a",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain_community.document_loaders import TextLoader\n",
"from langchain_openai import OpenAIEmbeddings\n",
"\n",
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
"documents = loader.load()\n",
@@ -103,43 +65,52 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e104aee",
"cell_type": "markdown",
"id": "3a4d377f",
"metadata": {},
"outputs": [],
"source": [
"import pinecone\n",
"Now let's assume you have your Pinecone index set up with `dimension=1536`.\n",
"\n",
"# initialize pinecone\n",
"pinecone.init(\n",
"    api_key=os.getenv(\"PINECONE_API_KEY\"),  # find at app.pinecone.io\n",
"    environment=os.getenv(\"PINECONE_ENV\"),  # next to api key in console\n",
")\n",
"\n",
"index_name = \"langchain-demo\"\n",
"\n",
"# First, check if our index already exists. If it doesn't, we create it\n",
"if index_name not in pinecone.list_indexes():\n",
"    # we create a new index\n",
"    pinecone.create_index(name=index_name, metric=\"cosine\", dimension=1536)\n",
"# The OpenAI embedding model `text-embedding-ada-002` uses 1536 dimensions\n",
"docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)\n",
"\n",
"# if you already have an index, you can load it like this\n",
"# docsearch = Pinecone.from_existing_index(index_name, embeddings)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
"We can connect to our Pinecone index and insert those chunked docs as contents with `Pinecone.from_documents`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c608226",
"execution_count": 6,
"id": "6e104aee",
"metadata": {},
"outputs": [],
"source": [
"from langchain_pinecone import Pinecone\n",
"\n",
"index_name = \"langchain-test-index\"\n",
"\n",
"docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ffbcb3fb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n"
]
}
],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)\n",
"print(docs[0].page_content)"
]
},
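The notebook now assumes the index already exists. If it does not, it can be created with the v3 `pinecone` client. A minimal sketch, mirroring the integration tests added in this PR and assuming a pod-based environment with `PINECONE_ENVIRONMENT` set:

```python
import os

from pinecone import Pinecone as PineconeClient, PodSpec

pc = PineconeClient(api_key=os.environ["PINECONE_API_KEY"])

index_name = "langchain-test-index"  # should match PINECONE_INDEX_NAME
if index_name not in [i["name"] for i in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1536,  # matches OpenAI's text-embedding-ada-002
        metric="cosine",
        spec=PodSpec(environment=os.environ["PINECONE_ENVIRONMENT"]),
    )
```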
@@ -156,15 +127,25 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "38a7a60e",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"['24631802-4bad-44a7-a4ba-fd71f00cc160']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"index = pinecone.Index(\"langchain-demo\")\n",
"vectorstore = Pinecone(index, embeddings.embed_query, \"text\")\n",
"vectorstore = Pinecone(index_name=index_name, embedding=embeddings)\n",
"\n",
"vectorstore.add_texts(\"More text!\")"
"vectorstore.add_texts([\"More text!\"])"
]
},
{
@@ -180,10 +161,91 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "a359ed74",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"## Document 0\n",
"\n",
"Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n",
"\n",
"## Document 1\n",
"\n",
"And I’m taking robust action to make sure the pain of our sanctions is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers. \n",
"\n",
"Tonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world. \n",
"\n",
"America will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our allies. \n",
"\n",
"These steps will help blunt gas prices here at home. And I know the news about what’s happening can seem alarming. \n",
"\n",
"But I want you to know that we are going to be okay. \n",
"\n",
"When the history of this era is written Putin’s war on Ukraine will have left Russia weaker and the rest of the world stronger. \n",
"\n",
"While it shouldn’t have taken something so terrible for people around the world to see what’s at stake now everyone sees it clearly.\n",
"\n",
"## Document 2\n",
"\n",
"We can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n",
"\n",
"I recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n",
"\n",
"They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n",
"\n",
"Officer Mora was 27 years old. \n",
"\n",
"Officer Rivera was 22. \n",
"\n",
"Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n",
"\n",
"I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n",
"\n",
"I’ve worked on these issues a long time. \n",
"\n",
"I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety.\n",
"\n",
"## Document 3\n",
"\n",
"One was stationed at bases and breathing in toxic smoke from “burn pits” that incinerated wastes of war—medical and hazard material, jet fuel, and more. \n",
"\n",
"When they came home, many of the world’s fittest and best trained warriors were never the same. \n",
"\n",
"Headaches. Numbness. Dizziness. \n",
"\n",
"A cancer that would put them in a flag-draped coffin. \n",
"\n",
"I know. \n",
"\n",
"One of those soldiers was my son Major Beau Biden. \n",
"\n",
"We don’t know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. \n",
"\n",
"But I’m committed to finding out everything we can. \n",
"\n",
"Committed to military families like Danielle Robinson from Ohio. \n",
"\n",
"The widow of Sergeant First Class Heath Robinson. \n",
"\n",
"He was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. \n",
"\n",
"Stationed near Baghdad, just yards from burn pits the size of football fields. \n",
"\n",
"Heath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter.\n"
]
}
],
"source": [
"retriever = docsearch.as_retriever(search_type=\"mmr\")\n",
"matched_docs = retriever.get_relevant_documents(query)\n",
@@ -203,15 +265,56 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "9ca82740",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1. Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
"\n",
"Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. \n",
"\n",
"2. We can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n",
"\n",
"I recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n",
"\n",
"They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n",
"\n",
"Officer Mora was 27 years old. \n",
"\n",
"Officer Rivera was 22. \n",
"\n",
"Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n",
"\n",
"I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n",
"\n",
"I’ve worked on these issues a long time. \n",
"\n",
"I know what works: Investing in crime prevention and community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety. \n",
"\n",
"\n"
]
}
],
"source": [
"found_docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10)\n",
"for i, doc in enumerate(found_docs):\n",
"    print(f\"{i + 1}.\", doc.page_content, \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0fd750b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -230,7 +333,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.11.4"
}
},
"nbformat": 4,
@@ -7,6 +7,7 @@ import warnings
from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Optional, Tuple, Union

import numpy as np
from langchain_core._api.deprecation import deprecated
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils.iter import batch_iterate
@@ -41,24 +42,16 @@ def _is_pinecone_v3() -> bool:
    return version.parse(pinecone_client_version) >= version.parse("3.0.0.dev")


@deprecated(
    since="0.0.18", removal="0.2.0", alternative_import="langchain_pinecone.Pinecone"
)
class Pinecone(VectorStore):
    """`Pinecone` vector store.

    To use, you should have the ``pinecone-client`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import Pinecone
            from langchain_community.embeddings.openai import OpenAIEmbeddings
            import pinecone

            # The environment should be the one specified next to the API key
            # in your Pinecone console
            pinecone.init(api_key="***", environment="...")
            index = pinecone.Index("langchain-demo")
            embeddings = OpenAIEmbeddings()
            vectorstore = Pinecone(index, embeddings, "text")
    This version of Pinecone is deprecated. Please use `langchain_pinecone.Pinecone`
    instead.
    """

    def __init__(
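The deprecation decorator points existing users at the partner package, and migration is essentially an import swap. A minimal sketch, assuming an existing index and `PINECONE_API_KEY`/`PINECONE_INDEX_NAME` exported in the environment:

```python
# Before (deprecated in langchain-community 0.0.18, removal planned for 0.2.0):
# from langchain_community.vectorstores import Pinecone

# After: the partner class reads PINECONE_API_KEY / PINECONE_INDEX_NAME itself.
from langchain_openai import OpenAIEmbeddings

from langchain_pinecone import Pinecone

vectorstore = Pinecone(embedding=OpenAIEmbeddings())
docs = vectorstore.similarity_search("example query", k=4)
```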
libs/partners/pinecone/.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
__pycache__
libs/partners/pinecone/LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 LangChain, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
libs/partners/pinecone/Makefile (new file, 61 lines)
@@ -0,0 +1,61 @@
.PHONY: all format lint test tests integration_tests docker_tests help extended_tests

# Default target executed when no arguments are given to make.
all: help

# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/

integration_tests: TEST_FILE = tests/integration_tests/

test integration_tests:
	poetry run pytest $(TEST_FILE)

tests:
	poetry run pytest $(TEST_FILE)


######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/pinecone --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_pinecone
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test

lint lint_diff lint_package lint_tests:
	poetry run ruff .
	poetry run ruff format $(PYTHON_FILES) --diff
	poetry run ruff --select I $(PYTHON_FILES)
	mkdir $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)

format format_diff:
	poetry run ruff format $(PYTHON_FILES)
	poetry run ruff --select I --fix $(PYTHON_FILES)

spell_check:
	poetry run codespell --toml pyproject.toml

spell_fix:
	poetry run codespell --toml pyproject.toml -w

check_imports: $(shell find langchain_pinecone -name '*.py')
	poetry run python ./scripts/check_imports.py $^

######################
# HELP
######################

help:
	@echo '----'
	@echo 'check_imports                - check imports'
	@echo 'format                       - run code formatters'
	@echo 'lint                         - run linters'
	@echo 'test                         - run unit tests'
	@echo 'tests                        - run unit tests'
	@echo 'test TEST_FILE=<test_file>   - run all tests in file'
libs/partners/pinecone/README.md (new file, 27 lines)
@@ -0,0 +1,27 @@
# langchain-pinecone

This package contains the LangChain integration with Pinecone.

## Installation

```bash
pip install -U langchain-pinecone
```

And you should configure credentials by setting the following environment variables:

- `PINECONE_API_KEY`
- `PINECONE_INDEX_NAME`
- `PINECONE_ENVIRONMENT`

## Usage

The `Pinecone` class exposes the connection to the Pinecone vector store.

```python
from langchain_pinecone import Pinecone

embeddings = ...  # use a LangChain Embeddings class

vectorstore = Pinecone(embedding=embeddings)
```
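A fuller round trip, as a minimal sketch: it assumes `PINECONE_API_KEY` and `PINECONE_INDEX_NAME` are set and that the index dimension matches the embedding model (1536 for OpenAI's `text-embedding-ada-002`):

```python
from langchain_openai import OpenAIEmbeddings

from langchain_pinecone import Pinecone

# Connects using PINECONE_API_KEY / PINECONE_INDEX_NAME from the environment.
vectorstore = Pinecone(embedding=OpenAIEmbeddings())

# Upsert a few texts, then query them back.
vectorstore.add_texts(["harrison worked at kensho"])
docs = vectorstore.similarity_search("where did harrison work?", k=1)
print(docs[0].page_content)
```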
libs/partners/pinecone/langchain_pinecone/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from langchain_pinecone.vectorstores import Pinecone

__all__ = [
    "Pinecone",
]
libs/partners/pinecone/langchain_pinecone/_utilities.py (new file, 71 lines)
@@ -0,0 +1,71 @@
from enum import Enum
from typing import List, Union

import numpy as np
import simsimd  # type: ignore

Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]


class DistanceStrategy(str, Enum):
    """Enumerator of the Distance strategies for calculating distances
    between vectors."""

    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
    MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
    COSINE = "COSINE"


def maximal_marginal_relevance(
    query_embedding: np.ndarray,
    embedding_list: list,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[int]:
    """Calculate maximal marginal relevance."""
    if min(k, len(embedding_list)) <= 0:
        return []
    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)
    similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
    most_similar = int(np.argmax(similarity_to_query))
    idxs = [most_similar]
    selected = np.array([embedding_list[most_similar]])
    while len(idxs) < min(k, len(embedding_list)):
        best_score = -np.inf
        idx_to_add = -1
        similarity_to_selected = cosine_similarity(embedding_list, selected)
        for i, query_score in enumerate(similarity_to_query):
            if i in idxs:
                continue
            redundant_score = max(similarity_to_selected[i])
            equation_score = (
                lambda_mult * query_score - (1 - lambda_mult) * redundant_score
            )
            if equation_score > best_score:
                best_score = equation_score
                idx_to_add = i
        idxs.append(idx_to_add)
        selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
    return idxs


def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Row-wise cosine similarity between two equal-width matrices."""
    if len(X) == 0 or len(Y) == 0:
        return np.array([])

    X = np.array(X)
    Y = np.array(Y)
    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
            f"and Y has shape {Y.shape}."
        )

    X = np.array(X, dtype=np.float32)
    Y = np.array(Y, dtype=np.float32)
    Z = 1 - simsimd.cdist(X, Y, metric="cosine")
    if isinstance(Z, float):
        return np.array([Z])
    return Z
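The selection loop trades query relevance against redundancy through `lambda_mult`. A small self-contained sketch with toy 2-D vectors (not real embeddings; it needs the package's `simsimd` dependency installed) shows the effect:

```python
import numpy as np

from langchain_pinecone._utilities import maximal_marginal_relevance

query = np.array([1.0, 0.0], dtype=np.float32)
candidates = [
    [1.0, 0.0],   # identical to the query
    [0.99, 0.1],  # near-duplicate of candidate 0
    [0.0, 1.0],   # orthogonal: maximally diverse
]

# Low lambda_mult emphasizes diversity; high lambda_mult emphasizes relevance.
print(maximal_marginal_relevance(query, candidates, lambda_mult=0.1, k=2))  # [0, 2]
print(maximal_marginal_relevance(query, candidates, lambda_mult=0.9, k=2))  # [0, 1]
```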
libs/partners/pinecone/langchain_pinecone/py.typed (new, empty file)
libs/partners/pinecone/langchain_pinecone/vectorstores.py (new file, 487 lines)
@@ -0,0 +1,487 @@
from __future__ import annotations

import logging
import os
import uuid
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Iterable,
    List,
    Optional,
    Tuple,
    TypeVar,
)

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils.iter import batch_iterate
from langchain_core.vectorstores import VectorStore
from pinecone import Pinecone as PineconeClient  # type: ignore

from langchain_pinecone._utilities import DistanceStrategy, maximal_marginal_relevance

if TYPE_CHECKING:
    from pinecone import Index

logger = logging.getLogger(__name__)

VST = TypeVar("VST", bound=VectorStore)


class Pinecone(VectorStore):
    """`Pinecone` vector store.

    Example:
        .. code-block:: python

            from langchain_pinecone import Pinecone
            from langchain_openai import OpenAIEmbeddings

            embeddings = OpenAIEmbeddings()
            index_name = "my-index"
            namespace = "my-namespace"
            vectorstore = Pinecone(
                index_name=index_name,
                embedding=embeddings,
                namespace=namespace,
            )
    """

    def __init__(
        self,
        # setting default params to bypass having to pass in
        # the index and embedding objects - manually throw
        # exceptions if they are not passed in or set in environment
        # (keeping param for backwards compatibility)
        index: Optional[Any] = None,
        embedding: Optional[Embeddings] = None,
        text_key: Optional[str] = "text",
        namespace: Optional[str] = None,
        distance_strategy: Optional[DistanceStrategy] = DistanceStrategy.COSINE,
        *,
        pinecone_api_key: Optional[str] = None,
        index_name: Optional[str] = None,
    ):
        if embedding is None:
            raise ValueError("Embedding must be provided")
        self._embedding = embedding
        if text_key is None:
            raise ValueError("Text key must be provided")
        self._text_key = text_key

        self._namespace = namespace
        self.distance_strategy = distance_strategy

        if index:
            # supports old way of initializing externally
            self._index = index
        else:
            # all internal initialization
            _pinecone_api_key = (
                pinecone_api_key or os.environ.get("PINECONE_API_KEY") or ""
            )
            if not _pinecone_api_key:
                raise ValueError(
                    "Pinecone API key must be provided in either `pinecone_api_key` "
                    "or `PINECONE_API_KEY` environment variable"
                )

            _index_name = index_name or os.environ.get("PINECONE_INDEX_NAME") or ""
            if not _index_name:
                raise ValueError(
                    "Pinecone index name must be provided in either `index_name` "
                    "or `PINECONE_INDEX_NAME` environment variable"
                )

            # create the client and connect to the configured index
            client = PineconeClient(api_key=_pinecone_api_key)
            self._index = client.Index(_index_name)

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Access the query embedding object if available."""
        return self._embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        namespace: Optional[str] = None,
        batch_size: int = 32,
        embedding_chunk_size: int = 1000,
        *,
        async_req: bool = True,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Upsert optimization is done by chunking the embeddings and upserting them.
        This is done to avoid memory issues and optimize using HTTP based embeddings.
        For OpenAI embeddings, use pool_threads>4 when constructing the pinecone.Index,
        embedding_chunk_size>1000 and batch_size~64 for best performance.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.
            namespace: Optional pinecone namespace to add the texts to.
            batch_size: Batch size to use when adding the texts to the vectorstore.
            embedding_chunk_size: Chunk size to use when embedding the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.

        """
        if namespace is None:
            namespace = self._namespace

        texts = list(texts)
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        metadatas = metadatas or [{} for _ in texts]
        for metadata, text in zip(metadatas, texts):
            metadata[self._text_key] = text

        # For loops to avoid memory issues and optimize when using HTTP based embeddings
        # The first loop runs the embeddings, it benefits when using OpenAI embeddings
        # The second loop runs the pinecone upsert asynchronously.
        for i in range(0, len(texts), embedding_chunk_size):
            chunk_texts = texts[i : i + embedding_chunk_size]
            chunk_ids = ids[i : i + embedding_chunk_size]
            chunk_metadatas = metadatas[i : i + embedding_chunk_size]
            embeddings = self._embedding.embed_documents(chunk_texts)
            async_res = [
                self._index.upsert(
                    vectors=batch,
                    namespace=namespace,
                    async_req=async_req,
                    **kwargs,
                )
                for batch in batch_iterate(
                    batch_size, zip(chunk_ids, embeddings, chunk_metadatas)
                )
            ]
            [res.get() for res in async_res]

        return ids

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
    ) -> List[Tuple[Document, float]]:
        """Return pinecone documents most similar to query, along with scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Dictionary of argument(s) to filter on metadata
            namespace: Namespace to search in. Default will search in '' namespace.

        Returns:
            List of Documents most similar to the query and score for each
        """
        return self.similarity_search_by_vector_with_score(
            self._embedding.embed_query(query), k=k, filter=filter, namespace=namespace
        )

    def similarity_search_by_vector_with_score(
        self,
        embedding: List[float],
        *,
        k: int = 4,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
    ) -> List[Tuple[Document, float]]:
        """Return pinecone documents most similar to embedding, along with scores."""

        if namespace is None:
            namespace = self._namespace
        docs = []
        results = self._index.query(
            vector=embedding,
            top_k=k,
            include_metadata=True,
            namespace=namespace,
            filter=filter,
        )
        for res in results["matches"]:
            metadata = res["metadata"]
            if self._text_key in metadata:
                text = metadata.pop(self._text_key)
                score = res["score"]
                docs.append((Document(page_content=text, metadata=metadata), score))
            else:
                logger.warning(
                    f"Found document with no `{self._text_key}` key. Skipping."
                )
        return docs

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return pinecone documents most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Dictionary of argument(s) to filter on metadata
            namespace: Namespace to search in. Default will search in '' namespace.

        Returns:
            List of Documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(
            query, k=k, filter=filter, namespace=namespace, **kwargs
        )
        return [doc for doc, _ in docs_and_scores]

    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        """
        The 'correct' relevance function
        may differ depending on a few things, including:
        - the distance / similarity metric used by the VectorStore
        - the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
        - embedding dimensionality
        - etc.
        """

        if self.distance_strategy == DistanceStrategy.COSINE:
            return self._cosine_relevance_score_fn
        elif self.distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
            return self._max_inner_product_relevance_score_fn
        elif self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
            return self._euclidean_relevance_score_fn
        else:
            raise ValueError(
                "Unknown distance strategy, must be cosine, max_inner_product "
                "(dot product), or euclidean"
            )

    @staticmethod
    def _cosine_relevance_score_fn(score: float) -> float:
        """Pinecone returns cosine similarity scores between [-1,1]"""
        return (score + 1) / 2

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        if namespace is None:
            namespace = self._namespace
        results = self._index.query(
            vector=[embedding],
            top_k=fetch_k,
            include_values=True,
            include_metadata=True,
            namespace=namespace,
            filter=filter,
        )
        mmr_selected = maximal_marginal_relevance(
            np.array([embedding], dtype=np.float32),
            [item["values"] for item in results["matches"]],
            k=k,
            lambda_mult=lambda_mult,
        )
        selected = [results["matches"][i]["metadata"] for i in mmr_selected]
        return [
            Document(page_content=metadata.pop(self._text_key), metadata=metadata)
            for metadata in selected
        ]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: Optional[dict] = None,
        namespace: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        embedding = self._embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding, k, fetch_k, lambda_mult, filter, namespace
        )

    @classmethod
    def get_pinecone_index(
        cls,
        index_name: Optional[str],
        pool_threads: int = 4,
        *,
        pinecone_api_key: Optional[str] = None,
    ) -> Index:
        """Return a Pinecone Index instance.

        Args:
            index_name: Name of the index to use.
            pool_threads: Number of threads to use for index upsert.
        Returns:
            Pinecone Index instance."""
        _pinecone_api_key = pinecone_api_key or os.environ.get("PINECONE_API_KEY") or ""
        client = PineconeClient(api_key=_pinecone_api_key, pool_threads=pool_threads)
        indexes = client.list_indexes()
        index_names = [i.name for i in indexes.index_list["indexes"]]

        if index_name in index_names:
            index = client.Index(index_name)
        elif len(index_names) == 0:
            raise ValueError(
                "No active indexes found in your Pinecone project, "
                "are you sure you're using the right Pinecone API key and Environment? "
                "Please double check your Pinecone dashboard."
            )
        else:
            raise ValueError(
                f"Index '{index_name}' not found in your Pinecone project. "
                f"Did you mean one of the following indexes: {', '.join(index_names)}"
            )
        return index

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        batch_size: int = 32,
        text_key: str = "text",
        namespace: Optional[str] = None,
        index_name: Optional[str] = None,
        upsert_kwargs: Optional[dict] = None,
        pool_threads: int = 4,
        embeddings_chunk_size: int = 1000,
        **kwargs: Any,
    ) -> Pinecone:
        """Construct Pinecone wrapper from raw documents.

        This is a user friendly interface that:
            1. Embeds documents.
            2. Adds the documents to a provided Pinecone index

        This is intended to be a quick way to get started.

        The `pool_threads` affects the speed of the upsert operations.

        Example:
            .. code-block:: python

                from langchain_pinecone import Pinecone
                from langchain_openai import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                pinecone = Pinecone.from_texts(
                    texts,
                    embeddings,
                    index_name="langchain-demo"
                )
        """
        pinecone_index = cls.get_pinecone_index(index_name, pool_threads)
        pinecone = cls(pinecone_index, embedding, text_key, namespace, **kwargs)

        pinecone.add_texts(
            texts,
            metadatas=metadatas,
            ids=ids,
            namespace=namespace,
            batch_size=batch_size,
            embedding_chunk_size=embeddings_chunk_size,
            **(upsert_kwargs or {}),
        )
        return pinecone

    @classmethod
    def from_existing_index(
        cls,
        index_name: str,
        embedding: Embeddings,
        text_key: str = "text",
        namespace: Optional[str] = None,
        pool_threads: int = 4,
    ) -> Pinecone:
        """Load pinecone vectorstore from index name."""
        pinecone_index = cls.get_pinecone_index(index_name, pool_threads)
        return cls(pinecone_index, embedding, text_key, namespace)

    def delete(
        self,
        ids: Optional[List[str]] = None,
        delete_all: Optional[bool] = None,
        namespace: Optional[str] = None,
        filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> None:
        """Delete by vector IDs or filter.

        Args:
            ids: List of ids to delete.
            filter: Dictionary of conditions to filter vectors to delete.
        """

        if namespace is None:
            namespace = self._namespace

        if delete_all:
            self._index.delete(delete_all=True, namespace=namespace, **kwargs)
        elif ids is not None:
            chunk_size = 1000
            for i in range(0, len(ids), chunk_size):
                chunk = ids[i : i + chunk_size]
                self._index.delete(ids=chunk, namespace=namespace, **kwargs)
        elif filter is not None:
            self._index.delete(filter=filter, namespace=namespace, **kwargs)
        else:
            raise ValueError("Either ids, delete_all, or filter must be provided.")

        return None
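Since `add_texts` chunks the embedding calls and issues the upserts asynchronously, throughput is governed by three knobs. A rough sketch, assuming the index above exists; the numbers follow the docstring's suggestions rather than any benchmark:

```python
from langchain_openai import OpenAIEmbeddings

from langchain_pinecone import Pinecone

# get_pinecone_index lets you raise pool_threads for faster parallel upserts.
index = Pinecone.get_pinecone_index("langchain-test-index", pool_threads=8)
vectorstore = Pinecone(index=index, embedding=OpenAIEmbeddings())

ids = vectorstore.add_texts(
    ["text one", "text two"],
    batch_size=64,               # vectors per upsert request
    embedding_chunk_size=1000,   # texts embedded per embed_documents call
)
```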
libs/partners/pinecone/poetry.lock (generated, new file, 1213 lines)
File diff suppressed because it is too large.
libs/partners/pinecone/pyproject.toml (new file, 97 lines)
@@ -0,0 +1,97 @@
[tool.poetry]
name = "langchain-pinecone"
version = "0.0.1"
description = "An integration package connecting Pinecone and LangChain"
authors = []
readme = "README.md"
repository = "https://github.com/langchain-ai/langchain"
license = "MIT"

[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/pinecone"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langchain-core = ">=0.0.12"
pinecone-client = {version = "^3", python = ">=3.8,<3.13"}
simsimd = "^3.6.3"
numpy = "^1"

[tool.poetry.group.test]
optional = true

[tool.poetry.group.test.dependencies]
pytest = "^7.3.0"
freezegun = "^1.2.2"
pytest-mock = "^3.10.0"
syrupy = "^4.0.2"
pytest-watcher = "^0.3.4"
pytest-asyncio = "^0.21.1"
langchain-core = {path = "../../core", develop = true}

[tool.poetry.group.codespell]
optional = true

[tool.poetry.group.codespell.dependencies]
codespell = "^2.2.0"

[tool.poetry.group.test_integration]
optional = true

[tool.poetry.group.test_integration.dependencies]
langchain-openai = ">=0.0.3,<0.1"

[tool.poetry.group.lint]
optional = true

[tool.poetry.group.lint.dependencies]
ruff = "^0.1.5"

[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
langchain-core = {path = "../../core", develop = true}

[tool.poetry.group.dev]
optional = true

[tool.poetry.group.dev.dependencies]
langchain-core = {path = "../../core", develop = true}

[tool.ruff]
select = [
  "E",  # pycodestyle
  "F",  # pyflakes
  "I",  # isort
]

[tool.mypy]
disallow_untyped_defs = "True"

[tool.coverage.run]
omit = [
    "tests/*",
]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
# --strict-markers will raise errors on unknown marks.
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
#
# https://docs.pytest.org/en/7.1.x/reference/reference.html
# --strict-config       any warnings encountered while parsing the `pytest`
# section of the configuration file raise errors.
#
# https://github.com/tophat/syrupy
# --snapshot-warn-unused    Prints a warning on unused snapshots rather than fail the test suite.
addopts = "--snapshot-warn-unused --strict-markers --strict-config --durations=5"
# Registering custom markers.
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
markers = [
    "requires: mark tests as requiring a specific library",
    "asyncio: mark tests as requiring asyncio",
    "compile: mark placeholder test used to compile integration tests without running them",
]
asyncio_mode = "auto"
libs/partners/pinecone/scripts/check_imports.py (new file, 17 lines)
@@ -0,0 +1,17 @@
import sys
import traceback
from importlib.machinery import SourceFileLoader

if __name__ == "__main__":
    files = sys.argv[1:]
    has_failure = False
    for file in files:
        try:
            SourceFileLoader("x", file).load_module()
        except Exception:
            has_failure = True
            print(file)
            traceback.print_exc()
            print()

    sys.exit(1 if has_failure else 0)
libs/partners/pinecone/scripts/check_pydantic.sh (executable, new file, 27 lines)
@@ -0,0 +1,27 @@
#!/bin/bash
#
# This script searches for lines starting with "import pydantic" or "from pydantic"
# in tracked files within a Git repository.
#
# Usage: ./scripts/check_pydantic.sh /path/to/repository

# Check if a path argument is provided
if [ $# -ne 1 ]; then
  echo "Usage: $0 /path/to/repository"
  exit 1
fi

repository_path="$1"

# Search for lines matching the pattern within the specified repository
result=$(git -C "$repository_path" grep -E '^import pydantic|^from pydantic')

# Check if any matching lines were found
if [ -n "$result" ]; then
  echo "ERROR: The following lines need to be updated:"
  echo "$result"
  echo "Please replace the code with an import from langchain_core.pydantic_v1."
  echo "For example, replace 'from pydantic import BaseModel'"
  echo "with 'from langchain_core.pydantic_v1 import BaseModel'"
  exit 1
fi
libs/partners/pinecone/scripts/lint_imports.sh (executable, new file, 17 lines)
@@ -0,0 +1,17 @@
#!/bin/bash

set -eu

# Initialize a variable to keep track of errors
errors=0

# make sure not importing from langchain or langchain_experimental
git --no-pager grep '^from langchain\.' . && errors=$((errors+1))
git --no-pager grep '^from langchain_experimental\.' . && errors=$((errors+1))

# Decide on an exit status based on the errors
if [ "$errors" -gt 0 ]; then
  exit 1
else
  exit 0
fi
libs/partners/pinecone/tests/__init__.py (new, empty file)

@@ -0,0 +1,7 @@
import pytest


@pytest.mark.compile
def test_placeholder() -> None:
    """Used for compiling integration tests without running any real tests."""
    pass
@@ -0,0 +1,288 @@
import os
import time
import uuid
from typing import List

import numpy as np
import pinecone  # type: ignore
import pytest
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from pinecone import PodSpec

from langchain_pinecone import Pinecone

INDEX_NAME = "langchain-test-index"  # name of the index
NAMESPACE_NAME = "langchain-test-namespace"  # name of the namespace
DIMENSION = 1536  # dimension of the embeddings

DEFAULT_SLEEP = 20


class TestPinecone:
    index: "pinecone.Index"

    @classmethod
    def setup_class(cls) -> None:
        import pinecone

        client = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])
        index_list = client.list_indexes()
        for i in index_list:
            if i["name"] == INDEX_NAME:
                client.delete_index(INDEX_NAME)
                break
        client.create_index(
            name=INDEX_NAME,
            dimension=DIMENSION,
            metric="cosine",
            spec=PodSpec(environment=os.environ["PINECONE_ENVIRONMENT"]),
        )

        cls.index = client.Index(INDEX_NAME)

        # ensure the index is empty
        index_stats = cls.index.describe_index_stats()
        assert index_stats["dimension"] == DIMENSION
        if index_stats["namespaces"].get(NAMESPACE_NAME) is not None:
            assert index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == 0

    @classmethod
    def teardown_class(cls) -> None:
        index_stats = cls.index.describe_index_stats()
        for _namespace_name in index_stats["namespaces"].keys():
            cls.index.delete(delete_all=True, namespace=_namespace_name)

    @pytest.fixture(autouse=True)
    def setup(self) -> None:
        # delete all the vectors in the index
        print("called")
        self.index.delete(delete_all=True, namespace=NAMESPACE_NAME)
        # index_stats = self.index.describe_index_stats()
        # for _namespace_name in index_stats["namespaces"].keys():
        #     self.index.delete(delete_all=True, namespace=_namespace_name)
        time.sleep(DEFAULT_SLEEP)  # prevent race condition with previous step
        # index_stats = self.index.describe_index_stats

    @pytest.fixture
    def embedding_openai(self) -> OpenAIEmbeddings:
        return OpenAIEmbeddings()

    @pytest.fixture
    def texts(self) -> List[str]:
        return ["foo", "bar", "baz"]

    def test_from_texts(
        self, texts: List[str], embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and search."""
        unique_id = uuid.uuid4().hex
        needs = f"foobuu {unique_id} booo"
        texts.insert(0, needs)

        docsearch = Pinecone.from_texts(
            texts=texts,
            embedding=embedding_openai,
            index_name=INDEX_NAME,
            namespace=NAMESPACE_NAME,
        )
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        output = docsearch.similarity_search(unique_id, k=1, namespace=NAMESPACE_NAME)
        assert output == [Document(page_content=needs)]

    def test_from_texts_with_metadatas(
        self, texts: List[str], embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test end to end construction and search."""

        unique_id = uuid.uuid4().hex
        needs = f"foobuu {unique_id} booo"
        texts = [needs] + texts

        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Pinecone.from_texts(
            texts,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
            namespace=NAMESPACE_NAME,
        )
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        output = docsearch.similarity_search(needs, k=1, namespace=NAMESPACE_NAME)

        # TODO: why metadata={"page": 0.0}) instead of {"page": 0}?
        assert output == [Document(page_content=needs, metadata={"page": 0.0})]

    def test_from_texts_with_scores(self, embedding_openai: OpenAIEmbeddings) -> None:
        """Test end to end construction and search with scores and IDs."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        print("metadatas", metadatas)
        docsearch = Pinecone.from_texts(
            texts,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
            namespace=NAMESPACE_NAME,
        )
        print(texts)
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        output = docsearch.similarity_search_with_score(
            "foo", k=3, namespace=NAMESPACE_NAME
        )
        docs = [o[0] for o in output]
        scores = [o[1] for o in output]
        sorted_documents = sorted(docs, key=lambda x: x.metadata["page"])
        print(sorted_documents)

        # TODO: why metadata={"page": 0.0}) instead of {"page": 0}, etc???
        assert sorted_documents == [
            Document(page_content="foo", metadata={"page": 0.0}),
            Document(page_content="bar", metadata={"page": 1.0}),
            Document(page_content="baz", metadata={"page": 2.0}),
        ]
        assert scores[0] > scores[1] > scores[2]

    def test_from_existing_index_with_namespaces(
        self, embedding_openai: OpenAIEmbeddings
    ) -> None:
        """Test that namespaces are properly handled."""
        # Create two indexes with the same name but different namespaces
        texts_1 = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts_1))]
        Pinecone.from_texts(
            texts_1,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
            namespace=f"{INDEX_NAME}-1",
        )

        texts_2 = ["foo2", "bar2", "baz2"]
        metadatas = [{"page": i} for i in range(len(texts_2))]

        Pinecone.from_texts(
            texts_2,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
            namespace=f"{INDEX_NAME}-2",
        )

        time.sleep(DEFAULT_SLEEP)  # prevent race condition

        # Search with namespace
        docsearch = Pinecone.from_existing_index(
            index_name=INDEX_NAME,
            embedding=embedding_openai,
            namespace=f"{INDEX_NAME}-1",
        )
        output = docsearch.similarity_search("foo", k=20, namespace=f"{INDEX_NAME}-1")
        # check that we don't get results from the other namespace
        page_contents = sorted(set([o.page_content for o in output]))
        assert all(content in ["foo", "bar", "baz"] for content in page_contents)
        assert all(content not in ["foo2", "bar2", "baz2"] for content in page_contents)

    def test_add_documents_with_ids(
        self, texts: List[str], embedding_openai: OpenAIEmbeddings
    ) -> None:
        ids = [uuid.uuid4().hex for _ in range(len(texts))]
        Pinecone.from_texts(
            texts=texts,
            ids=ids,
            embedding=embedding_openai,
            index_name=INDEX_NAME,
            namespace=NAMESPACE_NAME,
        )
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        index_stats = self.index.describe_index_stats()
        assert index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == len(texts)

        ids_1 = [uuid.uuid4().hex for _ in range(len(texts))]
        Pinecone.from_texts(
            texts=[t + "-1" for t in texts],
            ids=ids_1,
            embedding=embedding_openai,
            index_name=INDEX_NAME,
            namespace=NAMESPACE_NAME,
        )
        time.sleep(DEFAULT_SLEEP)  # prevent race condition
        index_stats = self.index.describe_index_stats()
        assert (
            index_stats["namespaces"][NAMESPACE_NAME]["vector_count"] == len(texts) * 2
        )
        # only focused on this namespace now
        # assert index_stats["total_vector_count"] == len(texts) * 2

    @pytest.mark.xfail(reason="relevance score just over 1")
    def test_relevance_score_bound(self, embedding_openai: OpenAIEmbeddings) -> None:
        """Ensures all relevance scores are between 0 and 1."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Pinecone.from_texts(
            texts,
            embedding_openai,
            index_name=INDEX_NAME,
            metadatas=metadatas,
        )
        # wait for the index to be ready
        time.sleep(DEFAULT_SLEEP)
        output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
        print(output)
        assert all(
            (1 >= score or np.isclose(score, 1)) and score >= 0 for _, score in output
        )

    @pytest.mark.skipif(reason="slow to run for benchmark")
    @pytest.mark.parametrize(
        "pool_threads,batch_size,embeddings_chunk_size,data_multiplier",
        [
            (
                1,
                32,
                32,
                1000,
            ),  # simulate single threaded with embeddings_chunk_size = batch_size = 32
            (
                1,
                32,
                1000,
                1000,
            ),  # simulate single threaded with embeddings_chunk_size = 1000
            (
                4,
                32,
                1000,
                1000,
            ),  # simulate 4 threaded with embeddings_chunk_size = 1000
            (20, 64, 5000, 1000),
        ],  # simulate 20 threaded with embeddings_chunk_size = 5000
    )
    def test_from_texts_with_metadatas_benchmark(
        self,
        pool_threads: int,
        batch_size: int,
        embeddings_chunk_size: int,
        data_multiplier: int,
        documents: List[Document],
        embedding_openai: OpenAIEmbeddings,
    ) -> None:
        """Test end to end construction and search."""

        texts = [document.page_content for document in documents] * data_multiplier
        uuids = [uuid.uuid4().hex for _ in range(len(texts))]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Pinecone.from_texts(
            texts,
            embedding_openai,
            ids=uuids,
            metadatas=metadatas,
            index_name=INDEX_NAME,
            namespace=NAMESPACE_NAME,
            pool_threads=pool_threads,
            batch_size=batch_size,
            embeddings_chunk_size=embeddings_chunk_size,
        )

        query = "What did the president say about Ketanji Brown Jackson"
        _ = docsearch.similarity_search(query, k=1, namespace=NAMESPACE_NAME)
libs/partners/pinecone/tests/unit_tests/__init__.py (new, empty file)
libs/partners/pinecone/tests/unit_tests/test_imports.py (new file, 9 lines)
@@ -0,0 +1,9 @@
from langchain_pinecone import __all__

EXPECTED_ALL = [
    "Pinecone",
]


def test_all_imports() -> None:
    assert sorted(EXPECTED_ALL) == sorted(__all__)
libs/partners/pinecone/tests/unit_tests/test_vectorstores.py (new file, 12 lines)
@@ -0,0 +1,12 @@
from unittest.mock import Mock

from langchain_pinecone.vectorstores import Pinecone


def test_initialization() -> None:
    """Test integration vectorstore initialization."""
    # mock index
    index = Mock()
    embedding = Mock()
    text_key = "xyz"
    Pinecone(index, embedding, text_key)