diff --git a/docs/extras/integrations/providers/vectara/index.mdx b/docs/extras/integrations/providers/vectara/index.mdx index 0e30735f0b..abd8283735 100644 --- a/docs/extras/integrations/providers/vectara/index.mdx +++ b/docs/extras/integrations/providers/vectara/index.mdx @@ -11,9 +11,10 @@ What is Vectara? - You can use Vectara's integration with LangChain as a Vector store or using the Retriever abstraction. ## Installation and Setup -To use Vectara with LangChain no special installation steps are required. You just have to provide your customer_id, corpus ID, and an API key created within the Vectara console to enable indexing and searching. +To use Vectara with LangChain no special installation steps are required. +To get started, follow our [quickstart](https://docs.vectara.com/docs/quickstart) guide to create an account, a corpus and an API key. +Once you have these, you can provide them as arguments to the Vectara vectorstore, or you can set them as environment variables. -Alternatively these can be provided as environment variables - export `VECTARA_CUSTOMER_ID`="your_customer_id" - export `VECTARA_CORPUS_ID`="your_corpus_id" - export `VECTARA_API_KEY`="your-vectara-api-key" diff --git a/docs/extras/integrations/vectorstores/vectara.ipynb b/docs/extras/integrations/vectorstores/vectara.ipynb index 0741c1b199..e95504860b 100644 --- a/docs/extras/integrations/vectorstores/vectara.ipynb +++ b/docs/extras/integrations/vectorstores/vectara.ipynb @@ -26,7 +26,7 @@ "source": [ "# Setup\n", "\n", - "You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps:\n", + "You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps (see our [quickstart](https://docs.vectara.com/docs/quickstart) guide):\n", "1. [Sign up](https://console.vectara.com/signup) for a Vectara account if you don't already have one. Once you have completed your sign up you will have a Vectara customer ID. You can find your customer ID by clicking on your name, on the top-right of the Vectara console window.\n", "2. Within your account you can create one or more corpora. Each corpus represents an area that stores text data upon ingest from input documents. To create a corpus, use the **\"Create Corpus\"** button. You then provide a name to your corpus as well as a description. Optionally you can define filtering attributes and apply some advanced options. If you click on your created corpus, you can see its name and corpus ID right on the top.\n", "3. Next you'll need to create API keys to access the corpus. Click on the **\"Authorization\"** tab in the corpus view and then the **\"Create API Key\"** button. Give your key a name, and choose whether you want query only or query+index for your key. Click \"Create\" and you now have an active API key. Keep this key confidential. \n", @@ -47,7 +47,7 @@ "os.environ[\"VECTARA_API_KEY\"] = getpass.getpass(\"Vectara API Key:\")\n", "```\n", "\n", - "2. Add them to the Vectara vectorstore constructor:\n", + "1. Provide them as arguments when creating the Vectara vectorstore object:\n", "\n", "```python\n", "vectorstore = Vectara(\n", @@ -65,13 +65,22 @@ "source": [ "## Connecting to Vectara from LangChain\n", "\n", - "To get started, let's ingest the documents using the from_documents() method.\n", - "We assume here that you've added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and query+indexing VECTARA_API_KEY as environment variables." 
+ "In this example, we assume that you've created an account and a corpus, and added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY (created with permissions for both indexing and query) as environment variables.\n", + "\n", + "The corpus has 3 fields defined as metadata for filtering:\n", + "* url: a string field containing the source URL of the document (where relevant)\n", + "* speech: a string field containing the name of the speech\n", + "* author: the name of the author\n", + "\n", + "Let's start by ingesting 3 documents into the corpus:\n", + "1. The State of the Union speech from 2022, available in the LangChain repository as a text file\n", + "2. The \"I have a dream\" speech by Dr. King\n", + "3. The \"We Shall Fight on the Beaches\" speech by Winston Churchill" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "04a1f1a0", "metadata": {}, "outputs": [], @@ -79,12 +88,17 @@ "from langchain.embeddings import FakeEmbeddings\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.vectorstores import Vectara\n", - "from langchain.document_loaders import TextLoader" + "from langchain.document_loaders import TextLoader\n", + "\n", + "from langchain.llms import OpenAI\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "be0a4973", "metadata": {}, "outputs": [], @@ -97,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "8429667e", "metadata": { "ExecuteTime": { @@ -111,7 +125,7 @@ "vectara = Vectara.from_documents(\n", " docs,\n", " embedding=FakeEmbeddings(size=768),\n", - " doc_metadata={\"speech\": \"state-of-the-union\"},\n", + " doc_metadata={\"speech\": \"state-of-the-union\", \"author\": \"Biden\"},\n", ")" ] }, @@ -130,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "85ef3468", "metadata": {}, "outputs": [], @@ -142,14 +156,16 @@ " [\n", " \"https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf\",\n", " \"I-have-a-dream\",\n", + " \"Dr. 
King\"\n", " ],\n", " [\n", " \"https://www.parkwayschools.net/cms/lib/MO01931486/Centricity/Domain/1578/Churchill_Beaches_Speech.pdf\",\n", " \"we shall fight on the beaches\",\n", + " \"Churchill\"\n", " ],\n", "]\n", "files_list = []\n", - "for url, _ in urls:\n", + "for url, _, _ in urls:\n", " name = tempfile.NamedTemporaryFile().name\n", " urllib.request.urlretrieve(url, name)\n", " files_list.append(name)\n", @@ -157,7 +173,7 @@ "docsearch: Vectara = Vectara.from_files(\n", " files=files_list,\n", " embedding=FakeEmbeddings(size=768),\n", - " metadatas=[{\"url\": url, \"speech\": title} for url, title in urls],\n", + " metadatas=[{\"url\": url, \"speech\": title, \"author\": author} for url, title, author in urls],\n", ")" ] }, @@ -178,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "a8c513ab", "metadata": { "ExecuteTime": { @@ -197,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "fc516993", "metadata": { "ExecuteTime": { @@ -231,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "8804a21d", "metadata": { "ExecuteTime": { @@ -249,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "756a6887", "metadata": { "ExecuteTime": { @@ -264,7 +280,7 @@ "text": [ "Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. A former top litigator in private practice.\n", "\n", - "Score: 0.786569\n" + "Score: 0.8299499\n" ] } ], @@ -284,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "47784de5", "metadata": {}, "outputs": [ @@ -307,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "3e22949f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "With this threshold of 0.2 we have 3 documents\n" + "With this threshold of 0.2 we have 5 documents\n" ] } ], @@ -340,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "9427195f", "metadata": { "ExecuteTime": { @@ -352,10 +368,10 @@ { "data": { "text/plain": [ - "VectaraRetriever(tags=['Vectara'], metadata=None, vectorstore=, search_type='similarity', search_kwargs={'lambda_val': 0.025, 'k': 5, 'filter': '', 'n_sentence_context': '2'})" + "VectaraRetriever(tags=['Vectara'], metadata=None, vectorstore=, search_type='similarity', search_kwargs={'lambda_val': 0.025, 'k': 5, 'filter': '', 'n_sentence_context': '2'})" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], @@ -367,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "f3c70c31", "metadata": { "ExecuteTime": { @@ -379,10 +395,10 @@ { "data": { "text/plain": [ - "Document(page_content='Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 
A former top litigator in private practice.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '596', 'len': '97', 'speech': 'state-of-the-union'})" + "Document(page_content='Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. A former top litigator in private practice.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '596', 'len': '97', 'speech': 'state-of-the-union', 'author': 'Biden'})" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"What did the president say about Ketanji Brown Jackson\"\n", "retriever.get_relevant_documents(query)[0]" ] }, + { + "cell_type": "markdown", + "id": "e944c26a", + "metadata": {}, + "source": [ + "## Using Vectara as a SelfQuery Retriever" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8be674de", + "metadata": {}, + "outputs": [], + "source": [ + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"speech\",\n", + " description=\"the name of the speech\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"author\",\n", + " description=\"author of the speech\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + "]\n", + "document_content_description = \"the text of the speech\"\n", + "\n", + "vectordb = Vectara()\n", + "llm = OpenAI(temperature=0)\n", + "retriever = SelfQueryRetriever.from_llm(llm, vectara, \n", + " document_content_description, metadata_field_info, \n", + " verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "f8938999", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ofer/dev/langchain/libs/langchain/langchain/chains/llm.py:278: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='freedom' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='author', value='Biden') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Well I know this nation. We will meet the test. To protect freedom and liberty, to expand fairness and opportunity. We will save democracy. As hard as these times have been, I am more optimistic about America today than I have been my whole life.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '346', 'len': '67', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='To our fellow Ukrainian Americans who forge a deep bond that connects our two nations we stand with you. Putin may circle Kyiv with tanks, but he will never gain the hearts and souls of the Ukrainian people. He will never extinguish their love of freedom. He will never weaken the resolve of the free world. We meet tonight in an America that has lived through two of the hardest years this nation has ever faced.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '740', 'len': '47', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='But most importantly as Americans. With a duty to one another to the American people to the Constitution. 
And with an unwavering resolve that freedom will always triumph over tyranny. Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '413', 'len': '77', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='We can do this. \\n\\nMy fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. In this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. We have fought for freedom, expanded liberty, defeated totalitarianism and terror. And built the strongest, freest, and most prosperous nation the world has ever known. Now is the hour. \\n\\nOur moment of responsibility.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '906', 'len': '82', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. We cannot let this happen. Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '0', 'len': '63', 'speech': 'state-of-the-union', 'author': 'Biden'})]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever.get_relevant_documents(\"what did Biden say about the freedom?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a97037fb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='freedom' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='author', value='Dr. King') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='And if America is to be a great nation, this must become true. So\\nlet freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty\\nmountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let\\nfreedom ring from the snowcapped Rockies of Colorado.', metadata={'lang': 'eng', 'section': '3', 'offset': '1534', 'len': '55', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='And if America is to be a great nation, this must become true. So\\nlet freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty\\nmountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. 
Let\\nfreedom ring from the snowcapped Rockies of Colorado.', metadata={'lang': 'eng', 'section': '3', 'offset': '1534', 'len': '55', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='Let freedom ring from the curvaceous slopes of\\nCalifornia. But not only that. Let freedom ring from Stone Mountain of Georgia. Let freedom ring from Lookout\\nMountain of Tennessee. Let freedom ring from every hill and molehill of Mississippi, from every\\nmountain side. Let freedom ring . . .\\nWhen we allow freedom to ring—when we let it ring from every city and every hamlet, from every state\\nand every city, we will be able to speed up that day when all of God’s children, black men and white\\nmen, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the\\nold Negro spiritual, “Free at last, Free at last, Great God a-mighty, We are free at last.”', metadata={'lang': 'eng', 'section': '3', 'offset': '1842', 'len': '52', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='Let freedom ring from the curvaceous slopes of\\nCalifornia. But not only that. Let freedom ring from Stone Mountain of Georgia. Let freedom ring from Lookout\\nMountain of Tennessee. Let freedom ring from every hill and molehill of Mississippi, from every\\nmountain side. Let freedom ring . . .\\nWhen we allow freedom to ring—when we let it ring from every city and every hamlet, from every state\\nand every city, we will be able to speed up that day when all of God’s children, black men and white\\nmen, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the\\nold Negro spiritual, “Free at last, Free at last, Great God a-mighty, We are free at last.”', metadata={'lang': 'eng', 'section': '3', 'offset': '1842', 'len': '52', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='Let freedom ring from the mighty\\nmountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let\\nfreedom ring from the snowcapped Rockies of Colorado. Let freedom ring from the curvaceous slopes of\\nCalifornia. But not only that. 
Let freedom ring from Stone Mountain of Georgia.', metadata={'lang': 'eng', 'section': '3', 'offset': '1657', 'len': '57', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'})]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever.get_relevant_documents(\"what did Dr. King say about the freedom?\")" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "2300e785", + "id": "f6d17e90", "metadata": {}, "outputs": [], "source": [] diff --git a/docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb b/docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb new file mode 100644 index 0000000000..1e9128dc6f --- /dev/null +++ b/docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb @@ -0,0 +1,440 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "13afcae7", + "metadata": {}, + "source": [ + "# Vectara self-querying \n", + "\n", + ">[Vectara](https://docs.vectara.com/docs/) is a GenAI platform for developers. It provides a simple API to build Grounded Generation (aka Retrieval-augmented-generation) applications.\n", + "\n", + "In the notebook we'll demo the `SelfQueryRetriever` wrapped around a Vectara vector store. " + ] + }, + { + "cell_type": "markdown", + "id": "68e75fb9", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps (see our [quickstart](https://docs.vectara.com/docs/quickstart) guide):\n", + "1. [Sign up](https://console.vectara.com/signup) for a Vectara account if you don't already have one. Once you have completed your sign up you will have a Vectara customer ID. You can find your customer ID by clicking on your name, on the top-right of the Vectara console window.\n", + "2. Within your account you can create one or more corpora. Each corpus represents an area that stores text data upon ingest from input documents. To create a corpus, use the **\"Create Corpus\"** button. You then provide a name to your corpus as well as a description. Optionally you can define filtering attributes and apply some advanced options. If you click on your created corpus, you can see its name and corpus ID right on the top.\n", + "3. Next you'll need to create API keys to access the corpus. Click on the **\"Authorization\"** tab in the corpus view and then the **\"Create API Key\"** button. Give your key a name, and choose whether you want query only or query+index for your key. Click \"Create\" and you now have an active API key. Keep this key confidential. \n", + "\n", + "To use LangChain with Vectara, you'll need to have these three values: customer ID, corpus ID and api_key.\n", + "You can provide those to LangChain in two ways:\n", + "\n", + "1. 
Include in your environment these three variables: `VECTARA_CUSTOMER_ID`, `VECTARA_CORPUS_ID` and `VECTARA_API_KEY`.\n", + "\n", + "> For example, you can set these variables using os.environ and getpass as follows:\n", + "\n", + "```python\n", + "import os\n", + "import getpass\n", + "\n", + "os.environ[\"VECTARA_CUSTOMER_ID\"] = getpass.getpass(\"Vectara Customer ID:\")\n", + "os.environ[\"VECTARA_CORPUS_ID\"] = getpass.getpass(\"Vectara Corpus ID:\")\n", + "os.environ[\"VECTARA_API_KEY\"] = getpass.getpass(\"Vectara API Key:\")\n", + "```\n", + "\n", + "1. Provide them as arguments when creating the Vectara vectorstore object:\n", + "\n", + "```python\n", + "vectorstore = Vectara(\n", + " vectara_customer_id=vectara_customer_id,\n", + " vectara_corpus_id=vectara_corpus_id,\n", + " vectara_api_key=vectara_api_key\n", + " )\n", + "```\n", + "\n", + "**Note:** The self-query retriever requires you to have `lark` installed (`pip install lark`). " + ] + }, + { + "cell_type": "markdown", + "id": "742ac16d", + "metadata": {}, + "source": [ + "## Connecting to Vectara from LangChain\n", + "\n", + "In this example, we assume that you've created an account and a corpus, and added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY (created with permissions for both indexing and query) as environment variables.\n", + "\n", + "The corpus has 4 fields defined as metadata for filtering: year, director, rating, and genre\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cb4a5787", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.embeddings import FakeEmbeddings\n", + "from langchain.schema import Document\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import Vectara\n", + "from langchain.document_loaders import TextLoader\n", + "\n", + "from langchain.llms import OpenAI\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bcbe04d9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = [\n", + " Document(\n", + " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", + " metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n", + " metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2},\n", + " ),\n", + " Document(\n", + " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", + " metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6},\n", + " ),\n", + " Document(\n", + " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", + " metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3},\n", + " ),\n", + " Document(\n", + " page_content=\"Toys come alive and have a blast doing so\",\n", + " metadata={\"year\": 1995, \"genre\": \"animated\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", + " metadata={\n", + " \"year\": 1979,\n", + " \"rating\": 9.9,\n", + " \"director\": \"Andrei 
Tarkovsky\",\n", + " \"genre\": \"science fiction\",\n", + " },\n", + " ),\n", + "]\n", + "\n", + "vectara = Vectara()\n", + "for doc in docs:\n", + " vectara.add_texts([doc.page_content], embedding=FakeEmbeddings(size=768), doc_metadata=doc.metadata)" + ] + }, + { + "cell_type": "markdown", + "id": "5ecaab6d", + "metadata": {}, + "source": [ + "## Creating our self-querying retriever\n", + "Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "86e34dbf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.llms import OpenAI\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n", + "\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"genre\",\n", + " description=\"The genre of the movie\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"year\",\n", + " description=\"The year the movie was released\",\n", + " type=\"integer\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"director\",\n", + " description=\"The name of the movie director\",\n", + " type=\"string\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n", + " ),\n", + "]\n", + "document_content_description = \"Brief summary of a movie\"\n", + "llm = OpenAI(temperature=0)\n", + "retriever = SelfQueryRetriever.from_llm(\n", + " llm, vectara, document_content_description, metadata_field_info, verbose=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ea9df8d4", + "metadata": {}, + "source": [ + "## Testing it out\n", + "And now we can try actually using our retriever!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38a126e9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ofer/dev/langchain/libs/langchain/langchain/chains/llm.py:278: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'lang': 'eng', 'offset': '0', 'len': '66', 'year': '1993', 'rating': '7.7', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...', metadata={'lang': 'eng', 'offset': '0', 'len': '76', 'year': '2010', 'director': 'Christopher Nolan', 'rating': '8.2', 'source': 'langchain'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'lang': 'eng', 'offset': '0', 'len': '116', 'year': '2006', 'director': 'Satoshi Kon', 'rating': '8.6', 'source': 'langchain'})]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"What are some movies about dinosaurs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fc3f1e6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'lang': 'eng', 'offset': '0', 'len': '116', 'year': '2006', 'director': 'Satoshi Kon', 'rating': '8.6', 'source': 'langchain'})]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a filter\n", + "retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b19d4da0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='women' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='director', value='Greta Gerwig') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after 
them', metadata={'lang': 'eng', 'offset': '0', 'len': '82', 'year': '2019', 'director': 'Greta Gerwig', 'rating': '8.3', 'source': 'langchain'})]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and a filter\n", + "retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f900e40e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='science fiction')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a highly rated (above 8.5) science fiction film?\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "12a51522", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='toys' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GT: 'gt'>, attribute='year', value=1990), Comparison(comparator=<Comparator.LT: 'lt'>, attribute='year', value=2005), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='animated')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a movie after 1990 but before 2005 that's all about toys, and preferably is animated\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51", + "metadata": {}, + "source": [ + "## Filter k\n", + "\n", + "We can also use the self query retriever to specify `k`: the number of documents to fetch.\n", + "\n", + "We can do this by passing `enable_limit=True` to the constructor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bff36b88-b506-4877-9c63-e5a1a8d78e64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "retriever = SelfQueryRetriever.from_llm(\n", + " llm,\n", + " vectara,\n", + " document_content_description,\n", + " metadata_field_info,\n", + " enable_limit=True,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2758d229-4f97-499c-819f-888acaf8ee10", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=2\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'lang': 'eng', 'offset': '0', 'len': '66', 'year': '1993', 'rating': '7.7', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"what are two movies about dinosaurs\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/retrievers/self_query/base.py b/libs/langchain/langchain/retrievers/self_query/base.py index 0251bff52c..a34c756bd4 100644 --- a/libs/langchain/langchain/retrievers/self_query/base.py +++ b/libs/langchain/langchain/retrievers/self_query/base.py @@ -16,6 +16,7 @@ from langchain.retrievers.self_query.milvus import MilvusTranslator from langchain.retrievers.self_query.myscale import MyScaleTranslator from langchain.retrievers.self_query.pinecone import PineconeTranslator from langchain.retrievers.self_query.qdrant import QdrantTranslator +from langchain.retrievers.self_query.vectara import VectaraTranslator from langchain.retrievers.self_query.weaviate import WeaviateTranslator from langchain.schema import BaseRetriever, Document from langchain.schema.language_model import BaseLanguageModel @@ -28,6 +29,7 @@ from langchain.vectorstores import ( MyScale, Pinecone, Qdrant, + Vectara, VectorStore, Weaviate, ) @@ -41,6 +43,7 @@ def _get_builtin_translator(vectorstore: VectorStore) -> Visitor: Chroma: ChromaTranslator, DashVector: DashvectorTranslator, Weaviate: WeaviateTranslator, + Vectara: VectaraTranslator, Qdrant: QdrantTranslator, MyScale: MyScaleTranslator, DeepLake: DeepLakeTranslator, diff --git a/libs/langchain/langchain/retrievers/self_query/vectara.py b/libs/langchain/langchain/retrievers/self_query/vectara.py new file mode 100644 index 0000000000..73dc46ff59 --- /dev/null +++ b/libs/langchain/langchain/retrievers/self_query/vectara.py @@ -0,0 +1,69 @@ +from typing import Tuple, Union + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, + Visitor, +) + + +def process_value(value: Union[int, float, str]) -> str: + if 
isinstance(value, str): + return f"'{value}'" + else: + return str(value) + + +class VectaraTranslator(Visitor): + """Translate `Vectara` internal query language elements to valid filters.""" + + allowed_operators = [Operator.AND, Operator.OR] + """Subset of allowed logical operators.""" + allowed_comparators = [ + Comparator.EQ, + Comparator.NE, + Comparator.GT, + Comparator.GTE, + Comparator.LT, + Comparator.LTE, + ] + """Subset of allowed logical comparators.""" + + def _format_func(self, func: Union[Operator, Comparator]) -> str: + map_dict = { + Operator.AND: " and ", + Operator.OR: " or ", + Comparator.EQ: "=", + Comparator.NE: "!=", + Comparator.GT: ">", + Comparator.GTE: ">=", + Comparator.LT: "<", + Comparator.LTE: "<=", + } + self._validate_func(func) + return map_dict[func] + + def visit_operation(self, operation: Operation) -> str: + args = [arg.accept(self) for arg in operation.arguments] + operator = self._format_func(operation.operator) + return "( " + operator.join(args) + " )" + + def visit_comparison(self, comparison: Comparison) -> str: + comparator = self._format_func(comparison.comparator) + processed_value = process_value(comparison.value) + attribute = comparison.attribute + return ( + "( " + "doc." + attribute + " " + comparator + " " + processed_value + " )" + ) + + def visit_structured_query( + self, structured_query: StructuredQuery + ) -> Tuple[str, dict]: + if structured_query.filter is None: + kwargs = {} + else: + kwargs = {"filter": structured_query.filter.accept(self)} + return structured_query.query, kwargs diff --git a/libs/langchain/langchain/vectorstores/vectara.py b/libs/langchain/langchain/vectorstores/vectara.py index 457511b104..3e8a2549e2 100644 --- a/libs/langchain/langchain/vectorstores/vectara.py +++ b/libs/langchain/langchain/vectorstores/vectara.py @@ -396,8 +396,12 @@ class Vectara(VectorStore): vectara_api_key=api_key, ) """ - # Note: Vectara generates its own embeddings, so we ignore the provided - # embeddings (required by interface) + # Notes: + # * Vectara generates its own embeddings, so we ignore the provided + # embeddings (required by interface) + # * when metadatas[] are provided they are associated with each "part" + # in Vectara. 
doc_metadata can be used to provide additional metadata + # for the document itself (applies to all "texts" in this call) doc_metadata = kwargs.pop("doc_metadata", {}) vectara = cls(**kwargs) vectara.add_texts(texts, metadatas, doc_metadata=doc_metadata, **kwargs) diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py b/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py index 03f5070120..2858b41e8e 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py @@ -34,8 +34,6 @@ def test_load_returns_list_of_documents(sample_data_frame: pl.DataFrame) -> None def test_load_converts_dataframe_columns_to_document_metadata( sample_data_frame: pl.DataFrame, ) -> None: - import polars as pl - loader = PolarsDataFrameLoader(sample_data_frame) docs = loader.load() diff --git a/libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py b/libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py new file mode 100644 index 0000000000..05c15f26ac --- /dev/null +++ b/libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py @@ -0,0 +1,71 @@ +from typing import Dict, Tuple + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, +) +from langchain.retrievers.self_query.vectara import VectaraTranslator + +DEFAULT_TRANSLATOR = VectaraTranslator() + + +def test_visit_comparison() -> None: + comp = Comparison(comparator=Comparator.LT, attribute="foo", value="1") + expected = "( doc.foo < '1' )" + actual = DEFAULT_TRANSLATOR.visit_comparison(comp) + assert expected == actual + + +def test_visit_operation() -> None: + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=1), + ], + ) + expected = "( ( doc.foo < 2 ) and ( doc.bar = 'baz' ) and ( doc.abc < 1 ) )" + actual = DEFAULT_TRANSLATOR.visit_operation(op) + assert expected == actual + + +def test_visit_structured_query() -> None: + query = "What is the capital of France?" + structured_query = StructuredQuery( + query=query, + filter=None, + limit=None, + ) + expected: Tuple[str, Dict] = (query, {}) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + comp = Comparison(comparator=Comparator.LT, attribute="foo", value=1) + expected = (query, {"filter": "( doc.foo < 1 )"}) + structured_query = StructuredQuery( + query=query, + filter=comp, + limit=None, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=1), + ], + ) + structured_query = StructuredQuery(query=query, filter=op, limit=None) + expected = ( + query, + {"filter": "( ( doc.foo < 2 ) and ( doc.bar = 'baz' ) and ( doc.abc < 1 ) )"}, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual
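As a quick illustration of what the new `VectaraTranslator` produces end to end, here is a minimal sketch (not part of the diff above; the `author` and `year` attributes are illustrative only) showing how a `StructuredQuery` is converted into the `filter` string that the Vectara vectorstore receives as a search kwarg:

```python
# Minimal sketch of the VectaraTranslator added in this change.
# The attribute names ("author", "year") are hypothetical examples,
# not required corpus fields.
from langchain.chains.query_constructor.ir import (
    Comparator,
    Comparison,
    Operation,
    Operator,
    StructuredQuery,
)
from langchain.retrievers.self_query.vectara import VectaraTranslator

translator = VectaraTranslator()

structured_query = StructuredQuery(
    query="freedom",
    filter=Operation(
        operator=Operator.AND,
        arguments=[
            Comparison(comparator=Comparator.EQ, attribute="author", value="Dr. King"),
            Comparison(comparator=Comparator.GTE, attribute="year", value=1963),
        ],
    ),
    limit=None,
)

# visit_structured_query returns the plain query text plus the kwargs
# (here just "filter") that get forwarded to the vectorstore search.
query, kwargs = translator.visit_structured_query(structured_query)
print(query)
# freedom
print(kwargs["filter"])
# ( ( doc.author = 'Dr. King' ) and ( doc.year >= 1963 ) )
```

String values are single-quoted and attributes are prefixed with `doc.`, matching the expectations encoded in the unit tests above.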