From f8cf09a2300621185dde10704a84b553d93afe30 Mon Sep 17 00:00:00 2001 From: Ofer Mendelevitch Date: Sat, 10 Jun 2023 16:27:01 -0700 Subject: [PATCH] Update to Vectara integration (#5950) This PR updates the Vectara integration (@hwchase17 ): * Adds reuse of requests.session to improve efficiency and speed. * Utilizes Vectara's low-level API (instead of standard API) to better match user's specific chunking with LangChain * Now add_texts puts all the texts into a single Vectara document so indexing is much faster. * updated variable names from alpha to lambda_val (to be consistent with Vectara docs) and added n_sentence_context so it's available to use if needed. * Updates to documentation and tests --------- Co-authored-by: Harrison Chase --- docs/integrations/vectara.md | 24 +++- docs/integrations/vectara/vectara_chat.ipynb | 52 ++++---- .../vectara/vectara_text_generation.ipynb | 5 +- .../vectorstores/examples/vectara.ipynb | 44 ++++--- .../vectorstores/examples/weaviate.ipynb | 7 +- langchain/vectorstores/vectara.py | 121 +++++++++++------- .../vectorstores/test_vectara.py | 4 +- 7 files changed, 161 insertions(+), 96 deletions(-) diff --git a/docs/integrations/vectara.md b/docs/integrations/vectara.md index 4dde4faa..6c601264 100644 --- a/docs/integrations/vectara.md +++ b/docs/integrations/vectara.md @@ -4,7 +4,7 @@ What is Vectara? **Vectara Overview:** -- Vectara is developer-first API platform for building conversational search applications +- Vectara is developer-first API platform for building GenAI applications - To use Vectara - first [sign up](https://console.vectara.com/signup) and create an account. Then create a corpus and an API key for indexing and searching. - You can use Vectara's [indexing API](https://docs.vectara.com/docs/indexing-apis/indexing) to add documents into Vectara's index - You can use Vectara's [Search API](https://docs.vectara.com/docs/search-apis/search) to query Vectara's index (which also supports Hybrid search implicitly). 
@@ -13,6 +13,13 @@ What is Vectara? ## Installation and Setup To use Vectara with LangChain no special installation steps are required. You just have to provide your customer_id, corpus ID, and an API key created within the Vectara console to enable indexing and searching. +Alternatively these can be provided as environment variables +- export `VECTARA_CUSTOMER_ID`="your_customer_id" +- export `VECTARA_CORPUS_ID`="your_corpus_id" +- export `VECTARA_API_KEY`="your-vectara-api-key" + +## Usage + ### VectorStore There exists a wrapper around the Vectara platform, allowing you to use it as a vectorstore, whether for semantic search or example selection. @@ -32,8 +39,21 @@ vectara = Vectara( ``` The customer_id, corpus_id and api_key are optional, and if they are not supplied will be read from the environment variables `VECTARA_CUSTOMER_ID`, `VECTARA_CORPUS_ID` and `VECTARA_API_KEY`, respectively. +To query the vectorstore, you can use the `similarity_search` method (or `similarity_search_with_score`), which takes a query string and returns a list of results: +```python +results = vectara.similarity_search("what is LangChain?") +``` + +`similarity_search_with_score` also supports the following additional arguments: +- `k`: number of results to return (defaults to 5) +- `lambda_val`: the [lexical matching](https://docs.vectara.com/docs/api-reference/search-apis/lexical-matching) factor for hybrid search (defaults to 0.025) +- `filter`: a [filter](https://docs.vectara.com/docs/common-use-cases/filtering-by-metadata/filter-overview) to apply to the results (default None) +- `n_sentence_context`: number of sentences to include before/after the actual matching segment when returning results. This defaults to 0 so as to return the exact text segment that matches, but can be used with other values e.g. 2 or 3 to return adjacent text segments. + +The results are returned as a list of relevant documents, and a relevance score of each document. 
+ -For a more detailed walkthrough of the Vectara wrapper, see one of the two example notebooks: +For a more detailed examples of using the Vectara wrapper, see one of these two sample notebooks: * [Chat Over Documents with Vectara](./vectara/vectara_chat.html) * [Vectara Text Generation](./vectara/vectara_text_generation.html) diff --git a/docs/integrations/vectara/vectara_chat.ipynb b/docs/integrations/vectara/vectara_chat.ipynb index 9a3318fc..1af862b3 100644 --- a/docs/integrations/vectara/vectara_chat.ipynb +++ b/docs/integrations/vectara/vectara_chat.ipynb @@ -102,21 +102,11 @@ "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "openai_api_key = os.environ['OPENAI_API_KEY']\n", "llm = OpenAI(openai_api_key=openai_api_key, temperature=0)\n", - "retriever = VectaraRetriever(vectorstore, alpha=0.025, k=5, filter=None)\n", - "\n", - "print(type(vectorstore))\n", + "retriever = vectorstore.as_retriever(lambda_val=0.025, k=5, filter=None)\n", "d = retriever.get_relevant_documents('What did the president say about Ketanji Brown Jackson')\n", "\n", "qa = ConversationalRetrievalChain.from_llm(llm, retriever, memory=memory)" @@ -142,7 +132,7 @@ { "data": { "text/plain": [ - "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, and a former federal public defender.\"" + "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.\"" ] }, "execution_count": 7, @@ -174,7 +164,7 @@ { "data": { "text/plain": [ - "' Justice Stephen Breyer.'" + "' Justice Stephen Breyer'" ] }, "execution_count": 9, @@ -241,7 +231,7 @@ { "data": { "text/plain": [ - "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, and a former federal 
public defender.\"" + "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.\"" ] }, "execution_count": 12, @@ -286,7 +276,7 @@ { "data": { "text/plain": [ - "' Justice Stephen Breyer.'" + "' Justice Stephen Breyer'" ] }, "execution_count": 14, @@ -344,7 +334,7 @@ { "data": { "text/plain": [ - "Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. A former top litigator in private practice. A former federal public defender.', metadata={'source': '../../modules/state_of_the_union.txt'})" + "Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt'})" ] }, "execution_count": 17, @@ -392,6 +382,24 @@ "result = qa({\"question\": query, \"chat_history\": chat_history, \"vectordbkwargs\": vectordbkwargs})" ] }, + { + "cell_type": "code", + "execution_count": 35, + "id": "24ebdaec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " The president said that Ketanji Brown Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.\n" + ] + } + ], + "source": [ + "print(result['answer'])" + ] + }, { "cell_type": "markdown", "id": "99b96dae", @@ -459,7 +467,7 @@ { "data": { "text/plain": [ - "' The president did not mention Ketanji Brown Jackson.'" + "\" The president said that he nominated Circuit Court of Appeals Judge Ketanji Brown Jackson, who he described as one of the nation's top legal minds, to continue Justice Breyer's legacy of excellence.\"" ] }, "execution_count": 23, @@ -538,7 +546,7 @@ { "data": { "text/plain": [ - "' The president did not mention Ketanji Brown Jackson.\\nSOURCES: ../../modules/state_of_the_union.txt'" + "\" The president said that he nominated Circuit Court of Appeals Judge Ketanji Brown Jackson, who he described as one of the nation's top legal minds, and that she will continue Justice Breyer's legacy of excellence.\\nSOURCES: ../../../state_of_the_union.txt\"" ] }, "execution_count": 27, @@ -598,7 +606,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, and a former federal public defender." + " The president said that Ketanji Brown Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence." 
] } ], @@ -620,7 +628,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " Justice Stephen Breyer." + " Justice Stephen Breyer" ] } ], @@ -681,7 +689,7 @@ { "data": { "text/plain": [ - "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, and a former federal public defender.\"" + "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.\"" ] }, "execution_count": 33, diff --git a/docs/integrations/vectara/vectara_text_generation.ipynb b/docs/integrations/vectara/vectara_text_generation.ipynb index bd70d1d5..438bad75 100644 --- a/docs/integrations/vectara/vectara_text_generation.ipynb +++ b/docs/integrations/vectara/vectara_text_generation.ipynb @@ -6,7 +6,7 @@ "source": [ "# Vectara Text Generation\n", "\n", - "This notebook is based on [chat_vector_db](https://github.com/hwchase17/langchain/blob/master/docs/modules/chains/index_examples/question_answering.ipynb) and adapted to Vectara." + "This notebook is based on [text generation](https://github.com/hwchase17/langchain/blob/master/docs/modules/chains/index_examples/vector_db_text_generation.ipynb) notebook and adapted to Vectara." ] }, { @@ -24,6 +24,7 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "from langchain.llms import OpenAI\n", "from langchain.docstore.document import Document\n", "import requests\n", @@ -159,7 +160,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[{'text': '\\n\\nEnvironment variables are an essential part of any development workflow. They provide a way to store and access information that is specific to the environment in which the code is running. 
This can be especially useful when working with different versions of a language or framework, or when running code on different machines.\\n\\nThe Deno CLI tasks extension provides a way to easily manage environment variables when running Deno commands. This extension provides a task definition for allowing you to create tasks that execute the `deno` CLI from within the editor. The template for the Deno CLI tasks has the following interface, which can be configured in a `tasks.json` within your workspace:\\n\\nThe task definition includes the `type` field, which should be set to `deno`, and the `command` field, which is the `deno` command to run (e.g. `run`, `test`, `cache`, etc.). Additionally, you can specify additional arguments to pass on the command line, the current working directory to execute the command, and any environment variables.\\n\\nUsing environment variables with the Deno CLI tasks extension is a great way to ensure that your code is running in the correct environment. For example, if you are running a test suite,'}, {'text': '\\n\\nEnvironment variables are an important part of any programming language, and they can be used to store and access data in a variety of ways. In this blog post, we\\'ll be taking a look at environment variables specifically for the shell.\\n\\nShell variables are similar to environment variables, but they won\\'t be exported to spawned commands. They are defined with the following syntax:\\n\\n```sh\\nVAR_NAME=value\\n```\\n\\nShell variables can be used to store and access data in a variety of ways. 
For example, you can use them to store values that you want to re-use, but don\\'t want to be available in any spawned processes.\\n\\nFor example, if you wanted to store a value and then use it in a command, you could do something like this:\\n\\n```sh\\nVAR=hello && echo $VAR && deno eval \"console.log(\\'Deno: \\' + Deno.env.get(\\'VAR\\'))\"\\n```\\n\\nThis would output the following:\\n\\n```\\nhello\\nDeno: undefined\\n```\\n\\nAs you can see, the value stored in the shell variable is not available in the spawned process.\\n\\n'}, {'text': '\\n\\nWhen it comes to developing applications, environment variables are an essential part of the process. Environment variables are used to store information that can be used by applications and scripts to customize their behavior. This is especially important when it comes to developing applications with Deno, as there are several environment variables that can impact the behavior of Deno.\\n\\nThe most important environment variable for Deno is `DENO_AUTH_TOKENS`. This environment variable is used to store authentication tokens that are used to access remote resources. This is especially important when it comes to accessing remote APIs or databases. Without the proper authentication tokens, Deno will not be able to access the remote resources.\\n\\nAnother important environment variable for Deno is `DENO_DIR`. This environment variable is used to store the directory where Deno will store its files. This includes the Deno executable, the Deno cache, and the Deno configuration files. By setting this environment variable, you can ensure that Deno will always be able to find the files it needs.\\n\\nFinally, there is the `DENO_PLUGINS` environment variable. This environment variable is used to store the list of plugins that Deno will use. This is important for customizing the'}, {'text': '\\n\\nEnvironment variables are a great way to store and access sensitive information in your Deno applications. 
Deno offers built-in support for environment variables with `Deno.env`, and you can also use a `.env` file to store and access environment variables. In this blog post, we\\'ll explore both of these options and how to use them in your Deno applications.\\n\\n## Built-in `Deno.env`\\n\\nThe Deno runtime offers built-in support for environment variables with [`Deno.env`](https://deno.land/api@v1.25.3?s=Deno.env). `Deno.env` has getter and setter methods. Here is example usage:\\n\\n```ts\\nDeno.env.set(\"FIREBASE_API_KEY\", \"examplekey123\");\\nDeno.env.set(\"FIREBASE_AUTH_DOMAIN\", \"firebasedomain.com\");\\n\\nconsole.log(Deno.env.get(\"FIREBASE_API_KEY\")); // examplekey123\\nconsole.log(Deno.env.get(\"FIREBASE_AUTH_'}]\n" + "[{'text': '\\n\\nEnvironment variables are a powerful tool for managing configuration settings in your applications. They allow you to store and access values from anywhere in your code, making it easier to keep your codebase organized and maintainable.\\n\\nHowever, there are times when you may want to use environment variables specifically for a single command. This is where shell variables come in. Shell variables are similar to environment variables, but they won\\'t be exported to spawned commands. They are defined with the following syntax:\\n\\n```sh\\nVAR_NAME=value\\n```\\n\\nFor example, if you wanted to use a shell variable instead of an environment variable in a command, you could do something like this:\\n\\n```sh\\nVAR=hello && echo $VAR && deno eval \"console.log(\\'Deno: \\' + Deno.env.get(\\'VAR\\'))\"\\n```\\n\\nThis would output the following:\\n\\n```\\nhello\\nDeno: undefined\\n```\\n\\nShell variables can be useful when you want to re-use a value, but don\\'t want it available in any spawned processes.\\n\\nAnother way to use environment variables is through pipelines. 
Pipelines provide a way to pipe the'}, {'text': '\\n\\nEnvironment variables are a great way to store and access sensitive information in your applications. They are also useful for configuring applications and managing different environments. In Deno, there are two ways to use environment variables: the built-in `Deno.env` and the `.env` file.\\n\\nThe `Deno.env` is a built-in feature of the Deno runtime that allows you to set and get environment variables. It has getter and setter methods that you can use to access and set environment variables. For example, you can set the `FIREBASE_API_KEY` and `FIREBASE_AUTH_DOMAIN` environment variables like this:\\n\\n```ts\\nDeno.env.set(\"FIREBASE_API_KEY\", \"examplekey123\");\\nDeno.env.set(\"FIREBASE_AUTH_DOMAIN\", \"firebasedomain.com\");\\n\\nconsole.log(Deno.env.get(\"FIREBASE_API_KEY\")); // examplekey123\\nconsole.log(Deno.env.get(\"FIREBASE_AUTH_DOMAIN\")); // firebasedomain'}, {'text': \"\\n\\nEnvironment variables are a powerful tool for managing configuration and settings in your applications. They allow you to store and access values that can be used in your code, and they can be set and changed without having to modify your code.\\n\\nIn Deno, environment variables are defined using the `export` command. For example, to set a variable called `VAR_NAME` to the value `value`, you would use the following command:\\n\\n```sh\\nexport VAR_NAME=value\\n```\\n\\nYou can then access the value of the environment variable in your code using the `Deno.env.get()` method. For example, if you wanted to log the value of the `VAR_NAME` variable, you could use the following code:\\n\\n```js\\nconsole.log(Deno.env.get('VAR_NAME'));\\n```\\n\\nYou can also set environment variables for a single command. 
To do this, you can list the environment variables before the command, like so:\\n\\n```\\nVAR=hello VAR2=bye deno run main.ts\\n```\\n\\nThis will set the environment variables `VAR` and `V\"}, {'text': \"\\n\\nEnvironment variables are a powerful tool for managing settings and configuration in your applications. They can be used to store information such as user preferences, application settings, and even passwords. In this blog post, we'll discuss how to make Deno scripts executable with a hashbang (shebang).\\n\\nA hashbang is a line of code that is placed at the beginning of a script. It tells the system which interpreter to use when running the script. In the case of Deno, the hashbang should be `#!/usr/bin/env -S deno run --allow-env`. This tells the system to use the Deno interpreter and to allow the script to access environment variables.\\n\\nOnce the hashbang is in place, you may need to give the script execution permissions. On Linux, this can be done with the command `sudo chmod +x hashbang.ts`. After that, you can execute the script by calling it like any other command: `./hashbang.ts`.\\n\\nIn the example program, we give the context permission to access the environment variables and print the Deno installation path. 
This is done by using the `Deno.env.get()` function, which returns the value of the specified environment\"}]\n" ] } ], diff --git a/docs/modules/indexes/vectorstores/examples/vectara.ipynb b/docs/modules/indexes/vectorstores/examples/vectara.ipynb index 6551f55c..034a7dae 100644 --- a/docs/modules/indexes/vectorstores/examples/vectara.ipynb +++ b/docs/modules/indexes/vectorstores/examples/vectara.ipynb @@ -101,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "8429667e", "metadata": { "ExecuteTime": { @@ -133,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "a8c513ab", "metadata": { "ExecuteTime": { @@ -145,12 +145,12 @@ "outputs": [], "source": [ "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "found_docs = vectara.similarity_search(query)" + "found_docs = vectara.similarity_search(query, n_sentence_context=0)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "fc516993", "metadata": { "ExecuteTime": { @@ -164,7 +164,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. A former top litigator in private practice. A former federal public defender.\n" + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. 
And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" ] } ], @@ -185,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "8804a21d", "metadata": { "ExecuteTime": { @@ -201,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "756a6887", "metadata": { "ExecuteTime": { @@ -214,9 +220,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. A former top litigator in private practice. A former federal public defender.\n", + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", "\n", - "Score: 1.0046461\n" + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "\n", + "Score: 0.7129974\n" ] } ], @@ -239,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "id": "9427195f", "metadata": { "ExecuteTime": { @@ -251,10 +263,10 @@ { "data": { "text/plain": [ - "VectorStoreRetriever(vectorstore=, search_type='similarity', search_kwargs={})" + "VectaraRetriever(vectorstore=, search_type='similarity', search_kwargs={'lambda_val': 0.025, 'k': 5, 'filter': '', 'n_sentence_context': '0'})" ] }, - "execution_count": 11, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -266,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, "id": "f3c70c31", "metadata": { "ExecuteTime": { @@ -278,10 +290,10 @@ { "data": { "text/plain": [ - "Document(page_content='Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. A former top litigator in private practice. A former federal public defender.', metadata={'source': '../../modules/state_of_the_union.txt'})" + "Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt'})" ] }, - "execution_count": 15, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -316,7 +328,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/docs/modules/indexes/vectorstores/examples/weaviate.ipynb b/docs/modules/indexes/vectorstores/examples/weaviate.ipynb index 2b151716..b73957ed 100644 --- a/docs/modules/indexes/vectorstores/examples/weaviate.ipynb +++ b/docs/modules/indexes/vectorstores/examples/weaviate.ipynb @@ -209,7 +209,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "8fc3487b", "metadata": {}, @@ -218,7 +217,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "281c0fcc", "metadata": {}, @@ -236,7 +234,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "503e2e75", "metadata": 
{}, @@ -273,7 +270,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "fbd7a6cb", "metadata": {}, @@ -282,7 +278,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "f349acb9", "metadata": {}, @@ -384,7 +379,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/langchain/vectorstores/vectara.py b/langchain/vectorstores/vectara.py index acd7e63d..ed6551a8 100644 --- a/langchain/vectorstores/vectara.py +++ b/langchain/vectorstores/vectara.py @@ -55,6 +55,8 @@ class Vectara(VectorStore): else: logging.debug(f"Using corpus id {self._vectara_corpus_id}") self._session = requests.Session() # to reuse connections + adapter = requests.adapters.HTTPAdapter(max_retries=3) + self._session.mount("http://", adapter) def _get_post_headers(self) -> dict: """Returns headers that should be attached to each post request.""" @@ -95,19 +97,15 @@ class Vectara(VectorStore): return False return True - def _index_doc(self, doc_id: str, text: str, metadata: dict) -> bool: + def _index_doc(self, doc: dict) -> bool: request: dict[str, Any] = {} request["customer_id"] = self._vectara_customer_id request["corpus_id"] = self._vectara_corpus_id - request["document"] = { - "document_id": doc_id, - "metadataJson": json.dumps(metadata), - "section": [{"text": text, "metadataJson": json.dumps(metadata)}], - } + request["document"] = doc response = self._session.post( headers=self._get_post_headers(), - url="https://api.vectara.io/v1/index", + url="https://api.vectara.io/v1/core/index", data=json.dumps(request), timeout=30, verify=True, @@ -138,22 +136,33 @@ class Vectara(VectorStore): List of ids from adding the texts into the vectorstore. 
""" - ids = [md5(text.encode("utf-8")).hexdigest() for text in texts] - for i, doc in enumerate(texts): - doc_id = ids[i] - metadata = metadatas[i] if metadatas else {} - succeeded = self._index_doc(doc_id, doc, metadata) - if not succeeded: - self._delete_doc(doc_id) - self._index_doc(doc_id, doc, metadata) - return ids + doc_hash = md5() + for t in texts: + doc_hash.update(t.encode()) + doc_id = doc_hash.hexdigest() + if metadatas is None: + metadatas = [{} for _ in texts] + doc = { + "document_id": doc_id, + "metadataJson": json.dumps({"source": "langchain"}), + "parts": [ + {"text": text, "metadataJson": json.dumps(md)} + for text, md in zip(texts, metadatas) + ], + } + succeeded = self._index_doc(doc) + if not succeeded: + self._delete_doc(doc_id) + self._index_doc(doc) + return [doc_id] def similarity_search_with_score( self, query: str, k: int = 5, - alpha: float = 0.025, + lambda_val: float = 0.025, filter: Optional[str] = None, + n_sentence_context: int = 0, **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return Vectara documents most similar to query, along with scores. @@ -161,42 +170,45 @@ class Vectara(VectorStore): Args: query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 5. - alpha: parameter for hybrid search (called "lambda" in Vectara - documentation). + lambda_val: lexical match parameter for hybrid search. filter: Dictionary of argument(s) to filter on metadata. For example a filter can be "doc.rating > 3.0 and part.lang = 'deu'"} see https://docs.vectara.com/docs/search-apis/sql/filter-overview for more details. + n_sentence_context: number of sentences before/after the matching segment + to add Returns: List of Documents most similar to the query and score for each. 
""" + data = json.dumps( + { + "query": [ + { + "query": query, + "start": 0, + "num_results": k, + "context_config": { + "sentences_before": n_sentence_context, + "sentences_after": n_sentence_context, + }, + "corpus_key": [ + { + "customer_id": self._vectara_customer_id, + "corpus_id": self._vectara_corpus_id, + "metadataFilter": filter, + "lexical_interpolation_config": {"lambda": lambda_val}, + } + ], + } + ] + } + ) + response = self._session.post( headers=self._get_post_headers(), url="https://api.vectara.io/v1/query", - data=json.dumps( - { - "query": [ - { - "query": query, - "start": 0, - "num_results": k, - "context_config": { - "sentences_before": 3, - "sentences_after": 3, - }, - "corpus_key": [ - { - "customer_id": self._vectara_customer_id, - "corpus_id": self._vectara_corpus_id, - "metadataFilter": filter, - "lexical_interpolation_config": {"lambda": alpha}, - } - ], - } - ] - } - ), + data=data, timeout=10, ) @@ -231,8 +243,9 @@ class Vectara(VectorStore): self, query: str, k: int = 5, - alpha: float = 0.025, + lambda_val: float = 0.025, filter: Optional[str] = None, + n_sentence_context: int = 0, **kwargs: Any, ) -> List[Document]: """Return Vectara documents most similar to query, along with scores. @@ -244,12 +257,19 @@ class Vectara(VectorStore): filter can be "doc.rating > 3.0 and part.lang = 'deu'"} see https://docs.vectara.com/docs/search-apis/sql/filter-overview for more details. 
+ n_sentence_context: number of sentences before/after the matching segment + to add Returns: List of Documents most similar to the query """ docs_and_scores = self.similarity_search_with_score( - query, k=k, alpha=alpha, filter=filter, **kwargs + query, + k=k, + lambda_val=lambda_val, + filter=filter, + n_sentence_context=n_sentence_context, + **kwargs, ) return [doc for doc, _ in docs_and_scores] @@ -286,15 +306,22 @@ class Vectara(VectorStore): class VectaraRetriever(VectorStoreRetriever): vectorstore: Vectara - search_kwargs: dict = Field(default_factory=lambda: {"alpha": 0.025, "k": 5}) + search_kwargs: dict = Field( + default_factory=lambda: { + "lambda_val": 0.025, + "k": 5, + "filter": "", + "n_sentence_context": "0", + } + ) """Search params. k: Number of Documents to return. Defaults to 5. - alpha: parameter for hybrid search (called "lambda" in Vectara - documentation). + lambda_val: lexical match parameter for hybrid search. filter: Dictionary of argument(s) to filter on metadata. For example a filter can be "doc.rating > 3.0 and part.lang = 'deu'"} see https://docs.vectara.com/docs/search-apis/sql/filter-overview for more details. + n_sentence_context: number of sentences before/after the matching segment to add """ def add_texts( diff --git a/tests/integration_tests/vectorstores/test_vectara.py b/tests/integration_tests/vectorstores/test_vectara.py index aaa5eaa5..2a08194d 100644 --- a/tests/integration_tests/vectorstores/test_vectara.py +++ b/tests/integration_tests/vectorstores/test_vectara.py @@ -27,7 +27,9 @@ def test_vectara_add_documents() -> None: ) # finally do a similarity search to see if all works okay - output = docsearch.similarity_search("large language model", k=2) + output = docsearch.similarity_search( + "large language model", k=2, n_sentence_context=0 + ) assert output[0].page_content == "large language model" assert output[0].metadata == {"abbr": "llm"} assert output[1].page_content == "information retrieval"