mirror of https://github.com/hwchase17/langchain
Vectara (#5069)
# Vectara Integration This PR provides integration with Vectara. Implemented here are: * langchain/vectorstore/vectara.py * tests/integration_tests/vectorstores/test_vectara.py * langchain/retrievers/vectara_retriever.py And two IPYNB notebooks to do more testing: * docs/modules/chains/index_examples/vectara_text_generation.ipynb * docs/modules/indexes/vectorstores/examples/vectara.ipynb --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>pull/5200/head
parent
9c4b43b494
commit
c81fb88035
@ -0,0 +1,40 @@
|
|||||||
|
# Vectara
|
||||||
|
|
||||||
|
|
||||||
|
What is Vectara?
|
||||||
|
|
||||||
|
**Vectara Overview:**
|
||||||
|
- Vectara is a developer-first API platform for building conversational search applications
|
||||||
|
- To use Vectara - first [sign up](https://console.vectara.com/signup) and create an account. Then create a corpus and an API key for indexing and searching.
|
||||||
|
- You can use Vectara's [indexing API](https://docs.vectara.com/docs/indexing-apis/indexing) to add documents into Vectara's index
|
||||||
|
- You can use Vectara's [Search API](https://docs.vectara.com/docs/search-apis/search) to query Vectara's index (which also supports Hybrid search implicitly).
|
||||||
|
- You can use Vectara's integration with LangChain as a Vector store or using the Retriever abstraction.
|
||||||
|
|
||||||
|
## Installation and Setup
|
||||||
|
To use Vectara with LangChain no special installation steps are required. You just have to provide your customer ID, corpus ID, and an API key created within the Vectara console to enable indexing and searching.
|
||||||
|
|
||||||
|
### VectorStore
|
||||||
|
|
||||||
|
There exists a wrapper around the Vectara platform, allowing you to use it as a vectorstore, whether for semantic search or example selection.
|
||||||
|
|
||||||
|
To import this vectorstore:
|
||||||
|
```python
|
||||||
|
from langchain.vectorstores import Vectara
|
||||||
|
```
|
||||||
|
|
||||||
|
To create an instance of the Vectara vectorstore:
|
||||||
|
```python
|
||||||
|
vectara = Vectara(
|
||||||
|
vectara_customer_id=customer_id,
|
||||||
|
vectara_corpus_id=corpus_id,
|
||||||
|
vectara_api_key=api_key
|
||||||
|
)
|
||||||
|
```
|
||||||
|
The customer_id, corpus_id and api_key are optional, and if they are not supplied will be read from the environment variables `VECTARA_CUSTOMER_ID`, `VECTARA_CORPUS_ID` and `VECTARA_API_KEY`, respectively.
|
||||||
|
|
||||||
|
|
||||||
|
For a more detailed walkthrough of the Vectara wrapper, see one of the two example notebooks:
|
||||||
|
* [Chat Over Documents with Vectara](./vectara/vectara_chat.html)
|
||||||
|
* [Vectara Text Generation](./vectara/vectara_text_generation.html)
|
||||||
|
|
||||||
|
|
@ -0,0 +1,199 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Vectara Text Generation\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook is based on [chat_vector_db](https://github.com/hwchase17/langchain/blob/master/docs/modules/chains/index_examples/question_answering.ipynb) and adapted to Vectara."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Prepare Data\n",
|
||||||
|
"\n",
|
||||||
|
"First, we prepare the data. For this example, we fetch a documentation site that consists of markdown files hosted on Github and split them into small enough Documents."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.llms import OpenAI\n",
|
||||||
|
"from langchain.docstore.document import Document\n",
|
||||||
|
"import requests\n",
|
||||||
|
"from langchain.vectorstores import Vectara\n",
|
||||||
|
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||||
|
"from langchain.prompts import PromptTemplate\n",
|
||||||
|
"import pathlib\n",
|
||||||
|
"import subprocess\n",
|
||||||
|
"import tempfile"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Cloning into '.'...\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"def get_github_docs(repo_owner, repo_name):\n",
|
||||||
|
" with tempfile.TemporaryDirectory() as d:\n",
|
||||||
|
" subprocess.check_call(\n",
|
||||||
|
" f\"git clone --depth 1 https://github.com/{repo_owner}/{repo_name}.git .\",\n",
|
||||||
|
" cwd=d,\n",
|
||||||
|
" shell=True,\n",
|
||||||
|
" )\n",
|
||||||
|
" git_sha = (\n",
|
||||||
|
" subprocess.check_output(\"git rev-parse HEAD\", shell=True, cwd=d)\n",
|
||||||
|
" .decode(\"utf-8\")\n",
|
||||||
|
" .strip()\n",
|
||||||
|
" )\n",
|
||||||
|
" repo_path = pathlib.Path(d)\n",
|
||||||
|
" markdown_files = list(repo_path.glob(\"*/*.md\")) + list(\n",
|
||||||
|
" repo_path.glob(\"*/*.mdx\")\n",
|
||||||
|
" )\n",
|
||||||
|
" for markdown_file in markdown_files:\n",
|
||||||
|
" with open(markdown_file, \"r\") as f:\n",
|
||||||
|
" relative_path = markdown_file.relative_to(repo_path)\n",
|
||||||
|
" github_url = f\"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}\"\n",
|
||||||
|
" yield Document(page_content=f.read(), metadata={\"source\": github_url})\n",
|
||||||
|
"\n",
|
||||||
|
"sources = get_github_docs(\"yirenlu92\", \"deno-manual-forked\")\n",
|
||||||
|
"\n",
|
||||||
|
"source_chunks = []\n",
|
||||||
|
"splitter = CharacterTextSplitter(separator=\" \", chunk_size=1024, chunk_overlap=0)\n",
|
||||||
|
"for source in sources:\n",
|
||||||
|
" for chunk in splitter.split_text(source.page_content):\n",
|
||||||
|
" source_chunks.append(chunk)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Set Up Vector DB\n",
|
||||||
|
"\n",
|
||||||
|
"Now that we have the documentation content in chunks, let's put all this information in a vector index for easy retrieval."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"search_index = Vectara.from_texts(source_chunks, embedding=None)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Set Up LLM Chain with Custom Prompt\n",
|
||||||
|
"\n",
|
||||||
|
"Next, let's set up a simple LLM chain but give it a custom prompt for blog post generation. Note that the custom prompt is parameterized and takes two inputs: `context`, which will be the documents fetched from the vector search, and `topic`, which is given by the user."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chains import LLMChain\n",
|
||||||
|
"prompt_template = \"\"\"Use the context below to write a 400 word blog post about the topic below:\n",
|
||||||
|
" Context: {context}\n",
|
||||||
|
" Topic: {topic}\n",
|
||||||
|
" Blog post:\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"PROMPT = PromptTemplate(\n",
|
||||||
|
" template=prompt_template, input_variables=[\"context\", \"topic\"]\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"llm = OpenAI(openai_api_key=os.environ['OPENAI_API_KEY'], temperature=0)\n",
|
||||||
|
"\n",
|
||||||
|
"chain = LLMChain(llm=llm, prompt=PROMPT)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Generate Text\n",
|
||||||
|
"\n",
|
||||||
|
"Finally, we write a function to apply our inputs to the chain. The function takes an input parameter `topic`. We find the documents in the vector index that correspond to that `topic`, and use them as additional context in our simple LLM chain."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def generate_blog_post(topic):\n",
|
||||||
|
" docs = search_index.similarity_search(topic, k=4)\n",
|
||||||
|
" inputs = [{\"context\": doc.page_content, \"topic\": topic} for doc in docs]\n",
|
||||||
|
" print(chain.apply(inputs))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[{'text': '\\n\\nEnvironment variables are an essential part of any development workflow. They provide a way to store and access information that is specific to the environment in which the code is running. This can be especially useful when working with different versions of a language or framework, or when running code on different machines.\\n\\nThe Deno CLI tasks extension provides a way to easily manage environment variables when running Deno commands. This extension provides a task definition for allowing you to create tasks that execute the `deno` CLI from within the editor. The template for the Deno CLI tasks has the following interface, which can be configured in a `tasks.json` within your workspace:\\n\\nThe task definition includes the `type` field, which should be set to `deno`, and the `command` field, which is the `deno` command to run (e.g. `run`, `test`, `cache`, etc.). Additionally, you can specify additional arguments to pass on the command line, the current working directory to execute the command, and any environment variables.\\n\\nUsing environment variables with the Deno CLI tasks extension is a great way to ensure that your code is running in the correct environment. For example, if you are running a test suite,'}, {'text': '\\n\\nEnvironment variables are an important part of any programming language, and they can be used to store and access data in a variety of ways. In this blog post, we\\'ll be taking a look at environment variables specifically for the shell.\\n\\nShell variables are similar to environment variables, but they won\\'t be exported to spawned commands. They are defined with the following syntax:\\n\\n```sh\\nVAR_NAME=value\\n```\\n\\nShell variables can be used to store and access data in a variety of ways. 
For example, you can use them to store values that you want to re-use, but don\\'t want to be available in any spawned processes.\\n\\nFor example, if you wanted to store a value and then use it in a command, you could do something like this:\\n\\n```sh\\nVAR=hello && echo $VAR && deno eval \"console.log(\\'Deno: \\' + Deno.env.get(\\'VAR\\'))\"\\n```\\n\\nThis would output the following:\\n\\n```\\nhello\\nDeno: undefined\\n```\\n\\nAs you can see, the value stored in the shell variable is not available in the spawned process.\\n\\n'}, {'text': '\\n\\nWhen it comes to developing applications, environment variables are an essential part of the process. Environment variables are used to store information that can be used by applications and scripts to customize their behavior. This is especially important when it comes to developing applications with Deno, as there are several environment variables that can impact the behavior of Deno.\\n\\nThe most important environment variable for Deno is `DENO_AUTH_TOKENS`. This environment variable is used to store authentication tokens that are used to access remote resources. This is especially important when it comes to accessing remote APIs or databases. Without the proper authentication tokens, Deno will not be able to access the remote resources.\\n\\nAnother important environment variable for Deno is `DENO_DIR`. This environment variable is used to store the directory where Deno will store its files. This includes the Deno executable, the Deno cache, and the Deno configuration files. By setting this environment variable, you can ensure that Deno will always be able to find the files it needs.\\n\\nFinally, there is the `DENO_PLUGINS` environment variable. This environment variable is used to store the list of plugins that Deno will use. This is important for customizing the'}, {'text': '\\n\\nEnvironment variables are a great way to store and access sensitive information in your Deno applications. 
Deno offers built-in support for environment variables with `Deno.env`, and you can also use a `.env` file to store and access environment variables. In this blog post, we\\'ll explore both of these options and how to use them in your Deno applications.\\n\\n## Built-in `Deno.env`\\n\\nThe Deno runtime offers built-in support for environment variables with [`Deno.env`](https://deno.land/api@v1.25.3?s=Deno.env). `Deno.env` has getter and setter methods. Here is example usage:\\n\\n```ts\\nDeno.env.set(\"FIREBASE_API_KEY\", \"examplekey123\");\\nDeno.env.set(\"FIREBASE_AUTH_DOMAIN\", \"firebasedomain.com\");\\n\\nconsole.log(Deno.env.get(\"FIREBASE_API_KEY\")); // examplekey123\\nconsole.log(Deno.env.get(\"FIREBASE_AUTH_'}]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"generate_blog_post(\"environment variables\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -0,0 +1,309 @@
|
|||||||
|
"""Wrapper around Vectara vector database."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from hashlib import md5
|
||||||
|
from typing import Any, Iterable, List, Optional, Tuple, Type
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from langchain.embeddings.base import Embeddings
|
||||||
|
from langchain.schema import Document
|
||||||
|
from langchain.vectorstores.base import VectorStore, VectorStoreRetriever
|
||||||
|
|
||||||
|
|
||||||
|
class Vectara(VectorStore):
    """Implementation of Vector Store using Vectara (https://vectara.com).

    Vectara computes embeddings server-side, so no local ``Embeddings``
    object is needed (the ``embedding`` argument of ``from_texts`` is
    accepted only to satisfy the ``VectorStore`` interface and ignored).

    Example:
        .. code-block:: python

            from langchain.vectorstores import Vectara

            vectorstore = Vectara(
                vectara_customer_id=vectara_customer_id,
                vectara_corpus_id=vectara_corpus_id,
                vectara_api_key=vectara_api_key
            )
    """

    def __init__(
        self,
        vectara_customer_id: Optional[str] = None,
        vectara_corpus_id: Optional[str] = None,
        vectara_api_key: Optional[str] = None,
    ):
        """Initialize with Vectara API.

        Each credential falls back to the corresponding environment variable
        (``VECTARA_CUSTOMER_ID``, ``VECTARA_CORPUS_ID``, ``VECTARA_API_KEY``)
        when not supplied explicitly.
        """
        self._vectara_customer_id = vectara_customer_id or os.environ.get(
            "VECTARA_CUSTOMER_ID"
        )
        self._vectara_corpus_id = vectara_corpus_id or os.environ.get(
            "VECTARA_CORPUS_ID"
        )
        self._vectara_api_key = vectara_api_key or os.environ.get("VECTARA_API_KEY")
        if (
            self._vectara_customer_id is None
            or self._vectara_corpus_id is None
            or self._vectara_api_key is None
        ):
            # Warn rather than raise: the credentials may be injected later,
            # but every API call below will fail without them.
            logging.warning(
                "Can't find Vectara credentials, customer_id or corpus_id in "
                "environment."
            )
        else:
            logging.debug(f"Using corpus id {self._vectara_corpus_id}")
        self._session = requests.Session()  # to reuse connections

    def _get_post_headers(self) -> dict:
        """Returns headers that should be attached to each post request."""
        return {
            "x-api-key": self._vectara_api_key,
            "customer-id": self._vectara_customer_id,
            "Content-Type": "application/json",
        }

    def _delete_doc(self, doc_id: str) -> bool:
        """
        Delete a document from the Vectara corpus.

        Args:
            doc_id (str): ID of the document to delete.

        Returns:
            bool: True if deletion was successful, False otherwise.
        """
        body = {
            "customer_id": self._vectara_customer_id,
            "corpus_id": self._vectara_corpus_id,
            "document_id": doc_id,
        }
        response = self._session.post(
            "https://api.vectara.io/v1/delete-doc",
            data=json.dumps(body),
            verify=True,
            timeout=30,  # consistent with _index_doc; avoid hanging forever
            headers=self._get_post_headers(),
        )
        if response.status_code != 200:
            logging.error(
                f"Delete request failed for doc_id = {doc_id} with status code "
                f"{response.status_code}, reason {response.reason}, text "
                f"{response.text}"
            )
            return False
        return True

    def _index_doc(self, doc_id: str, text: str, metadata: dict) -> bool:
        """Index a single document in the Vectara corpus.

        Returns False only when the document id already exists (HTTP 409 or
        an ``ALREADY_EXISTS`` status in the response body), so the caller can
        delete and re-index; any other response is treated as success.
        """
        request: dict[str, Any] = {}
        request["customer_id"] = self._vectara_customer_id
        request["corpus_id"] = self._vectara_corpus_id
        request["document"] = {
            "document_id": doc_id,
            "metadataJson": json.dumps(metadata),
            # The same metadata is attached at document and section level so
            # it is returned with search results.
            "section": [{"text": text, "metadataJson": json.dumps(metadata)}],
        }

        response = self._session.post(
            headers=self._get_post_headers(),
            url="https://api.vectara.io/v1/index",
            data=json.dumps(request),
            timeout=30,
            verify=True,
        )

        status_code = response.status_code

        result = response.json()
        status_str = result["status"]["code"] if "status" in result else None
        if status_code == 409 or (status_str and status_str == "ALREADY_EXISTS"):
            return False
        else:
            return True

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.

        """
        # Materialize once up front: ``texts`` may be a one-shot generator,
        # and it is consumed twice below (id computation, then indexing).
        # Iterating the exhausted generator a second time would silently
        # index nothing.
        texts = list(texts)
        # Ids are content hashes, so identical texts map to the same doc id.
        ids = [md5(text.encode("utf-8")).hexdigest() for text in texts]
        for i, doc in enumerate(texts):
            doc_id = ids[i]
            metadata = metadatas[i] if metadatas else {}
            succeeded = self._index_doc(doc_id, doc, metadata)
            if not succeeded:
                # Doc id already exists: replace it via delete + re-index.
                self._delete_doc(doc_id)
                self._index_doc(doc_id, doc, metadata)
        return ids

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 5,
        alpha: float = 0.025,
        filter: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return Vectara documents most similar to query, along with scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 5.
            alpha: parameter for hybrid search (called "lambda" in Vectara
                documentation).
            filter: Dictionary of argument(s) to filter on metadata. For example a
                filter can be "doc.rating > 3.0 and part.lang = 'deu'" see
                https://docs.vectara.com/docs/search-apis/sql/filter-overview
                for more details.

        Returns:
            List of Documents most similar to the query and score for each.
        """
        response = self._session.post(
            headers=self._get_post_headers(),
            url="https://api.vectara.io/v1/query",
            data=json.dumps(
                {
                    "query": [
                        {
                            "query": query,
                            "start": 0,
                            "num_results": k,
                            # Return 3 sentences of context on each side of
                            # the matching text.
                            "context_config": {
                                "sentences_before": 3,
                                "sentences_after": 3,
                            },
                            "corpus_key": [
                                {
                                    "customer_id": self._vectara_customer_id,
                                    "corpus_id": self._vectara_corpus_id,
                                    "metadataFilter": filter,
                                    "lexical_interpolation_config": {"lambda": alpha},
                                }
                            ],
                        }
                    ]
                }
            ),
            timeout=10,
        )

        if response.status_code != 200:
            logging.error(
                "Query failed %s",
                f"(code {response.status_code}, reason {response.reason}, details "
                f"{response.text})",
            )
            return []

        result = response.json()
        responses = result["responseSet"][0]["response"]
        # Metadata fields that Vectara adds automatically; strip them so only
        # user-supplied metadata is returned on the Documents.
        vectara_default_metadata = ["lang", "len", "offset"]
        docs = [
            (
                Document(
                    page_content=x["text"],
                    metadata={
                        m["name"]: m["value"]
                        for m in x["metadata"]
                        if m["name"] not in vectara_default_metadata
                    },
                ),
                x["score"],
            )
            for x in responses
        ]
        return docs

    def similarity_search(
        self,
        query: str,
        k: int = 5,
        alpha: float = 0.025,
        filter: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return Vectara documents most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 5.
            alpha: parameter for hybrid search (called "lambda" in Vectara
                documentation).
            filter: Dictionary of argument(s) to filter on metadata. For example a
                filter can be "doc.rating > 3.0 and part.lang = 'deu'" see
                https://docs.vectara.com/docs/search-apis/sql/filter-overview for more
                details.

        Returns:
            List of Documents most similar to the query
        """
        docs_and_scores = self.similarity_search_with_score(
            query, k=k, alpha=alpha, filter=filter, **kwargs
        )
        return [doc for doc, _ in docs_and_scores]

    @classmethod
    def from_texts(
        cls: Type[Vectara],
        texts: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> Vectara:
        """Construct Vectara wrapper from raw documents.
        This is intended to be a quick way to get started.
        Example:
            .. code-block:: python

                from langchain import Vectara
                vectara = Vectara.from_texts(
                    texts,
                    vectara_customer_id=customer_id,
                    vectara_corpus_id=corpus_id,
                    vectara_api_key=api_key,
                )
        """
        # Note: Vectara generates its own embeddings, so we ignore the provided
        # embeddings (required by interface)
        vectara = cls(**kwargs)
        vectara.add_texts(texts, metadatas)
        return vectara

    def as_retriever(self, **kwargs: Any) -> VectaraRetriever:
        return VectaraRetriever(vectorstore=self, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class VectaraRetriever(VectorStoreRetriever):
    """Retriever wrapper over a :class:`Vectara` vectorstore."""

    # The backing store; all calls delegate to it.
    vectorstore: Vectara

    search_kwargs: dict = Field(default_factory=lambda: {"alpha": 0.025, "k": 5})
    """Search params.
        k: Number of Documents to return. Defaults to 5.
        alpha: parameter for hybrid search (called "lambda" in Vectara
            documentation).
        filter: Dictionary of argument(s) to filter on metadata. For example a
            filter can be "doc.rating > 3.0 and part.lang = 'deu'"} see
            https://docs.vectara.com/docs/search-apis/sql/filter-overview
            for more details.
    """

    def add_texts(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> None:
        """Add text to the Vectara vectorstore.

        Args:
            texts (List[str]): The text
            metadatas (List[dict]): Metadata dicts, must line up with existing store
        """
        # Delegates directly to the underlying vectorstore; Vectara embeds
        # server-side, so no embedding step happens here.
        self.vectorstore.add_texts(texts, metadatas)
|
@ -0,0 +1,34 @@
|
|||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.vectorstores.vectara import Vectara
|
||||||
|
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||||
|
|
||||||
|
|
||||||
|
def get_abbr(s: str) -> str:
    """Return the abbreviation of *s*: the first letter of each word.

    Words are separated by single spaces. Empty words (produced by
    consecutive, leading, or trailing spaces, or an empty input) are
    skipped instead of raising IndexError on ``word[0]``.
    """
    return "".join(word[0] for word in s.split(" ") if word)
|
||||||
|
|
||||||
|
|
||||||
|
def test_vectara_add_documents() -> None:
    """Test end to end construction and search."""
    # NOTE(review): this is a live integration test — it requires valid
    # Vectara credentials in the environment (VECTARA_CUSTOMER_ID,
    # VECTARA_CORPUS_ID, VECTARA_API_KEY) and makes network calls.

    # start with some initial documents
    texts = ["grounded generation", "retrieval augmented generation", "data privacy"]
    # FakeEmbeddings is accepted but unused by Vectara (embeddings are
    # computed server-side).
    docsearch: Vectara = Vectara.from_texts(
        texts,
        embedding=FakeEmbeddings(),
        metadatas=[{"abbr": "gg"}, {"abbr": "rag"}, {"abbr": "dp"}],
    )

    # then add some additional documents
    new_texts = ["large language model", "information retrieval", "question answering"]
    docsearch.add_documents(
        [Document(page_content=t, metadata={"abbr": get_abbr(t)}) for t in new_texts]
    )

    # finally do a similarity search to see if all works okay
    output = docsearch.similarity_search("large language model", k=2)
    assert output[0].page_content == "large language model"
    assert output[0].metadata == {"abbr": "llm"}
    assert output[1].page_content == "information retrieval"
    assert output[1].metadata == {"abbr": "ir"}
|
Loading…
Reference in New Issue