mirror of https://github.com/hwchase17/langchain
zep/new ZepVectorStore (#9159)
- new ZepVectorStore class - ZepVectorStore unit tests - ZepVectorStore demo notebook - update zep-python to ~1.0.2 --------- Co-authored-by: Bagatur <baskaryan@gmail.com>pull/9300/head
parent
2519580994
commit
1d55141c50
@ -0,0 +1,494 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"# Zep\n",
|
||||||
|
"\n",
|
||||||
|
"Zep is an open source long-term memory store for LLM applications. Zep makes it easy to add relevant documents,\n",
|
||||||
|
"chat history memory & rich user data to your LLM app's prompts.\n",
|
||||||
|
"\n",
|
||||||
|
"**Note:** The `ZepVectorStore` works with `Documents` and is intended to be used as a `Retriever`.\n",
|
||||||
|
"It offers separate functionality to Zep's `ZepMemory` class, which is designed for persisting, enriching\n",
|
||||||
|
"and searching your user's chat history.\n",
|
||||||
|
"\n",
|
||||||
|
"## Why Zep's VectorStore? 🤖🚀\n",
|
||||||
|
"Zep automatically embeds documents added to the Zep Vector Store using low-latency models local to the Zep server.\n",
|
||||||
|
"The Zep client also offers async interfaces for all document operations. These two together with Zep's chat memory\n",
|
||||||
|
" functionality make Zep ideal for building conversational LLM apps where latency and performance are important.\n",
|
||||||
|
"\n",
|
||||||
|
"## Installation\n",
|
||||||
|
"Follow the [Zep Quickstart Guide](https://docs.getzep.com/deployment/quickstart/) to install and get started with Zep.\n",
|
||||||
|
"\n",
|
||||||
|
"## Usage\n",
|
||||||
|
"\n",
|
||||||
|
"You'll need your Zep API URL and optionally an API key to use the Zep VectorStore. \n",
|
||||||
|
"See the [Zep docs](https://docs.getzep.com) for more information.\n",
|
||||||
|
"\n",
|
||||||
|
"In the examples below, we're using Zep's auto-embedding feature which automatically embeds documents on the Zep server \n",
|
||||||
|
"using low-latency embedding models.\n",
|
||||||
|
"\n",
|
||||||
|
"## Note\n",
|
||||||
|
"- These examples use Zep's async interfaces. Call sync interfaces by removing the `a` prefix from the method names.\n",
|
||||||
|
"- If you pass in an `Embeddings` instance Zep will use this to embed documents rather than auto-embed them.\n",
|
||||||
|
"You must also set your document collection to `isAutoEmbedded === false`. \n",
|
||||||
|
"- If you set your collection to `isAutoEmbedded === false`, you must pass in an `Embeddings` instance."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"id": "9eb8dfa6fdb71ef5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Load or create a Collection from documents"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"id": "9a3a11aab1412d98"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from uuid import uuid4\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain.document_loaders import WebBaseLoader\n",
|
||||||
|
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||||||
|
"from langchain.vectorstores import ZepVectorStore\n",
|
||||||
|
"from langchain.vectorstores.zep import CollectionConfig\n",
|
||||||
|
"\n",
|
||||||
|
"ZEP_API_URL = \"http://localhost:8000\" # this is the API url of your Zep instance\n",
|
||||||
|
"ZEP_API_KEY = \"<optional_key>\" # optional API Key for your Zep instance\n",
|
||||||
|
"collection_name = f\"babbage{uuid4().hex}\" # a unique collection name. alphanum only\n",
|
||||||
|
"\n",
|
||||||
|
"# Collection config is needed if we're creating a new Zep Collection\n",
|
||||||
|
"config = CollectionConfig(\n",
|
||||||
|
" name=collection_name,\n",
|
||||||
|
" description=\"<optional description>\",\n",
|
||||||
|
" metadata={\"optional_metadata\": \"associated with the collection\"},\n",
|
||||||
|
" is_auto_embedded=True, # we'll have Zep embed our documents using its low-latency embedder\n",
|
||||||
|
" embedding_dimensions=1536 # this should match the model you've configured Zep to use.\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# load the document\n",
|
||||||
|
"article_url = \"https://www.gutenberg.org/cache/epub/71292/pg71292.txt\"\n",
|
||||||
|
"loader = WebBaseLoader(article_url)\n",
|
||||||
|
"documents = loader.load()\n",
|
||||||
|
"\n",
|
||||||
|
"# split it into chunks\n",
|
||||||
|
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
|
||||||
|
"docs = text_splitter.split_documents(documents)\n",
|
||||||
|
"\n",
|
||||||
|
"# Instantiate the VectorStore. Since the collection does not already exist in Zep,\n",
|
||||||
|
"# it will be created and populated with the documents we pass in.\n",
|
||||||
|
"vs = ZepVectorStore.from_documents(docs,\n",
|
||||||
|
" collection_name=collection_name,\n",
|
||||||
|
" config=config,\n",
|
||||||
|
" api_url=ZEP_API_URL,\n",
|
||||||
|
" api_key=ZEP_API_KEY\n",
|
||||||
|
" )"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-08-13T01:07:50.672390Z",
|
||||||
|
"start_time": "2023-08-13T01:07:48.777799Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"id": "519418421a32e4d"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Embedding status: 0/402 documents embedded\n",
|
||||||
|
"Embedding status: 0/402 documents embedded\n",
|
||||||
|
"Embedding status: 402/402 documents embedded\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# wait for the collection embedding to complete\n",
|
||||||
|
"\n",
|
||||||
|
"async def wait_for_ready(collection_name: str) -> None:\n",
|
||||||
|
" from zep_python import ZepClient\n",
|
||||||
|
" import time\n",
|
||||||
|
"\n",
|
||||||
|
" client = ZepClient(ZEP_API_URL, ZEP_API_KEY)\n",
|
||||||
|
"\n",
|
||||||
|
" while True:\n",
|
||||||
|
" c = await client.document.aget_collection(collection_name)\n",
|
||||||
|
" print(\n",
|
||||||
|
" \"Embedding status: \"\n",
|
||||||
|
" f\"{c.document_embedded_count}/{c.document_count} documents embedded\"\n",
|
||||||
|
" )\n",
|
||||||
|
" time.sleep(1)\n",
|
||||||
|
" if c.status == \"ready\":\n",
|
||||||
|
" break\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"await wait_for_ready(collection_name)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-08-13T01:07:53.807663Z",
|
||||||
|
"start_time": "2023-08-13T01:07:50.671241Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"id": "201dc57b124cb6d7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Similarity Search Query over the Collection"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"id": "94ca9dfa7d0ecaa5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tables necessary to determine the places of the planets are not less\r\n",
|
||||||
|
"necessary than those for the sun, moon, and stars. Some notion of the\r\n",
|
||||||
|
"number and complexity of these tables may be formed, when we state that\r\n",
|
||||||
|
"the positions of the two principal planets, (and these the most\r\n",
|
||||||
|
"necessary for the navigator,) Jupiter and Saturn, require each not less\r\n",
|
||||||
|
"than one hundred and sixteen tables. Yet it is not only necessary to\r\n",
|
||||||
|
"predict the position of these bodies, but it is likewise expedient to -> 0.8998482592744614 \n",
|
||||||
|
"====\n",
|
||||||
|
"\n",
|
||||||
|
"tabulate the motions of the four satellites of Jupiter, to predict the\r\n",
|
||||||
|
"exact times at which they enter his shadow, and at which their shadows\r\n",
|
||||||
|
"cross his disc, as well as the times at which they are interposed\r\n",
|
||||||
|
"between him and the Earth, and he between them and the Earth.\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"Among the extensive classes of tables here enumerated, there are several\r\n",
|
||||||
|
"which are in their nature permanent and unalterable, and would never\r\n",
|
||||||
|
"require to be recomputed, if they could once be computed with perfect -> 0.8976143854195493 \n",
|
||||||
|
"====\n",
|
||||||
|
"\n",
|
||||||
|
"the scheme of notation thus applied, immediately suggested the\r\n",
|
||||||
|
"advantages which must attend it as an instrument for expressing the\r\n",
|
||||||
|
"structure, operation, and circulation of the animal system; and we\r\n",
|
||||||
|
"entertain no doubt of its adequacy for that purpose. Not only the\r\n",
|
||||||
|
"mechanical connexion of the solid members of the bodies of men and\r\n",
|
||||||
|
"animals, but likewise the structure and operation of the softer parts,\r\n",
|
||||||
|
"including the muscles, integuments, membranes, &c. the nature, motion, -> 0.889982614061763 \n",
|
||||||
|
"====\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# query it\n",
|
||||||
|
"query = \"what is the structure of our solar system?\"\n",
|
||||||
|
"docs_scores = await vs.asimilarity_search_with_relevance_scores(query, k=3)\n",
|
||||||
|
"\n",
|
||||||
|
"# print results\n",
|
||||||
|
"for d, s in docs_scores:\n",
|
||||||
|
" print(d.page_content, \" -> \", s, \"\\n====\\n\")"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-08-13T01:07:54.195988Z",
|
||||||
|
"start_time": "2023-08-13T01:07:53.808550Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"id": "1998de0a96fe89c3"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Search over Collection Re-ranked by MMR"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"id": "e02b61a9af0b2c80"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tables necessary to determine the places of the planets are not less\r\n",
|
||||||
|
"necessary than those for the sun, moon, and stars. Some notion of the\r\n",
|
||||||
|
"number and complexity of these tables may be formed, when we state that\r\n",
|
||||||
|
"the positions of the two principal planets, (and these the most\r\n",
|
||||||
|
"necessary for the navigator,) Jupiter and Saturn, require each not less\r\n",
|
||||||
|
"than one hundred and sixteen tables. Yet it is not only necessary to\r\n",
|
||||||
|
"predict the position of these bodies, but it is likewise expedient to \n",
|
||||||
|
"====\n",
|
||||||
|
"\n",
|
||||||
|
"the scheme of notation thus applied, immediately suggested the\r\n",
|
||||||
|
"advantages which must attend it as an instrument for expressing the\r\n",
|
||||||
|
"structure, operation, and circulation of the animal system; and we\r\n",
|
||||||
|
"entertain no doubt of its adequacy for that purpose. Not only the\r\n",
|
||||||
|
"mechanical connexion of the solid members of the bodies of men and\r\n",
|
||||||
|
"animals, but likewise the structure and operation of the softer parts,\r\n",
|
||||||
|
"including the muscles, integuments, membranes, &c. the nature, motion, \n",
|
||||||
|
"====\n",
|
||||||
|
"\n",
|
||||||
|
"tabulate the motions of the four satellites of Jupiter, to predict the\r\n",
|
||||||
|
"exact times at which they enter his shadow, and at which their shadows\r\n",
|
||||||
|
"cross his disc, as well as the times at which they are interposed\r\n",
|
||||||
|
"between him and the Earth, and he between them and the Earth.\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"Among the extensive classes of tables here enumerated, there are several\r\n",
|
||||||
|
"which are in their nature permanent and unalterable, and would never\r\n",
|
||||||
|
"require to be recomputed, if they could once be computed with perfect \n",
|
||||||
|
"====\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"query = \"what is the structure of our solar system?\"\n",
|
||||||
|
"docs = await vs.asearch(query, search_type=\"mmr\", k=3)\n",
|
||||||
|
"\n",
|
||||||
|
"for d in docs:\n",
|
||||||
|
" print(d.page_content, \"\\n====\\n\")"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-08-13T01:07:54.394873Z",
|
||||||
|
"start_time": "2023-08-13T01:07:54.180901Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"id": "488112da752b1d58"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"# Filter by Metadata\n",
|
||||||
|
"\n",
|
||||||
|
"Use a metadata filter to narrow down results. First, load another book: \"Adventures of Sherlock Holmes\""
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"id": "42455e31d4ab0d68"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Embedding status: 402/1692 documents embedded\n",
|
||||||
|
"Embedding status: 402/1692 documents embedded\n",
|
||||||
|
"Embedding status: 552/1692 documents embedded\n",
|
||||||
|
"Embedding status: 702/1692 documents embedded\n",
|
||||||
|
"Embedding status: 1002/1692 documents embedded\n",
|
||||||
|
"Embedding status: 1002/1692 documents embedded\n",
|
||||||
|
"Embedding status: 1152/1692 documents embedded\n",
|
||||||
|
"Embedding status: 1302/1692 documents embedded\n",
|
||||||
|
"Embedding status: 1452/1692 documents embedded\n",
|
||||||
|
"Embedding status: 1602/1692 documents embedded\n",
|
||||||
|
"Embedding status: 1692/1692 documents embedded\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Let's add more content to the existing Collection\n",
|
||||||
|
"article_url = \"https://www.gutenberg.org/files/48320/48320-0.txt\"\n",
|
||||||
|
"loader = WebBaseLoader(article_url)\n",
|
||||||
|
"documents = loader.load()\n",
|
||||||
|
"\n",
|
||||||
|
"# split it into chunks\n",
|
||||||
|
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
|
||||||
|
"docs = text_splitter.split_documents(documents)\n",
|
||||||
|
"\n",
|
||||||
|
"await vs.aadd_documents(docs)\n",
|
||||||
|
"\n",
|
||||||
|
"await wait_for_ready(collection_name)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-08-13T01:08:06.323569Z",
|
||||||
|
"start_time": "2023-08-13T01:07:54.381822Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"id": "146c8a96201c0ab9"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"### We see results from both books. Note the `source` metadata"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"id": "5b225f3ae1e61de8"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"by that body to Mr Babbage:--'In no department of science, or of the\r\n",
|
||||||
|
"arts, does this discovery promise to be so eminently useful as in that\r\n",
|
||||||
|
"of astronomy, and its kindred sciences, with the various arts dependent\r\n",
|
||||||
|
"on them. In none are computations more operose than those which\r\n",
|
||||||
|
"astronomy in particular requires;--in none are preparatory facilities\r\n",
|
||||||
|
"more needful;--in none is error more detrimental. The practical\r\n",
|
||||||
|
"astronomer is interrupted in his pursuit, and diverted from his task of -> {'source': 'https://www.gutenberg.org/cache/epub/71292/pg71292.txt'} \n",
|
||||||
|
"====\n",
|
||||||
|
"\n",
|
||||||
|
"possess all knowledge which is likely to be useful to him in his work,\r\n",
|
||||||
|
"and this I have endeavored in my case to do. If I remember rightly, you\r\n",
|
||||||
|
"on one occasion, in the early days of our friendship, defined my limits\r\n",
|
||||||
|
"in a very precise fashion.”\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"“Yes,” I answered, laughing. “It was a singular document. Philosophy,\r\n",
|
||||||
|
"astronomy, and politics were marked at zero, I remember. Botany\r\n",
|
||||||
|
"variable, geology profound as regards the mud-stains from any region -> {'source': 'https://www.gutenberg.org/files/48320/48320-0.txt'} \n",
|
||||||
|
"====\n",
|
||||||
|
"\n",
|
||||||
|
"in all its relations; but above all, with Astronomy and Navigation. So\r\n",
|
||||||
|
"important have they been considered, that in many instances large sums\r\n",
|
||||||
|
"have been appropriated by the most enlightened nations in the production\r\n",
|
||||||
|
"of them; and yet so numerous and insurmountable have been the\r\n",
|
||||||
|
"difficulties attending the attainment of this end, that after all, even\r\n",
|
||||||
|
"navigators, putting aside every other department of art and science,\r\n",
|
||||||
|
"have, until very recently, been scantily and imperfectly supplied with -> {'source': 'https://www.gutenberg.org/cache/epub/71292/pg71292.txt'} \n",
|
||||||
|
"====\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"query = \"Was he interested in astronomy?\"\n",
|
||||||
|
"docs = await vs.asearch(query, search_type=\"similarity\", k=3)\n",
|
||||||
|
"\n",
|
||||||
|
"for d in docs:\n",
|
||||||
|
" print(d.page_content, \" -> \", d.metadata, \"\\n====\\n\")"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-08-13T01:08:06.504769Z",
|
||||||
|
"start_time": "2023-08-13T01:08:06.325435Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"id": "53700a9cd817cde4"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"### Let's try again using a filter for only the Sherlock Holmes document."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"id": "7b81d7cae351a1ec"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"possess all knowledge which is likely to be useful to him in his work,\r\n",
|
||||||
|
"and this I have endeavored in my case to do. If I remember rightly, you\r\n",
|
||||||
|
"on one occasion, in the early days of our friendship, defined my limits\r\n",
|
||||||
|
"in a very precise fashion.”\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"“Yes,” I answered, laughing. “It was a singular document. Philosophy,\r\n",
|
||||||
|
"astronomy, and politics were marked at zero, I remember. Botany\r\n",
|
||||||
|
"variable, geology profound as regards the mud-stains from any region -> {'source': 'https://www.gutenberg.org/files/48320/48320-0.txt'} \n",
|
||||||
|
"====\n",
|
||||||
|
"\n",
|
||||||
|
"the light shining upon his strong-set aquiline features. So he sat as I\r\n",
|
||||||
|
"dropped off to sleep, and so he sat when a sudden ejaculation caused me\r\n",
|
||||||
|
"to wake up, and I found the summer sun shining into the apartment. The\r\n",
|
||||||
|
"pipe was still between his lips, the smoke still curled upward, and the\r\n",
|
||||||
|
"room was full of a dense tobacco haze, but nothing remained of the heap\r\n",
|
||||||
|
"of shag which I had seen upon the previous night.\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"“Awake, Watson?” he asked.\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"“Yes.”\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"“Game for a morning drive?” -> {'source': 'https://www.gutenberg.org/files/48320/48320-0.txt'} \n",
|
||||||
|
"====\n",
|
||||||
|
"\n",
|
||||||
|
"“I glanced at the books upon the table, and in spite of my ignorance\r\n",
|
||||||
|
"of German I could see that two of them were treatises on science, the\r\n",
|
||||||
|
"others being volumes of poetry. Then I walked across to the window,\r\n",
|
||||||
|
"hoping that I might catch some glimpse of the country-side, but an oak\r\n",
|
||||||
|
"shutter, heavily barred, was folded across it. It was a wonderfully\r\n",
|
||||||
|
"silent house. There was an old clock ticking loudly somewhere in the\r\n",
|
||||||
|
"passage, but otherwise everything was deadly still. A vague feeling of -> {'source': 'https://www.gutenberg.org/files/48320/48320-0.txt'} \n",
|
||||||
|
"====\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"filter = {\n",
|
||||||
|
" \"where\": {\"jsonpath\": \"$[*] ? (@.source == 'https://www.gutenberg.org/files/48320/48320-0.txt')\"},\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"docs = await vs.asearch(query, search_type=\"similarity\", metadata=filter, k=3)\n",
|
||||||
|
"\n",
|
||||||
|
"for d in docs:\n",
|
||||||
|
" print(d.page_content, \" -> \", d.metadata, \"\\n====\\n\")"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false,
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2023-08-13T01:08:06.672836Z",
|
||||||
|
"start_time": "2023-08-13T01:08:06.505944Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"id": "8f1bdcba03979d22"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 2
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython2",
|
||||||
|
"version": "2.7.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,674 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import warnings
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.embeddings.base import Embeddings
|
||||||
|
from langchain.vectorstores.base import VectorStore
|
||||||
|
from langchain.vectorstores.utils import maximal_marginal_relevance
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from zep_python.document import Document as ZepDocument
|
||||||
|
from zep_python.document import DocumentCollection
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CollectionConfig:
    """Configuration describing a Zep document collection.

    Handed to ``ZepVectorStore`` so that a collection which does not yet
    exist on the server can be created.

    Attributes:
        name: The name of the collection.
        description: Optional human-readable description of the collection.
        metadata: Optional metadata to attach to the collection.
        embedding_dimensions: Width of the embedding vectors stored in the
            collection; this should match the Zep server configuration when
            auto-embedding is enabled.
        is_auto_embedded: Whether Zep embeds documents server-side.
    """

    name: str
    description: Optional[str]
    metadata: Optional[Dict[str, Any]]
    embedding_dimensions: int
    is_auto_embedded: bool
|
||||||
|
|
||||||
|
|
||||||
|
class ZepVectorStore(VectorStore):
|
||||||
|
"""
|
||||||
|
ZepVectorStore is a VectorStore implementation that uses the Zep long-term memory
|
||||||
|
store as a backend. It provides methods for adding texts or documents to the store,
|
||||||
|
searching for similar documents, and deleting documents.
|
||||||
|
|
||||||
|
Search scores are calculated using cosine similarity normalized to [0, 1].
|
||||||
|
|
||||||
|
Args:
|
||||||
|
api_url (str): The URL of the Zep API.
|
||||||
|
collection_name (str): The name of the collection in the Zep store.
|
||||||
|
api_key (Optional[str]): The API key for the Zep API.
|
||||||
|
config (Optional[CollectionConfig]): The configuration for the collection.
|
||||||
|
Required if the collection does not already exist.
|
||||||
|
embedding (Optional[Embeddings]): Optional embedding function to use to
|
||||||
|
embed the texts. Required if the collection is not auto-embedded.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    collection_name: str,
    api_url: str,
    *,
    api_key: Optional[str] = None,
    config: Optional[CollectionConfig] = None,
    embedding: Optional[Embeddings] = None,
) -> None:
    """Connect to a Zep server and load (or create) the backing collection.

    Args:
        collection_name: Name of the Zep document collection to use.
        api_url: Base URL of the Zep API.
        api_key: Optional API key for the Zep API.
        config: Collection configuration; required if the collection does
            not already exist on the server.
        embedding: Optional embedding function; required when the
            collection is not auto-embedded.

    Raises:
        ValueError: If ``collection_name`` is empty.
        ImportError: If the ``zep-python`` package is not installed.
    """
    super().__init__()
    if not collection_name:
        raise ValueError(
            "collection_name must be specified when using ZepVectorStore."
        )
    try:
        from zep_python import ZepClient
    except ImportError as e:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(
            "Could not import zep-python python package. "
            "Please install it with `pip install zep-python`."
        ) from e
    self._client = ZepClient(api_url, api_key=api_key)

    self.collection_name = collection_name
    # If for some reason the collection name is not the same as the one in
    # the config, update the config so the right collection gets created.
    if config and config.name != self.collection_name:
        config.name = self.collection_name

    self._collection_config = config
    self._collection = self._load_collection()
    self._embedding = embedding
|
||||||
|
|
||||||
|
@property
def embeddings(self) -> Optional[Embeddings]:
    """Return the embedding function supplied at construction, if any."""
    return self._embedding
|
||||||
|
|
||||||
|
def _load_collection(self) -> DocumentCollection:
    """Fetch the named collection from the Zep backend, creating it when missing."""
    from zep_python import NotFoundError

    try:
        return self._client.document.get_collection(self.collection_name)
    except NotFoundError:
        # First use of this collection name: create it from the stored config.
        logger.info(
            f"Collection {self.collection_name} not found. "
            "Creating new collection."
        )
        return self._create_collection()
|
||||||
|
|
||||||
|
def _create_collection(self) -> DocumentCollection:
|
||||||
|
"""
|
||||||
|
Create a new collection in the Zep backend.
|
||||||
|
"""
|
||||||
|
if not self._collection_config:
|
||||||
|
raise ValueError(
|
||||||
|
"Collection config must be specified when creating a new collection."
|
||||||
|
)
|
||||||
|
collection = self._client.document.add_collection(
|
||||||
|
**asdict(self._collection_config)
|
||||||
|
)
|
||||||
|
return collection
|
||||||
|
|
||||||
|
def _generate_documents_to_add(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[Dict[Any, Any]]] = None,
    document_ids: Optional[List[str]] = None,
) -> List[ZepDocument]:
    """Build ``ZepDocument`` objects from raw texts.

    Texts are embedded client-side only when the collection is NOT
    auto-embedded and an embedding function was supplied; an auto-embedded
    collection ignores the embedding function with a warning.

    Args:
        texts: The texts to wrap into documents.
        metadatas: Optional per-text metadata, parallel to ``texts``.
        document_ids: Optional per-text document ids, parallel to ``texts``.

    Returns:
        A list of ``ZepDocument`` ready to send to the server.

    Raises:
        ValueError: If the embedding function's output width does not match
            the collection's configured embedding dimensions.
    """
    from zep_python.document import Document as ZepDocument

    # Materialize once: a generator input would otherwise be exhausted by
    # the embedding call below and the document loop would yield nothing.
    texts = list(texts)

    embeddings = None
    if self._collection and self._collection.is_auto_embedded:
        if self._embedding is not None:
            warnings.warn(
                """The collection is set to auto-embed and an embedding
                function is present. Ignoring the embedding function.""",
                stacklevel=2,
            )
    elif self._embedding is not None:
        embeddings = self._embedding.embed_documents(texts)
        # Guard on `embeddings` so empty input cannot raise IndexError on
        # `embeddings[0]`.
        if (
            embeddings
            and self._collection
            and self._collection.embedding_dimensions != len(embeddings[0])
        ):
            raise ValueError(
                "The embedding dimensions of the collection and the embedding"
                " function do not match. Collection dimensions:"
                f" {self._collection.embedding_dimensions}, Embedding dimensions:"
                f" {len(embeddings[0])}"
            )

    return [
        ZepDocument(
            content=d,
            metadata=metadatas[i] if metadatas else None,
            document_id=document_ids[i] if document_ids else None,
            embedding=embeddings[i] if embeddings else None,
        )
        for i, d in enumerate(texts)
    ]
|
||||||
|
|
||||||
|
def add_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[Dict[str, Any]]] = None,
    document_ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[str]:
    """Embed the given texts (if needed) and store them in the collection.

    Args:
        texts: Iterable of strings to add to the vectorstore.
        metadatas: Optional list of metadatas associated with the texts.
        document_ids: Optional list of document ids associated with the texts.
        kwargs: Vectorstore-specific parameters (unused here).

    Returns:
        List of ids of the documents added to the vectorstore.

    Raises:
        ValueError: If no collection has been loaded.
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )
    return self._collection.add_documents(
        self._generate_documents_to_add(texts, metadatas, document_ids)
    )
|
||||||
|
|
||||||
|
async def aadd_texts(
    self,
    texts: Iterable[str],
    metadatas: Optional[List[Dict[str, Any]]] = None,
    document_ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[str]:
    """Async variant of ``add_texts``: embed (if needed) and store the texts.

    Raises:
        ValueError: If no collection has been loaded.
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )
    return await self._collection.aadd_documents(
        self._generate_documents_to_add(texts, metadatas, document_ids)
    )
|
||||||
|
|
||||||
|
def search(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
search_type: str,
|
||||||
|
metadata: Optional[Dict[str, Any]] = None,
|
||||||
|
k: int = 3,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> List[Document]:
|
||||||
|
"""Return docs most similar to query using specified search type."""
|
||||||
|
if search_type == "similarity":
|
||||||
|
return self.similarity_search(query, k=k, metadata=metadata, **kwargs)
|
||||||
|
elif search_type == "mmr":
|
||||||
|
return self.max_marginal_relevance_search(
|
||||||
|
query, k=k, metadata=metadata, **kwargs
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
f"search_type of {search_type} not allowed. Expected "
|
||||||
|
"search_type to be 'similarity' or 'mmr'."
|
||||||
|
)
|
||||||
|
|
||||||
|
async def asearch(
    self,
    query: str,
    search_type: str,
    metadata: Optional[Dict[str, Any]] = None,
    k: int = 3,
    **kwargs: Any,
) -> List[Document]:
    """Asynchronously return docs most similar to ``query`` using the
    given search type.

    Args:
        query: Input text to search for.
        search_type: Either ``"similarity"`` or ``"mmr"``.
        metadata: Optional metadata filter.
        k: Number of documents to return. Defaults to 3.

    Raises:
        ValueError: If ``search_type`` is not one of the two allowed values.
    """
    if search_type == "similarity":
        return await self.asimilarity_search(
            query, k=k, metadata=metadata, **kwargs
        )
    if search_type == "mmr":
        return await self.amax_marginal_relevance_search(
            query, k=k, metadata=metadata, **kwargs
        )
    raise ValueError(
        f"search_type of {search_type} not allowed. Expected "
        "search_type to be 'similarity' or 'mmr'."
    )
|
||||||
|
|
||||||
|
def similarity_search(
    self,
    query: str,
    k: int = 4,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return the ``k`` docs most similar to ``query``.

    Delegates to the scored search and drops the scores.
    """
    scored = self._similarity_search_with_relevance_scores(
        query, k=k, metadata=metadata, **kwargs
    )
    return [document for document, _score in scored]
|
||||||
|
|
||||||
|
def similarity_search_with_score(
    self,
    query: str,
    k: int = 4,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Run similarity search, returning docs paired with their scores.

    Thin public wrapper around the internal scored search.
    """
    return self._similarity_search_with_relevance_scores(
        query, k=k, metadata=metadata, **kwargs
    )
|
||||||
|
|
||||||
|
def _similarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Default similarity search with relevance scores. Modify if necessary
    in subclass.
    Return docs and relevance scores in the range [0, 1].

    0 is dissimilar, 1 is most similar.

    Args:
        query: input text
        k: Number of Documents to return. Defaults to 4.
        metadata: Optional, metadata filter
        **kwargs: kwargs to be passed to similarity search. Should include:
            score_threshold: Optional, a floating point value between 0 to 1 and
                filter the resulting set of retrieved docs

    Returns:
        List of Tuples of (doc, similarity_score)
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )

    # Embed locally only when the server does not auto-embed and a local
    # embedding function is available; otherwise pass raw text and let the
    # Zep server do the embedding.
    if self._embedding and not self._collection.is_auto_embedded:
        query_vector = self._embedding.embed_query(query)
        found = self._collection.search(
            embedding=query_vector, limit=k, metadata=metadata, **kwargs
        )
    else:
        found = self._collection.search(
            query, limit=k, metadata=metadata, **kwargs
        )

    # A missing score is reported as 0.0 (least similar).
    return [
        (
            Document(page_content=hit.content, metadata=hit.metadata),
            hit.score or 0.0,
        )
        for hit in found
    ]
|
||||||
|
|
||||||
|
async def asimilarity_search_with_relevance_scores(
    self,
    query: str,
    k: int = 4,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Tuple[Document, float]]:
    """Asynchronously return docs most similar to ``query`` together with
    relevance scores in [0, 1] (0 dissimilar, 1 most similar).

    Args:
        query: input text
        k: Number of Documents to return. Defaults to 4.
        metadata: Optional, metadata filter

    Returns:
        List of Tuples of (doc, similarity_score)
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )

    # Embed locally only when the server does not auto-embed and a local
    # embedding function is available; otherwise pass raw text and let the
    # Zep server do the embedding.
    if self._embedding and not self._collection.is_auto_embedded:
        query_vector = self._embedding.embed_query(query)
        found = await self._collection.asearch(
            embedding=query_vector, limit=k, metadata=metadata, **kwargs
        )
    else:
        found = await self._collection.asearch(
            query, limit=k, metadata=metadata, **kwargs
        )

    # A missing score is reported as 0.0 (least similar).
    return [
        (
            Document(page_content=hit.content, metadata=hit.metadata),
            hit.score or 0.0,
        )
        for hit in found
    ]
|
||||||
|
|
||||||
|
async def asimilarity_search(
    self,
    query: str,
    k: int = 4,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Asynchronously return the ``k`` docs most similar to ``query``.

    Delegates to the scored async search and drops the scores.
    """
    scored = await self.asimilarity_search_with_relevance_scores(
        query, k, metadata=metadata, **kwargs
    )
    return [document for document, _score in scored]
|
||||||
|
|
||||||
|
def similarity_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs most similar to embedding vector.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        metadata: Optional, metadata filter

    Returns:
        List of Documents most similar to the query vector.
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )

    hits = self._collection.search(
        embedding=embedding, limit=k, metadata=metadata, **kwargs
    )
    return [
        Document(page_content=hit.content, metadata=hit.metadata) for hit in hits
    ]
|
||||||
|
|
||||||
|
async def asimilarity_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Asynchronously return docs most similar to embedding vector.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        metadata: Optional, metadata filter

    Returns:
        List of Documents most similar to the query vector.

    Raises:
        ValueError: If no collection is attached to this store.
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )

    # Fix: use the collection's async search and await it. The original
    # called the synchronous `search()` from this async method, which
    # blocks the event loop and is inconsistent with the other async
    # methods here (e.g. asimilarity_search_with_relevance_scores).
    results = await self._collection.asearch(
        embedding=embedding, limit=k, metadata=metadata, **kwargs
    )

    return [
        Document(
            page_content=doc.content,
            metadata=doc.metadata,
        )
        for doc in results
    ]
|
||||||
|
|
||||||
|
def _max_marginal_relevance_selection(
    self,
    query_vector: List[float],
    results: List["ZepDocument"],
    *,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[Document]:
    """Re-rank ``results`` with maximal marginal relevance and convert the
    selected Zep documents into langchain Documents.

    Args:
        query_vector: Embedding of the query.
        results: Candidate Zep documents (each carrying its embedding).
        lambda_mult: Diversity/relevance trade-off (0 = max diversity).
        k: Number of documents to select.
    """
    chosen = maximal_marginal_relevance(
        np.array([query_vector], dtype=np.float32),
        [candidate.embedding for candidate in results],
        lambda_mult=lambda_mult,
        k=k,
    )
    return [
        Document(page_content=results[i].content, metadata=results[i].metadata)
        for i in chosen
    ]
|
||||||
|
|
||||||
|
def max_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        metadata: Optional, metadata to filter the resulting set of retrieved docs
    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )

    if not self._collection.is_auto_embedded and self._embedding:
        query_vector = self._embedding.embed_query(query)
        # Fix: fetch `fetch_k` candidates (not `k`). Previously `fetch_k`
        # was accepted but ignored, so MMR re-ranked a pool of exactly k
        # documents and could not diversify the result set.
        results = self._collection.search(
            embedding=query_vector, limit=fetch_k, metadata=metadata, **kwargs
        )
    else:
        # Server-side embedding: also ask for the query vector back so the
        # MMR selection can compare candidates against it.
        results, query_vector = self._collection.search_return_query_vector(
            query, limit=fetch_k, metadata=metadata, **kwargs
        )

    return self._max_marginal_relevance_selection(
        query_vector, results, k=k, lambda_mult=lambda_mult
    )
|
||||||
|
|
||||||
|
async def amax_marginal_relevance_search(
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Asynchronously return docs selected using the maximal marginal
    relevance.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Diversity/relevance trade-off in [0, 1]; 0 is maximum
            diversity, 1 is minimum diversity. Defaults to 0.5.
        metadata: Optional, metadata to filter the resulting set of retrieved docs

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )

    if not self._collection.is_auto_embedded and self._embedding:
        query_vector = self._embedding.embed_query(query)
        # Fix: fetch `fetch_k` candidates (not `k`). Previously `fetch_k`
        # was accepted but ignored, so MMR re-ranked a pool of exactly k
        # documents and could not diversify the result set.
        results = await self._collection.asearch(
            embedding=query_vector, limit=fetch_k, metadata=metadata, **kwargs
        )
    else:
        # Server-side embedding: also ask for the query vector back so the
        # MMR selection can compare candidates against it.
        results, query_vector = await self._collection.asearch_return_query_vector(
            query, limit=fetch_k, metadata=metadata, **kwargs
        )

    return self._max_marginal_relevance_selection(
        query_vector, results, k=k, lambda_mult=lambda_mult
    )
|
||||||
|
|
||||||
|
def max_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND diversity
    among selected documents.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        metadata: Optional, metadata to filter the resulting set of retrieved docs
    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )

    # Fix: fetch `fetch_k` candidates (not `k`). Previously `fetch_k` was
    # accepted but ignored, so MMR re-ranked a pool of exactly k documents
    # and could not diversify the result set.
    results = self._collection.search(
        embedding=embedding, limit=fetch_k, metadata=metadata, **kwargs
    )

    return self._max_marginal_relevance_selection(
        embedding, results, k=k, lambda_mult=lambda_mult
    )
|
||||||
|
|
||||||
|
async def amax_marginal_relevance_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Asynchronously return docs selected using the maximal marginal
    relevance.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Diversity/relevance trade-off in [0, 1]; 0 is maximum
            diversity, 1 is minimum diversity. Defaults to 0.5.
        metadata: Optional, metadata to filter the resulting set of retrieved docs

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    if not self._collection:
        raise ValueError(
            "collection should be an instance of a Zep DocumentCollection"
        )

    # Fix: fetch `fetch_k` candidates (not `k`). Previously `fetch_k` was
    # accepted but ignored, so MMR re-ranked a pool of exactly k documents
    # and could not diversify the result set.
    results = await self._collection.asearch(
        embedding=embedding, limit=fetch_k, metadata=metadata, **kwargs
    )

    return self._max_marginal_relevance_selection(
        embedding, results, k=k, lambda_mult=lambda_mult
    )
|
||||||
|
|
||||||
|
@classmethod
def from_texts(
    cls,
    texts: List[str],
    embedding: Optional[Embeddings] = None,
    metadatas: Optional[List[dict]] = None,
    collection_name: str = "",
    api_url: str = "",
    api_key: Optional[str] = None,
    config: Optional[CollectionConfig] = None,
    **kwargs: Any,
) -> ZepVectorStore:
    """Build a ZepVectorStore from texts; the collection is created when it
    does not already exist.

    Args:
        texts (List[str]): The list of texts to add to the vectorstore.
        embedding (Optional[Embeddings]): Optional embedding function to use to
            embed the texts.
        metadatas (Optional[List[Dict[str, Any]]]): Optional list of metadata
            associated with the texts.
        collection_name (str): The name of the collection in the Zep store.
        api_url (str): The URL of the Zep API.
        api_key (Optional[str]): The API key for the Zep API.
        config (Optional[CollectionConfig]): The configuration for the collection.
        **kwargs: Additional parameters specific to the vectorstore.

    Returns:
        ZepVectorStore: An instance of ZepVectorStore.
    """
    store = cls(
        collection_name,
        api_url,
        api_key=api_key,
        config=config,
        embedding=embedding,
    )
    store.add_texts(texts, metadatas)
    return store
|
||||||
|
|
||||||
|
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
    """Delete vectors from the collection by their Zep UUIDs.

    Parameters
    ----------
    ids : Optional[List[str]]
        The UUIDs of the vectors to delete.

    Raises
    ------
    ValueError
        If no UUIDs are provided or no collection is configured.
    """
    # `not ids` covers both None and an empty list.
    if not ids:
        raise ValueError("No uuids provided to delete.")
    if self._collection is None:
        raise ValueError("No collection name provided.")

    for doc_uuid in ids:
        self._collection.delete_document(doc_uuid)
|
@ -0,0 +1,238 @@
|
|||||||
|
# mypy: disable-error-code=attr-defined
|
||||||
|
import copy
|
||||||
|
from random import random
|
||||||
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pytest_mock import MockerFixture
|
||||||
|
|
||||||
|
from langchain.schema import Document
|
||||||
|
from langchain.vectorstores import ZepVectorStore
|
||||||
|
from langchain.vectorstores.zep import CollectionConfig
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from zep_python.document import Document as ZepDocument
|
||||||
|
from zep_python.document import DocumentCollection
|
||||||
|
|
||||||
|
VECTOR_DIMS = 5
|
||||||
|
|
||||||
|
|
||||||
|
def gen_vector() -> List[float]:
    """Return a random vector with VECTOR_DIMS components in [0, 1)."""
    return [random() for _component in range(VECTOR_DIMS)]
|
||||||
|
|
||||||
|
|
||||||
|
def gen_mock_zep_document(
    collection_name: str,
    embedding_dimensions: Optional[int] = None,
) -> "ZepDocument":
    """Build a mock ZepDocument; a random embedding is attached only when
    ``embedding_dimensions`` is truthy."""
    from zep_python.document import Document as ZepDocument

    if embedding_dimensions:
        embedding = [random() for _ in range(embedding_dimensions)]
    else:
        embedding = None

    return ZepDocument(
        uuid=str(uuid4()),
        collection_name=collection_name,
        content="Test Document",
        embedding=embedding,
        metadata={"key": "value"},
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def texts_metadatas() -> Dict[str, Any]:
    """Two identical texts with matching metadata, shaped as kwargs for
    ``add_texts`` / ``from_texts``."""
    texts = ["Test Document" for _ in range(2)]
    # Distinct dict instances per text, so mutation in one cannot leak.
    metadatas = [{"key": "value"} for _ in range(2)]
    return {"texts": texts, "metadatas": metadatas}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_documents() -> List[Document]:
    """Two langchain Documents mirroring the mock texts and metadata."""
    return [
        Document(page_content="Test Document", metadata={"key": "value"})
        for _ in range(2)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def texts_metadatas_as_zep_documents() -> List["ZepDocument"]:
    """The mock texts/metadata expressed as Zep documents, for asserting
    what the store passes to the collection."""
    from zep_python.document import Document as ZepDocument

    return [
        ZepDocument(content="Test Document", metadata={"key": "value"})
        for _ in range(2)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def search_results() -> List["ZepDocument"]:
    """Two mock search hits, each with a VECTOR_DIMS-sized embedding."""
    return [
        gen_mock_zep_document(
            collection_name="test_collection", embedding_dimensions=VECTOR_DIMS
        )
        for _hit in range(2)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def search_results_with_query_embedding() -> Tuple[List["ZepDocument"], List[float]]:
    """Two mock search hits plus a random query embedding vector."""
    hits = [
        gen_mock_zep_document(
            collection_name="test_collection", embedding_dimensions=VECTOR_DIMS
        )
        for _hit in range(2)
    ]
    return hits, gen_vector()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_collection_config() -> CollectionConfig:
    """Config for an auto-embedded test collection with VECTOR_DIMS-sized
    embeddings; mirrored onto the mocked collection's attributes."""
    return CollectionConfig(
        name="test_collection",
        description="Test Collection",
        metadata={"key": "value"},
        embedding_dimensions=VECTOR_DIMS,
        is_auto_embedded=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
@pytest.mark.requires("zep_python")
def mock_collection(
    mocker: MockerFixture,
    mock_collection_config: CollectionConfig,
    search_results: List[Document],
    search_results_with_query_embedding: Tuple[List[Document], List[float]],
) -> "DocumentCollection":
    """Autospecced DocumentCollection whose search methods return canned
    results and whose attributes mirror the mock collection config."""
    from zep_python.document import DocumentCollection

    collection: DocumentCollection = mocker.patch(
        "zep_python.document.collections.DocumentCollection", autospec=True
    )
    # Deep-copy the canned results so tests cannot mutate the shared fixtures.
    collection.search.return_value = copy.deepcopy(search_results)
    collection.asearch.return_value = copy.deepcopy(search_results)

    with_vector = copy.deepcopy(search_results_with_query_embedding)
    collection.search_return_query_vector.return_value = copy.deepcopy(with_vector)
    collection.asearch_return_query_vector.return_value = copy.deepcopy(with_vector)

    collection.name = mock_collection_config.name
    collection.is_auto_embedded = mock_collection_config.is_auto_embedded
    collection.embedding_dimensions = mock_collection_config.embedding_dimensions

    return collection
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
@pytest.mark.requires("zep_python")
def zep_vectorstore(
    mocker: MockerFixture,
    mock_collection: "DocumentCollection",
    mock_collection_config: CollectionConfig,
) -> ZepVectorStore:
    """ZepVectorStore wired to a fully mocked Zep client and collection."""
    document_client = mocker.patch(
        "zep_python.document.client.DocumentClient", autospec=True
    )
    document_client.get_collection.return_value = mock_collection
    zep_client = mocker.patch("zep_python.ZepClient", autospec=True)
    zep_client.return_value.document = document_client

    return ZepVectorStore(
        mock_collection_config.name,
        "http://localhost:8080",
        api_key="test",
        config=mock_collection_config,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("zep_python")
|
||||||
|
def test_from_texts(
|
||||||
|
zep_vectorstore: ZepVectorStore,
|
||||||
|
mock_collection_config: CollectionConfig,
|
||||||
|
mock_collection: "DocumentCollection",
|
||||||
|
texts_metadatas: Dict[str, Any],
|
||||||
|
texts_metadatas_as_zep_documents: List["ZepDocument"],
|
||||||
|
) -> None:
|
||||||
|
vs = zep_vectorstore.from_texts(
|
||||||
|
**texts_metadatas,
|
||||||
|
collection_name=mock_collection_config.name,
|
||||||
|
api_url="http://localhost:8000"
|
||||||
|
)
|
||||||
|
|
||||||
|
vs._collection.add_documents.assert_called_once_with( # type: ignore
|
||||||
|
texts_metadatas_as_zep_documents
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("zep_python")
|
||||||
|
def test_add_documents(
|
||||||
|
zep_vectorstore: ZepVectorStore,
|
||||||
|
mock_collection: "DocumentCollection",
|
||||||
|
mock_documents: List[Document],
|
||||||
|
texts_metadatas_as_zep_documents: List["ZepDocument"],
|
||||||
|
) -> None:
|
||||||
|
zep_vectorstore.add_documents(mock_documents)
|
||||||
|
|
||||||
|
mock_collection.add_documents.assert_called_once_with( # type: ignore
|
||||||
|
texts_metadatas_as_zep_documents
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("zep_python")
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_asearch_similarity(
|
||||||
|
zep_vectorstore: ZepVectorStore,
|
||||||
|
) -> None:
|
||||||
|
r = await zep_vectorstore.asearch(
|
||||||
|
query="Test Document", search_type="similarity", k=2
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(r) == 2
|
||||||
|
assert r[0].page_content == "Test Document"
|
||||||
|
assert r[0].metadata == {"key": "value"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("zep_python")
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_asearch_mmr(
|
||||||
|
zep_vectorstore: ZepVectorStore,
|
||||||
|
) -> None:
|
||||||
|
r = await zep_vectorstore.asearch(query="Test Document", search_type="mmr", k=1)
|
||||||
|
|
||||||
|
assert len(r) == 1
|
||||||
|
assert r[0].page_content == "Test Document"
|
||||||
|
assert r[0].metadata == {"key": "value"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("zep_python")
|
||||||
|
def test_search_similarity(
|
||||||
|
zep_vectorstore: ZepVectorStore,
|
||||||
|
) -> None:
|
||||||
|
r = zep_vectorstore.search(query="Test Document", search_type="similarity", k=2)
|
||||||
|
|
||||||
|
assert len(r) == 2
|
||||||
|
assert r[0].page_content == "Test Document"
|
||||||
|
assert r[0].metadata == {"key": "value"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("zep_python")
|
||||||
|
def test_search_mmr(
|
||||||
|
zep_vectorstore: ZepVectorStore,
|
||||||
|
) -> None:
|
||||||
|
r = zep_vectorstore.search(query="Test Document", search_type="mmr", k=1)
|
||||||
|
|
||||||
|
assert len(r) == 1
|
||||||
|
assert r[0].page_content == "Test Document"
|
||||||
|
assert r[0].metadata == {"key": "value"}
|
Loading…
Reference in New Issue