mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Merge branch 'master' of github.com:hwchase17/langchain
This commit is contained in:
commit
bd8d418a95
@ -16,5 +16,5 @@ Construct sequences of calls
|
||||
Let chains choose which tools to use given high-level directives
|
||||
#### [Memory](/docs/modules/memory/)
|
||||
Persist application state between runs of a chain
|
||||
#### [Callbacks](/docs/modules/callbacks/getting_started/)
|
||||
#### [Callbacks](/docs/modules/callbacks/)
|
||||
Log and stream intermediate steps of any chain
|
14
docs/docs_skeleton/ignore_build.sh
Executable file
14
docs/docs_skeleton/ignore_build.sh
Executable file
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "VERCEL_GIT_COMMIT_REF: $VERCEL_GIT_COMMIT_REF"
|
||||
|
||||
if [[ $VERCEL_GIT_COMMIT_REF = __docs__* ]] ; then
|
||||
# Proceed with the build
|
||||
echo "✅ - Build can proceed"
|
||||
exit 1;
|
||||
|
||||
else
|
||||
# Don't build
|
||||
echo "🛑 - Build cancelled"
|
||||
exit 0;
|
||||
fi
|
@ -1,7 +1,6 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
|
||||
"metadata": {},
|
||||
@ -78,7 +77,6 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "2721ba8a",
|
||||
"metadata": {},
|
||||
@ -99,6 +97,135 @@
|
||||
" recursive=False\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d6b80931",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Passing in Optional File Loaders\n",
|
||||
"\n",
|
||||
"When processing files other than Google Docs and Google Sheets, it can be helpful to pass an optional file loader to `GoogleDriveLoader`. If you pass in a file loader, that file loader will be used on documents that do not have a Google Docs or Google Sheets MIME type. Here is an example of how to load an Excel document from Google Drive using a file loader. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "94207e39",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import GoogleDriveLoader\n",
|
||||
"from langchain.document_loaders import UnstructuredFileIOLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "a15fbee0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"file_id=\"1x9WBtFPWMEAdjcJzPScRsjpjQvpSo_kz\"\n",
|
||||
"loader = GoogleDriveLoader(\n",
|
||||
" file_ids=[file_id],\n",
|
||||
" file_loader_cls=UnstructuredFileIOLoader,\n",
|
||||
" file_loader_kwargs={\"mode\": \"elements\"}\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "98410bda",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "e3e72221",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "238cd06f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also process a folder with a mix of files and Google Docs/Sheets using the following pattern:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "0e2d093f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"folder_id=\"1asMOHY1BqBS84JcRbOag5LOJac74gpmD\"\n",
|
||||
"loader = GoogleDriveLoader(\n",
|
||||
" folder_id=folder_id,\n",
|
||||
" file_loader_cls=UnstructuredFileIOLoader,\n",
|
||||
" file_loader_kwargs={\"mode\": \"elements\"}\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "b35ddcc6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "3cc141e0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e312268a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@ -117,7 +244,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.8.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -7,30 +7,50 @@
|
||||
"source": [
|
||||
"# MarkdownHeaderTextSplitter\n",
|
||||
"\n",
|
||||
"This splits a markdown file by a specified set of headers. For example, if we want to split this markdown:\n",
|
||||
"### Motivation\n",
|
||||
"\n",
|
||||
"Many chat or Q+A applications involve chunking input documents prior to embedding and vector storage.\n",
|
||||
"\n",
|
||||
"[These notes](https://www.pinecone.io/learn/chunking-strategies/) from Pinecone provide some useful tips:\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"When a full paragraph or document is embedded, the embedding process considers both the overall context and the relationships between the sentences and phrases within the text. This can result in a more comprehensive vector representation that captures the broader meaning and themes of the text.\n",
|
||||
"```\n",
|
||||
" \n",
|
||||
"As mentioned, chunking often aims to keep text with common context together.\n",
|
||||
"\n",
|
||||
"With this in mind, we might want to specifically honor the structure of the document itself.\n",
|
||||
"\n",
|
||||
"For example, a markdown file is organized by headers.\n",
|
||||
"\n",
|
||||
"Creating chunks within specific header groups is an intuitive idea.\n",
|
||||
"\n",
|
||||
"To address this challenge, we can use `MarkdownHeaderTextSplitter`.\n",
|
||||
"\n",
|
||||
"This will split a markdown file by a specified set of headers. \n",
|
||||
"\n",
|
||||
"For example, if we want to split this markdown:\n",
|
||||
"```\n",
|
||||
"md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Headers to split on:\n",
|
||||
" \n",
|
||||
"We can specify the headers to split on:\n",
|
||||
"```\n",
|
||||
"[(\"#\", \"Header 1\"),(\"##\", \"Header 2\")]\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Expected output:\n",
|
||||
"And content is grouped or split by common headers:\n",
|
||||
"```\n",
|
||||
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Optionally, this also includes `return_each_line` in case a user wants to perform other types of aggregation. \n",
|
||||
"\n",
|
||||
"If `return_each_line=True`, each line and associated header metadata are simply returned. "
|
||||
"Let's have a look at some examples below."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 4,
|
||||
"id": "19c044f0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -40,7 +60,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 5,
|
||||
"id": "2ae3649b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -64,63 +84,403 @@
|
||||
"]\n",
|
||||
"\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
||||
"splits = markdown_splitter.split_text(markdown_document)\n",
|
||||
"for split in splits:\n",
|
||||
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
|
||||
"for split in md_header_splits:\n",
|
||||
" print(split)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2a32026a",
|
||||
"id": "9bd8977a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Here's an example on a larger file with `return_each_line=True` passed, allowing each line to be examined."
|
||||
"Within each markdown group we can then apply any text splitter we want. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "8af8f9a2",
|
||||
"execution_count": 6,
|
||||
"id": "480e0e3a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'content': 'Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n",
|
||||
"{'content': 'Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n",
|
||||
"{'content': 'As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n",
|
||||
"{'content': 'additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n",
|
||||
"{'content': 'From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence', 'Header 4': 'Standardization'}}\n",
|
||||
"{'content': 'Implementations of Markdown are available for over a dozen programming languages.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Implementations'}}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"markdown_document = \"# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n",
|
||||
"\n",
|
||||
"headers_to_split_on = [\n",
|
||||
" (\"#\", \"Header 1\"),\n",
|
||||
" (\"##\", \"Header 2\"),\n",
|
||||
" (\"###\", \"Header 3\"),\n",
|
||||
" (\"####\", \"Header 4\"),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(\n",
|
||||
" headers_to_split_on=headers_to_split_on, return_each_line=True\n",
|
||||
")\n",
|
||||
"splits = markdown_splitter.split_text(markdown_document)\n",
|
||||
"for line in splits:\n",
|
||||
" print(line)"
|
||||
"# MD splits\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
||||
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
|
||||
"\n",
|
||||
"# Char-level splits\n",
|
||||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||||
"chunk_size = 10\n",
|
||||
"chunk_overlap = 0\n",
|
||||
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
|
||||
"\n",
|
||||
"# Split within each header group\n",
|
||||
"all_splits=[]\n",
|
||||
"all_metadatas=[] \n",
|
||||
"for header_group in md_header_splits:\n",
|
||||
" _splits = text_splitter.split_text(header_group['content'])\n",
|
||||
" _metadatas = [header_group['metadata'] for _ in _splits]\n",
|
||||
" all_splits += _splits\n",
|
||||
" all_metadatas += _metadatas"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "3f5d775e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Markdown[9'"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_splits[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "33ab0d5c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'Header 1': 'Intro', 'Header 2': 'History'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_metadatas[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "dcf70760",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Use case\n",
|
||||
"\n",
|
||||
"Let's apply `MarkdownHeaderTextSplitter` to a Notion page [here](https://rlancemartin.notion.site/Auto-Evaluation-of-Metadata-Filtering-18502448c85240828f33716740f9574b?pvs=4) as a test.\n",
|
||||
"\n",
|
||||
"The page is downloaded as markdown and stored locally as shown [here](https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/notion)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "73313d6c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load Notion database as a markdown file\n",
|
||||
"from langchain.document_loaders import NotionDirectoryLoader\n",
|
||||
"loader = NotionDirectoryLoader(\"../Notion_DB_Metadata\")\n",
|
||||
"docs = loader.load()\n",
|
||||
"md_file=docs[0].page_content"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "6fa341d7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'content': 'We previously introduced [auto-evaluator](https://blog.langchain.dev/auto-evaluator-opportunities/), an open-source tool for grading LLM question-answer chains. Here, we extend auto-evaluator with a [lightweight Streamlit app](https://github.com/langchain-ai/auto-evaluator/tree/main/streamlit) that can connect to any existing Pinecone index. We add the ability to test metadata filtering using `SelfQueryRetriever` as well as some other approaches that we’ve found to be useful, as discussed below. \\n[ret_trim.mov](Auto-Evaluation%20of%20Metadata%20Filtering%2018502448c85240828f33716740f9574b/ret_trim.mov)',\n",
|
||||
" 'metadata': {'Section': 'Evaluation'}}"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Let's create groups based on the section headers\n",
|
||||
"headers_to_split_on = [\n",
|
||||
" (\"###\", \"Section\"),\n",
|
||||
"]\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
||||
"md_header_splits = markdown_splitter.split_text(md_file)\n",
|
||||
"md_header_splits[3]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "42d8bb9b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now, we split the text in each group and keep the group as metadata."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"id": "a9831de2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define our text splitter\n",
|
||||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||||
"chunk_size = 500\n",
|
||||
"chunk_overlap = 50\n",
|
||||
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
|
||||
" \n",
|
||||
"# Create splits within each header group\n",
|
||||
"all_splits=[]\n",
|
||||
"all_metadatas=[]\n",
|
||||
"for header_group in md_header_splits:\n",
|
||||
" _splits = text_splitter.split_text(header_group['content'])\n",
|
||||
" _metadatas = [header_group['metadata'] for _ in _splits]\n",
|
||||
" all_splits += _splits\n",
|
||||
" all_metadatas += _metadatas"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"id": "b5691ee5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'In these cases, semantic search will look for the concept `episode 53` in the chunks, but instead we simply want to filter the chunks for `episode 53` and then perform semantic search to extract those that best summarize the episode. Metadata filtering does this, so long as we 1) we have a metadata filter for episode number and 2) we can extract the value from the query (e.g., `54` or `252`) that we want to extract. The LangChain `SelfQueryRetriever` does the latter (see'"
|
||||
]
|
||||
},
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_splits[6]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"id": "e1dfb405",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'Section': 'Motivation'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_metadatas[6]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "79868606",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This sets us up well to perform metadata filtering based on the document structure.\n",
|
||||
"\n",
|
||||
"Let's bring this all together by building a vectorstore first."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "987183f2",
|
||||
"id": "143d7347",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"! pip install chromadb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"id": "cbcb917a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Build vectorstore\n",
|
||||
"from langchain.vectorstores import Chroma\n",
|
||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||
"embeddings = OpenAIEmbeddings()\n",
|
||||
"vectorstore = Chroma.from_texts(texts=all_splits,metadatas=all_metadatas,embedding=OpenAIEmbeddings())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3f6031fc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's create a `SelfQueryRetriever` that can filter based upon metadata we defined."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"id": "5b1b6a75",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create retriever \n",
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
|
||||
"from langchain.chains.query_constructor.base import AttributeInfo\n",
|
||||
"\n",
|
||||
"# Define our metadata\n",
|
||||
"metadata_field_info = [\n",
|
||||
" AttributeInfo(\n",
|
||||
" name=\"Section\",\n",
|
||||
" description=\"Headers of the markdown document that organize the ideas\",\n",
|
||||
" type=\"string or list[string]\",\n",
|
||||
" ),\n",
|
||||
"]\n",
|
||||
"document_content_description = \"Headers of the markdown document\"\n",
|
||||
"\n",
|
||||
"# Define self query retriever\n",
|
||||
"llm = OpenAI(temperature=0)\n",
|
||||
"sq_retriever = SelfQueryRetriever.from_llm(llm, vectorstore, document_content_description, metadata_field_info, verbose=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9d0dbed8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we can fetch chunks specifically from any section of the doc!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "6c37fe1b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"query='Introduction' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='Section', value='Introduction') limit=None\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='![Untitled](Auto-Evaluation%20of%20Metadata%20Filtering%2018502448c85240828f33716740f9574b/Untitled.png)', metadata={'Section': 'Introduction'}),\n",
|
||||
" Document(page_content='Q+A systems often use a two-step approach: retrieve relevant text chunks and then synthesize them into an answer. There many ways to approach this. For example, we recently [discussed](https://blog.langchain.dev/auto-evaluation-of-anthropic-100k-context-window/) the Retriever-Less option (at bottom in the below diagram), highlighting the Anthropic 100k context window model. Metadata filtering is an alternative approach that pre-filters chunks based on a user-defined criteria in a VectorDB using', metadata={'Section': 'Introduction'}),\n",
|
||||
" Document(page_content='on a user-defined criteria in a VectorDB using metadata tags prior to semantic search.', metadata={'Section': 'Introduction'})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Test\n",
|
||||
"question=\"Summarize the Introduction section of the document\"\n",
|
||||
"sq_retriever.get_relevant_documents(question)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bb0efebd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now, we can create chat or Q+A apps that are aware of the explicit document structure. \n",
|
||||
"\n",
|
||||
"Of course, semantic search without specific metadata filtering would probably work reasonably well for this simple document.\n",
|
||||
"\n",
|
||||
"But, the ability to retain document structure for metadata filtering can be helpful for more complicated or longer documents."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "3b40e24e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"query='Introduction' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='Section', value='Introduction') limit=None\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The document discusses different approaches to retrieve relevant text chunks and synthesize them into an answer in Q+A systems. One of the approaches is metadata filtering, which pre-filters chunks based on user-defined criteria in a VectorDB using metadata tags prior to semantic search. The Retriever-Less option, which uses the Anthropic 100k context window model, is also mentioned as an alternative approach.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n",
|
||||
"qa_chain = RetrievalQA.from_chain_type(llm,retriever=sq_retriever)\n",
|
||||
"qa_chain.run(question)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "dfeeb327",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"query='Testing' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='Section', value='Testing') limit=None\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The Testing section of the document describes how the performance of the SelfQueryRetriever was evaluated using various test cases. The tests were designed to evaluate the ability of the SelfQueryRetriever to correctly infer metadata filters from the query using metadata_field_info. The results of the tests showed that the SelfQueryRetriever performed well in some cases, but failed in others. The document also provides a link to the code for the auto-evaluator and instructions on how to use it. Additionally, the document mentions the use of the Kor library for structured data extraction to explicitly specify transformations that the auto-evaluator can use.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 50,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"question=\"Summarize the Testing section of the document\"\n",
|
||||
"qa_chain.run(question)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -129,11 +129,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "db3fa309",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
@ -144,11 +140,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c160d5bb",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
@ -158,11 +150,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "96215c90",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docsearch = OpenSearchVectorSearch.from_documents(\n",
|
||||
@ -183,11 +171,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "62a7cea0",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
@ -207,11 +191,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0a8e3c0e",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docsearch = OpenSearchVectorSearch.from_documents(\n",
|
||||
@ -230,11 +210,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "92bc40db",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
@ -254,11 +230,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6d9f436e",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docsearch = OpenSearchVectorSearch.from_documents(\n",
|
||||
@ -278,16 +250,34 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8ca50bce",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Maximum marginal relevance search (MMR)\n",
|
||||
"If you’d like to look up similar documents, but you’d also like to receive diverse results, MMR is the method you should consider. Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents."
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10, lambda_param=0.5)"
|
||||
],
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "73264864",
|
||||
|
@ -220,7 +220,11 @@ class ArgillaCallbackHandler(BaseCallbackHandler):
|
||||
def on_chain_start(
|
||||
self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any
|
||||
) -> None:
|
||||
"""Do nothing when LLM chain starts."""
|
||||
"""If the key `input` is in `inputs`, then save it in `self.prompts` using
|
||||
either the `parent_run_id` or the `run_id` as the key. This is done so that
|
||||
we don't log the same input prompt twice, once when the LLM starts and once
|
||||
when the chain starts.
|
||||
"""
|
||||
if "input" in inputs:
|
||||
self.prompts.update(
|
||||
{
|
||||
@ -233,44 +237,55 @@ class ArgillaCallbackHandler(BaseCallbackHandler):
|
||||
)
|
||||
|
||||
def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None:
|
||||
"""Do nothing when LLM chain ends."""
|
||||
prompts = self.prompts[str(kwargs["parent_run_id"] or kwargs["run_id"])]
|
||||
if "outputs" in outputs:
|
||||
# Creates the records and adds them to the `FeedbackDataset`
|
||||
self.dataset.add_records(
|
||||
records=[
|
||||
{
|
||||
"fields": {
|
||||
"prompt": prompt,
|
||||
"response": output["text"].strip(),
|
||||
},
|
||||
}
|
||||
for prompt, output in zip(prompts, outputs["outputs"])
|
||||
]
|
||||
)
|
||||
elif "output" in outputs:
|
||||
# Creates the records and adds them to the `FeedbackDataset`
|
||||
self.dataset.add_records(
|
||||
records=[
|
||||
{
|
||||
"fields": {
|
||||
"prompt": " ".join(prompts),
|
||||
"response": outputs["output"].strip(),
|
||||
},
|
||||
}
|
||||
]
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"The `outputs` dictionary did not contain the expected keys `outputs` "
|
||||
"or `output`."
|
||||
)
|
||||
"""If either the `parent_run_id` or the `run_id` is in `self.prompts`, then
|
||||
log the outputs to Argilla, and pop the run from `self.prompts`. The behavior
|
||||
differs if the output is a list or not.
|
||||
"""
|
||||
if not any(
|
||||
key in self.prompts
|
||||
for key in [str(kwargs["parent_run_id"]), str(kwargs["run_id"])]
|
||||
):
|
||||
return
|
||||
prompts = self.prompts.get(str(kwargs["parent_run_id"])) or self.prompts.get(
|
||||
str(kwargs["run_id"])
|
||||
)
|
||||
for chain_output_key, chain_output_val in outputs.items():
|
||||
if isinstance(chain_output_val, list):
|
||||
# Creates the records and adds them to the `FeedbackDataset`
|
||||
self.dataset.add_records(
|
||||
records=[
|
||||
{
|
||||
"fields": {
|
||||
"prompt": prompt,
|
||||
"response": output["text"].strip(),
|
||||
},
|
||||
}
|
||||
for prompt, output in zip(
|
||||
prompts, chain_output_val # type: ignore
|
||||
)
|
||||
]
|
||||
)
|
||||
else:
|
||||
# Creates the records and adds them to the `FeedbackDataset`
|
||||
self.dataset.add_records(
|
||||
records=[
|
||||
{
|
||||
"fields": {
|
||||
"prompt": " ".join(prompts), # type: ignore
|
||||
"response": chain_output_val.strip(),
|
||||
},
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
# Push the records to Argilla
|
||||
self.dataset.push_to_argilla()
|
||||
|
||||
# Pop current run from `self.prompts`
|
||||
self.prompts.pop(str(kwargs["parent_run_id"] or kwargs["run_id"]))
|
||||
if str(kwargs["parent_run_id"]) in self.prompts:
|
||||
self.prompts.pop(str(kwargs["parent_run_id"]))
|
||||
if str(kwargs["run_id"]) in self.prompts:
|
||||
self.prompts.pop(str(kwargs["run_id"]))
|
||||
|
||||
def on_chain_error(
|
||||
self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any
|
||||
|
@ -163,16 +163,18 @@ class MapReduceDocumentsChain(BaseCombineDocumentsChain):
|
||||
[{**{self.document_variable_name: d.page_content}, **kwargs} for d in docs],
|
||||
callbacks=callbacks,
|
||||
)
|
||||
return self._process_results(results, docs, callbacks=callbacks, **kwargs)
|
||||
return await self._aprocess_results(
|
||||
results, docs, callbacks=callbacks, **kwargs
|
||||
)
|
||||
|
||||
def _process_results(
|
||||
def _process_results_common(
|
||||
self,
|
||||
results: List[Dict],
|
||||
docs: List[Document],
|
||||
token_max: int = 3000,
|
||||
callbacks: Callbacks = None,
|
||||
**kwargs: Any,
|
||||
) -> Tuple[str, dict]:
|
||||
) -> Tuple[List[Document], dict]:
|
||||
question_result_key = self.llm_chain.output_key
|
||||
result_docs = [
|
||||
Document(page_content=r[question_result_key], metadata=docs[i].metadata)
|
||||
@ -201,11 +203,39 @@ class MapReduceDocumentsChain(BaseCombineDocumentsChain):
|
||||
extra_return_dict = {"intermediate_steps": _results}
|
||||
else:
|
||||
extra_return_dict = {}
|
||||
return result_docs, extra_return_dict
|
||||
|
||||
def _process_results(
|
||||
self,
|
||||
results: List[Dict],
|
||||
docs: List[Document],
|
||||
token_max: int = 3000,
|
||||
callbacks: Callbacks = None,
|
||||
**kwargs: Any,
|
||||
) -> Tuple[str, dict]:
|
||||
result_docs, extra_return_dict = self._process_results_common(
|
||||
results, docs, token_max, callbacks=callbacks, **kwargs
|
||||
)
|
||||
output = self.combine_document_chain.run(
|
||||
input_documents=result_docs, callbacks=callbacks, **kwargs
|
||||
)
|
||||
return output, extra_return_dict
|
||||
|
||||
async def _aprocess_results(
|
||||
self,
|
||||
results: List[Dict],
|
||||
docs: List[Document],
|
||||
callbacks: Callbacks = None,
|
||||
**kwargs: Any,
|
||||
) -> Tuple[str, dict]:
|
||||
result_docs, extra_return_dict = self._process_results_common(
|
||||
results, docs, callbacks=callbacks, **kwargs
|
||||
)
|
||||
output = await self.combine_document_chain.arun(
|
||||
input_documents=result_docs, callbacks=callbacks, **kwargs
|
||||
)
|
||||
return output, extra_return_dict
|
||||
|
||||
@property
|
||||
def _chain_type(self) -> str:
|
||||
return "map_reduce_documents_chain"
|
||||
|
@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
recursive: bool = False
|
||||
file_types: Optional[Sequence[str]] = None
|
||||
load_trashed_files: bool = False
|
||||
# NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
|
||||
# results in pydantic validation errors
|
||||
file_loader_cls: Any = None
|
||||
file_loader_kwargs: Dict["str", Any] = {}
|
||||
|
||||
@root_validator
|
||||
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
||||
@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
returns.append(self._load_document_from_id(file["id"])) # type: ignore
|
||||
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
|
||||
returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore
|
||||
elif file["mimeType"] == "application/pdf":
|
||||
elif (
|
||||
file["mimeType"] == "application/pdf"
|
||||
or self.file_loader_cls is not None
|
||||
):
|
||||
returns.extend(self._load_file_from_id(file["id"])) # type: ignore
|
||||
else:
|
||||
pass
|
||||
@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
done = False
|
||||
while done is False:
|
||||
status, done = downloader.next_chunk()
|
||||
content = fh.getvalue()
|
||||
|
||||
from PyPDF2 import PdfReader
|
||||
if self.file_loader_cls is not None:
|
||||
fh.seek(0)
|
||||
loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs)
|
||||
docs = loader.load()
|
||||
for doc in docs:
|
||||
doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
|
||||
return docs
|
||||
|
||||
pdf_reader = PdfReader(BytesIO(content))
|
||||
else:
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
return [
|
||||
Document(
|
||||
page_content=page.extract_text(),
|
||||
metadata={
|
||||
"source": f"https://drive.google.com/file/d/{id}/view",
|
||||
"title": f"{file.get('name')}",
|
||||
"page": i,
|
||||
},
|
||||
)
|
||||
for i, page in enumerate(pdf_reader.pages)
|
||||
]
|
||||
content = fh.getvalue()
|
||||
pdf_reader = PdfReader(BytesIO(content))
|
||||
|
||||
return [
|
||||
Document(
|
||||
page_content=page.extract_text(),
|
||||
metadata={
|
||||
"source": f"https://drive.google.com/file/d/{id}/view",
|
||||
"title": f"{file.get('name')}",
|
||||
"page": i,
|
||||
},
|
||||
)
|
||||
for i, page in enumerate(pdf_reader.pages)
|
||||
]
|
||||
|
||||
def _load_file_from_ids(self) -> List[Document]:
|
||||
"""Load files from a list of IDs."""
|
||||
|
@ -62,15 +62,17 @@ class BasePDFLoader(BaseLoader, ABC):
|
||||
)
|
||||
|
||||
self.web_path = self.file_path
|
||||
self.temp_file = tempfile.NamedTemporaryFile()
|
||||
self.temp_file.write(r.content)
|
||||
self.file_path = self.temp_file.name
|
||||
self.temp_dir = tempfile.TemporaryDirectory()
|
||||
temp_pdf = Path(self.temp_dir.name) / "tmp.pdf"
|
||||
with open(temp_pdf, mode="wb") as f:
|
||||
f.write(r.content)
|
||||
self.file_path = str(temp_pdf)
|
||||
elif not os.path.isfile(self.file_path):
|
||||
raise ValueError("File path %s is not a valid file or url" % self.file_path)
|
||||
|
||||
def __del__(self) -> None:
|
||||
if hasattr(self, "temp_file"):
|
||||
self.temp_file.close()
|
||||
if hasattr(self, "temp_dir"):
|
||||
self.temp_dir.cleanup()
|
||||
|
||||
@staticmethod
|
||||
def _is_valid_url(url: str) -> bool:
|
||||
|
@ -1,31 +1,28 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
from langchain.schema import BaseOutputParser
|
||||
|
||||
|
||||
class GuardrailsOutputParser(BaseOutputParser):
|
||||
guard: Any
|
||||
api: Optional[Callable]
|
||||
args: Any
|
||||
kwargs: Any
|
||||
|
||||
@property
|
||||
def _type(self) -> str:
|
||||
return "guardrails"
|
||||
|
||||
@classmethod
|
||||
def from_rail(cls, rail_file: str, num_reasks: int = 1) -> GuardrailsOutputParser:
|
||||
try:
|
||||
from guardrails import Guard
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"guardrails-ai package not installed. "
|
||||
"Install it by running `pip install guardrails-ai`."
|
||||
)
|
||||
return cls(guard=Guard.from_rail(rail_file, num_reasks=num_reasks))
|
||||
|
||||
@classmethod
|
||||
def from_rail_string(
|
||||
cls, rail_str: str, num_reasks: int = 1
|
||||
def from_rail(
|
||||
cls,
|
||||
rail_file: str,
|
||||
num_reasks: int = 1,
|
||||
api: Optional[Callable] = None,
|
||||
*args: Any,
|
||||
**kwargs: Any,
|
||||
) -> GuardrailsOutputParser:
|
||||
try:
|
||||
from guardrails import Guard
|
||||
@ -34,10 +31,38 @@ class GuardrailsOutputParser(BaseOutputParser):
|
||||
"guardrails-ai package not installed. "
|
||||
"Install it by running `pip install guardrails-ai`."
|
||||
)
|
||||
return cls(guard=Guard.from_rail_string(rail_str, num_reasks=num_reasks))
|
||||
return cls(
|
||||
guard=Guard.from_rail(rail_file, num_reasks=num_reasks),
|
||||
api=api,
|
||||
args=args,
|
||||
kwargs=kwargs,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_rail_string(
|
||||
cls,
|
||||
rail_str: str,
|
||||
num_reasks: int = 1,
|
||||
api: Optional[Callable] = None,
|
||||
*args: Any,
|
||||
**kwargs: Any,
|
||||
) -> GuardrailsOutputParser:
|
||||
try:
|
||||
from guardrails import Guard
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"guardrails-ai package not installed. "
|
||||
"Install it by running `pip install guardrails-ai`."
|
||||
)
|
||||
return cls(
|
||||
guard=Guard.from_rail_string(rail_str, num_reasks=num_reasks),
|
||||
api=api,
|
||||
args=args,
|
||||
kwargs=kwargs,
|
||||
)
|
||||
|
||||
def get_format_instructions(self) -> str:
|
||||
return self.guard.raw_prompt.format_instructions
|
||||
|
||||
def parse(self, text: str) -> Dict:
|
||||
return self.guard.parse(text)
|
||||
return self.guard.parse(text, llm_api=self.api, *self.args, **self.kwargs)
|
||||
|
@ -4,10 +4,13 @@ from __future__ import annotations
|
||||
import uuid
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
import numpy as np
|
||||
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.schema import Document
|
||||
from langchain.utils import get_from_dict_or_env
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
from langchain.vectorstores.utils import maximal_marginal_relevance
|
||||
|
||||
IMPORT_OPENSEARCH_PY_ERROR = (
|
||||
"Could not import OpenSearch. Please install it with `pip install opensearch-py`."
|
||||
@ -76,9 +79,12 @@ def _bulk_ingest_embeddings(
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
vector_field: str = "vector_field",
|
||||
text_field: str = "text",
|
||||
mapping: Dict = {},
|
||||
mapping: Optional[Dict] = None,
|
||||
) -> List[str]:
|
||||
"""Bulk Ingest Embeddings into given index."""
|
||||
if not mapping:
|
||||
mapping = dict()
|
||||
|
||||
bulk = _import_bulk()
|
||||
not_found_error = _import_not_found_error()
|
||||
requests = []
|
||||
@ -201,10 +207,14 @@ def _approximate_search_query_with_lucene_filter(
|
||||
def _default_script_query(
|
||||
query_vector: List[float],
|
||||
space_type: str = "l2",
|
||||
pre_filter: Dict = MATCH_ALL_QUERY,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
vector_field: str = "vector_field",
|
||||
) -> Dict:
|
||||
"""For Script Scoring Search, this is the default query."""
|
||||
|
||||
if not pre_filter:
|
||||
pre_filter = MATCH_ALL_QUERY
|
||||
|
||||
return {
|
||||
"query": {
|
||||
"script_score": {
|
||||
@ -245,10 +255,14 @@ def __get_painless_scripting_source(
|
||||
def _default_painless_scripting_query(
|
||||
query_vector: List[float],
|
||||
space_type: str = "l2Squared",
|
||||
pre_filter: Dict = MATCH_ALL_QUERY,
|
||||
pre_filter: Optional[Dict] = None,
|
||||
vector_field: str = "vector_field",
|
||||
) -> Dict:
|
||||
"""For Painless Scripting Search, this is the default query."""
|
||||
|
||||
if not pre_filter:
|
||||
pre_filter = MATCH_ALL_QUERY
|
||||
|
||||
source = __get_painless_scripting_source(space_type, query_vector)
|
||||
return {
|
||||
"query": {
|
||||
@ -355,7 +369,7 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to query.
|
||||
|
||||
By default supports Approximate Search.
|
||||
By default, supports Approximate Search.
|
||||
Also supports Script Scoring and Painless Scripting.
|
||||
|
||||
Args:
|
||||
@ -413,7 +427,7 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs and it's scores most similar to query.
|
||||
|
||||
By default supports Approximate Search.
|
||||
By default, supports Approximate Search.
|
||||
Also supports Script Scoring and Painless Scripting.
|
||||
|
||||
Args:
|
||||
@ -426,10 +440,47 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
Optional Args:
|
||||
same as `similarity_search`
|
||||
"""
|
||||
embedding = self.embedding_function.embed_query(query)
|
||||
search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search")
|
||||
|
||||
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
||||
metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata")
|
||||
|
||||
hits = self._raw_similarity_search_with_score(query=query, k=k, **kwargs)
|
||||
|
||||
documents_with_scores = [
|
||||
(
|
||||
Document(
|
||||
page_content=hit["_source"][text_field],
|
||||
metadata=hit["_source"]
|
||||
if metadata_field == "*" or metadata_field not in hit["_source"]
|
||||
else hit["_source"][metadata_field],
|
||||
),
|
||||
hit["_score"],
|
||||
)
|
||||
for hit in hits
|
||||
]
|
||||
return documents_with_scores
|
||||
|
||||
def _raw_similarity_search_with_score(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
) -> List[dict]:
|
||||
"""Return raw opensearch documents (dict) including vectors,
|
||||
scores most similar to query.
|
||||
|
||||
By default, supports Approximate Search.
|
||||
Also supports Script Scoring and Painless Scripting.
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
|
||||
Returns:
|
||||
List of dicts, with their scores, most similar to the query.
|
||||
|
||||
Optional Args:
|
||||
same as `similarity_search`
|
||||
"""
|
||||
embedding = self.embedding_function.embed_query(query)
|
||||
search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search")
|
||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||
|
||||
if search_type == "approximate_search":
|
||||
@ -473,20 +524,59 @@ class OpenSearchVectorSearch(VectorStore):
|
||||
raise ValueError("Invalid `search_type` provided as an argument")
|
||||
|
||||
response = self.client.search(index=self.index_name, body=search_query)
|
||||
hits = [hit for hit in response["hits"]["hits"][:k]]
|
||||
documents_with_scores = [
|
||||
(
|
||||
Document(
|
||||
page_content=hit["_source"][text_field],
|
||||
metadata=hit["_source"]
|
||||
if metadata_field == "*" or metadata_field not in hit["_source"]
|
||||
else hit["_source"][metadata_field],
|
||||
),
|
||||
hit["_score"],
|
||||
|
||||
return [hit for hit in response["hits"]["hits"][:k]]
|
||||
|
||||
def max_marginal_relevance_search(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
fetch_k: int = 20,
|
||||
lambda_mult: float = 0.5,
|
||||
**kwargs: Any,
|
||||
) -> list[Document]:
|
||||
"""Return docs selected using the maximal marginal relevance.
|
||||
|
||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||
among selected documents.
|
||||
|
||||
Args:
|
||||
query: Text to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
||||
Defaults to 20.
|
||||
lambda_mult: Number between 0 and 1 that determines the degree
|
||||
of diversity among the results with 0 corresponding
|
||||
to maximum diversity and 1 to minimum diversity.
|
||||
Defaults to 0.5.
|
||||
Returns:
|
||||
List of Documents selected by maximal marginal relevance.
|
||||
"""
|
||||
|
||||
vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field")
|
||||
text_field = _get_kwargs_value(kwargs, "text_field", "text")
|
||||
metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata")
|
||||
|
||||
# Get embedding of the user query
|
||||
embedding = self.embedding_function.embed_query(query)
|
||||
|
||||
# Do ANN/KNN search to get top fetch_k results where fetch_k >= k
|
||||
results = self._raw_similarity_search_with_score(query, fetch_k, **kwargs)
|
||||
|
||||
embeddings = [result["_source"][vector_field] for result in results]
|
||||
|
||||
# Rerank top k results using MMR, (mmr_selected is a list of indices)
|
||||
mmr_selected = maximal_marginal_relevance(
|
||||
np.array(embedding), embeddings, k=k, lambda_mult=lambda_mult
|
||||
)
|
||||
|
||||
return [
|
||||
Document(
|
||||
page_content=results[i]["_source"][text_field],
|
||||
metadata=results[i]["_source"][metadata_field],
|
||||
)
|
||||
for hit in hits
|
||||
for i in mmr_selected
|
||||
]
|
||||
return documents_with_scores
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
|
@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "langchain"
|
||||
version = "0.0.202"
|
||||
version = "0.0.203"
|
||||
description = "Building applications with LLMs through composability"
|
||||
authors = []
|
||||
license = "MIT"
|
||||
|
Loading…
Reference in New Issue
Block a user