From 2eea5d4cb4eed4ff0e194713abd730b219ef6f51 Mon Sep 17 00:00:00 2001 From: Davis Chase <130488702+dev2049@users.noreply.github.com> Date: Sat, 17 Jun 2023 11:17:08 -0700 Subject: [PATCH 01/11] Add ignore vercel preview script (#6320) skip building preview of docs for anything branch that doesn't start with `__docs__`. will eventually update to look at code diff directories but patching for now --- docs/docs_skeleton/ignore_build.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100755 docs/docs_skeleton/ignore_build.sh diff --git a/docs/docs_skeleton/ignore_build.sh b/docs/docs_skeleton/ignore_build.sh new file mode 100755 index 0000000000..114e45604c --- /dev/null +++ b/docs/docs_skeleton/ignore_build.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +echo "VERCEL_GIT_COMMIT_REF: $VERCEL_GIT_COMMIT_REF" + +if [[ $VERCEL_GIT_COMMIT_REF = __docs__* ]] ; then + # Proceed with the build + echo "✅ - Build can proceed" + exit 1; + +else + # Don't build + echo "🛑 - Build cancelled" + exit 0; +fi From a2bbe3dda4f03d02fdd9f87d413169cba9a4d131 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sat, 17 Jun 2023 12:22:37 -0700 Subject: [PATCH 02/11] Harrison/mmr support for opensearch (#6349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mehmet Öner Yalçın --- .../integrations/opensearch.ipynb | 70 ++++------ .../vectorstores/opensearch_vector_search.py | 130 +++++++++++++++--- 2 files changed, 140 insertions(+), 60 deletions(-) diff --git a/docs/extras/modules/data_connection/vectorstores/integrations/opensearch.ipynb b/docs/extras/modules/data_connection/vectorstores/integrations/opensearch.ipynb index 654d9453fd..ee9fa2760e 100644 --- a/docs/extras/modules/data_connection/vectorstores/integrations/opensearch.ipynb +++ b/docs/extras/modules/data_connection/vectorstores/integrations/opensearch.ipynb @@ -129,11 +129,7 @@ "cell_type": "code", "execution_count": null, "id": "db3fa309", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "query = \"What did the president say about Ketanji Brown Jackson\"\n", @@ -144,11 +140,7 @@ "cell_type": "code", "execution_count": null, "id": "c160d5bb", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "print(docs[0].page_content)" @@ -158,11 +150,7 @@ "cell_type": "code", "execution_count": null, "id": "96215c90", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "docsearch = OpenSearchVectorSearch.from_documents(\n", @@ -183,11 +171,7 @@ "cell_type": "code", "execution_count": null, "id": "62a7cea0", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "print(docs[0].page_content)" @@ -207,11 +191,7 @@ "cell_type": "code", "execution_count": null, "id": "0a8e3c0e", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "docsearch = OpenSearchVectorSearch.from_documents(\n", @@ -230,11 +210,7 @@ "cell_type": "code", "execution_count": null, "id": "92bc40db", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "print(docs[0].page_content)" @@ -254,11 +230,7 @@ "cell_type": "code", "execution_count": null, "id": "6d9f436e", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "docsearch = OpenSearchVectorSearch.from_documents(\n", @@ -278,16 +250,34 @@ 
"cell_type": "code", "execution_count": null, "id": "8ca50bce", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "print(docs[0].page_content)" ] }, + { + "cell_type": "markdown", + "source": [ + "### Maximum marginal relevance search (MMR)\n", + "If you’d like to look up for some similar documents, but you’d also like to receive diverse results, MMR is method you should consider. Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10, lambda_param=0.5)" + ], + "metadata": { + "collapsed": false + } + }, { "cell_type": "markdown", "id": "73264864", diff --git a/langchain/vectorstores/opensearch_vector_search.py b/langchain/vectorstores/opensearch_vector_search.py index d33a39429a..dd7b36245d 100644 --- a/langchain/vectorstores/opensearch_vector_search.py +++ b/langchain/vectorstores/opensearch_vector_search.py @@ -4,10 +4,13 @@ from __future__ import annotations import uuid from typing import Any, Dict, Iterable, List, Optional, Tuple -from langchain.docstore.document import Document +import numpy as np + from langchain.embeddings.base import Embeddings +from langchain.schema import Document from langchain.utils import get_from_dict_or_env from langchain.vectorstores.base import VectorStore +from langchain.vectorstores.utils import maximal_marginal_relevance IMPORT_OPENSEARCH_PY_ERROR = ( "Could not import OpenSearch. Please install it with `pip install opensearch-py`." @@ -76,9 +79,12 @@ def _bulk_ingest_embeddings( metadatas: Optional[List[dict]] = None, vector_field: str = "vector_field", text_field: str = "text", - mapping: Dict = {}, + mapping: Optional[Dict] = None, ) -> List[str]: """Bulk Ingest Embeddings into given index.""" + if not mapping: + mapping = dict() + bulk = _import_bulk() not_found_error = _import_not_found_error() requests = [] @@ -201,10 +207,14 @@ def _approximate_search_query_with_lucene_filter( def _default_script_query( query_vector: List[float], space_type: str = "l2", - pre_filter: Dict = MATCH_ALL_QUERY, + pre_filter: Optional[Dict] = None, vector_field: str = "vector_field", ) -> Dict: """For Script Scoring Search, this is the default query.""" + + if not pre_filter: + pre_filter = MATCH_ALL_QUERY + return { "query": { "script_score": { @@ -245,10 +255,14 @@ def __get_painless_scripting_source( def _default_painless_scripting_query( query_vector: List[float], space_type: str = "l2Squared", - pre_filter: Dict = MATCH_ALL_QUERY, + pre_filter: Optional[Dict] = None, vector_field: str = "vector_field", ) -> Dict: """For Painless Scripting Search, this is the default query.""" + + if not pre_filter: + pre_filter = MATCH_ALL_QUERY + source = __get_painless_scripting_source(space_type, query_vector) return { "query": { @@ -355,7 +369,7 @@ class OpenSearchVectorSearch(VectorStore): ) -> List[Document]: """Return docs most similar to query. - By default supports Approximate Search. + By default, supports Approximate Search. Also supports Script Scoring and Painless Scripting. Args: @@ -413,7 +427,7 @@ class OpenSearchVectorSearch(VectorStore): ) -> List[Tuple[Document, float]]: """Return docs and it's scores most similar to query. - By default supports Approximate Search. 
+ By default, supports Approximate Search. Also supports Script Scoring and Painless Scripting. Args: @@ -426,10 +440,47 @@ class OpenSearchVectorSearch(VectorStore): Optional Args: same as `similarity_search` """ - embedding = self.embedding_function.embed_query(query) - search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search") + text_field = _get_kwargs_value(kwargs, "text_field", "text") metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata") + + hits = self._raw_similarity_search_with_score(query=query, k=k, **kwargs) + + documents_with_scores = [ + ( + Document( + page_content=hit["_source"][text_field], + metadata=hit["_source"] + if metadata_field == "*" or metadata_field not in hit["_source"] + else hit["_source"][metadata_field], + ), + hit["_score"], + ) + for hit in hits + ] + return documents_with_scores + + def _raw_similarity_search_with_score( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[dict]: + """Return raw opensearch documents (dict) including vectors, + scores most similar to query. + + By default, supports Approximate Search. + Also supports Script Scoring and Painless Scripting. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of dict with its scores most similar to the query. + + Optional Args: + same as `similarity_search` + """ + embedding = self.embedding_function.embed_query(query) + search_type = _get_kwargs_value(kwargs, "search_type", "approximate_search") vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field") if search_type == "approximate_search": @@ -473,20 +524,59 @@ class OpenSearchVectorSearch(VectorStore): raise ValueError("Invalid `search_type` provided as an argument") response = self.client.search(index=self.index_name, body=search_query) - hits = [hit for hit in response["hits"]["hits"][:k]] - documents_with_scores = [ - ( - Document( - page_content=hit["_source"][text_field], - metadata=hit["_source"] - if metadata_field == "*" or metadata_field not in hit["_source"] - else hit["_source"][metadata_field], - ), - hit["_score"], + + return [hit for hit in response["hits"]["hits"][:k]] + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> list[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + Defaults to 20. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. 
+ """ + + vector_field = _get_kwargs_value(kwargs, "vector_field", "vector_field") + text_field = _get_kwargs_value(kwargs, "text_field", "text") + metadata_field = _get_kwargs_value(kwargs, "metadata_field", "metadata") + + # Get embedding of the user query + embedding = self.embedding_function.embed_query(query) + + # Do ANN/KNN search to get top fetch_k results where fetch_k >= k + results = self._raw_similarity_search_with_score(query, fetch_k, **kwargs) + + embeddings = [result["_source"][vector_field] for result in results] + + # Rerank top k results using MMR, (mmr_selected is a list of indices) + mmr_selected = maximal_marginal_relevance( + np.array(embedding), embeddings, k=k, lambda_mult=lambda_mult + ) + + return [ + Document( + page_content=results[i]["_source"][text_field], + metadata=results[i]["_source"][metadata_field], ) - for hit in hits + for i in mmr_selected ] - return documents_with_scores @classmethod def from_texts( From 2c97fbabbd8c9701af56586119cd464d154c62db Mon Sep 17 00:00:00 2001 From: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Date: Sat, 17 Jun 2023 13:19:27 -0700 Subject: [PATCH 03/11] Update MD header text splitter notebook (#6339) Highlight use case for maintaining header groups when splitting. --- .../markdown_header_metadata.ipynb | 131 ++++++++++++------ 1 file changed, 90 insertions(+), 41 deletions(-) diff --git a/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb b/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb index 36dda9b10f..53f849ecdd 100644 --- a/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb +++ b/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb @@ -7,30 +7,46 @@ "source": [ "# MarkdownHeaderTextSplitter\n", "\n", - "This splits a markdown file by a specified set of headers. For example, if we want to split this markdown:\n", + "Many chat or Q+A applications involve chunking input documents prior to embedding and vector storage.\n", + "\n", + "[These notes](https://www.pinecone.io/learn/chunking-strategies/) from Pinecone provide some useful tips:\n", + "\n", "```\n", - "md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n", + "When a full paragraph or document is embedded, the embedding process considers both the overall context and the relationships between the sentences and phrases within the text. This can result in a more comprehensive vector representation that captures the broader meaning and themes of the text. Larger input text sizes, on the other hand, may introduce noise or dilute the significance of individual sentences or phrases, making finding precise matches when querying the index more difficult.\n", "```\n", + " \n", + "As mentioned, chunking usually uses delimiters or length to keep text with a common context together.\n", + "\n", + "But, in some cases we might want to honor the structure of the document itself.\n", + "\n", + "For example, a markdown file is organized by headers and isolating chunks within header groups is an intuitive idea.\n", + "\n", + "If we mix chunks across header groups, then we may degrade the retrieval quality.\n", + " \n", + "To address this challenge, we can use `MarkdownHeaderTextSplitter` to split a markdown file by a specified set of headers. 
\n", "\n", - "Headers to split on:\n", + "For example, if we want to split this markdown:\n", + "```\n", + "md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n", + "```\n", + " \n", + "We can specify the headers to split on:\n", "```\n", "[(\"#\", \"Header 1\"),(\"##\", \"Header 2\")]\n", "```\n", "\n", - "Expected output:\n", + "And content is grouped or split by common headers:\n", "```\n", "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n", "```\n", "\n", - "Optionally, this also includes `return_each_line` in case a user want to perform other types of aggregation. \n", - "\n", - "If `return_each_line=True`, each line and associated header metadata are simply returned. " + "Let's have a look at some examples below." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "19c044f0", "metadata": {}, "outputs": [], @@ -40,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "id": "2ae3649b", "metadata": {}, "outputs": [ @@ -64,63 +80,96 @@ "]\n", "\n", "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", - "splits = markdown_splitter.split_text(markdown_document)\n", - "for split in splits:\n", + "md_header_splits = markdown_splitter.split_text(markdown_document)\n", + "for split in md_header_splits:\n", " print(split)" ] }, { "cell_type": "markdown", - "id": "2a32026a", + "id": "9bd8977a", "metadata": {}, "source": [ - "Here's an example on a larger file with `return_each_line=True` passed, allowing each line to be examined." + "Within each markdown group we can then apply any splitter we want. \n", + "\n", + "Now, we can ensure that the splits are constrained to common header groups and we can keep the headers in the metadata!" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "8af8f9a2", + "execution_count": 5, + "id": "480e0e3a", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'content': 'Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. 
John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n", - "{'content': 'Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n", - "{'content': 'As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n", - "{'content': 'additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n", - "{'content': 'From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence', 'Header 4': 'Standardization'}}\n", - "{'content': 'Implementations of Markdown are available for over a dozen programming languages.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Implementations'}}\n" - ] - } - ], + "outputs": [], "source": [ "markdown_document = \"# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. 
\\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n", "\n", "headers_to_split_on = [\n", " (\"#\", \"Header 1\"),\n", " (\"##\", \"Header 2\"),\n", - " (\"###\", \"Header 3\"),\n", - " (\"####\", \"Header 4\"),\n", "]\n", "\n", - "markdown_splitter = MarkdownHeaderTextSplitter(\n", - " headers_to_split_on=headers_to_split_on, return_each_line=True\n", - ")\n", - "splits = markdown_splitter.split_text(markdown_document)\n", - "for line in splits:\n", - " print(line)" + "# MD splits\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", + "md_header_splits = markdown_splitter.split_text(markdown_document)\n", + "\n", + "# Char-level splits\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "chunk_size = 10\n", + "chunk_overlap = 0\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", + "\n", + "# Split within each header group\n", + "all_splits=[]\n", + "all_metadatas=[] \n", + "for header_group in md_header_splits:\n", + " _splits = text_splitter.split_text(header_group['content'])\n", + " _metadatas = [header_group['metadata'] for _ in _splits]\n", + " all_splits += _splits\n", + " all_metadatas += _metadatas" ] }, { "cell_type": "code", - "execution_count": null, - "id": "987183f2", + "execution_count": 6, + "id": "3f5d775e", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "'Markdown[9'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_splits[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "33ab0d5c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Header 1': 'Intro', 'Header 2': 'History'}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_metadatas[0]" + ] } ], "metadata": { From 370becdfc2dea35eab6b56244872001116d24f0b Mon Sep 17 00:00:00 2001 From: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Date: Sat, 17 Jun 2023 21:40:20 -0700 Subject: [PATCH 04/11] Add self query retriever example with MD header splitting (#6359) Flesh out the notebook example for `MarkdownHeaderTextSplitter` --- .../markdown_header_metadata.ipynb | 345 +++++++++++++++++- 1 file changed, 328 insertions(+), 17 deletions(-) diff --git a/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb b/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb index 53f849ecdd..9d568cfa7b 100644 --- a/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb +++ b/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb @@ -7,23 +7,27 @@ "source": [ "# MarkdownHeaderTextSplitter\n", "\n", + "### Motivation\n", + "\n", "Many chat or Q+A applications involve chunking input documents prior to embedding and vector storage.\n", "\n", "[These notes](https://www.pinecone.io/learn/chunking-strategies/) from Pinecone provide some useful tips:\n", "\n", "```\n", - "When a full paragraph or document is embedded, the embedding process considers both the overall context and the relationships between the sentences and phrases within the text. 
This can result in a more comprehensive vector representation that captures the broader meaning and themes of the text. Larger input text sizes, on the other hand, may introduce noise or dilute the significance of individual sentences or phrases, making finding precise matches when querying the index more difficult.\n", + "When a full paragraph or document is embedded, the embedding process considers both the overall context and the relationships between the sentences and phrases within the text. This can result in a more comprehensive vector representation that captures the broader meaning and themes of the text.\n", "```\n", " \n", - "As mentioned, chunking usually uses delimiters or length to keep text with a common context together.\n", + "As mentioned, chunking often aims to keep text with common context together.\n", "\n", - "But, in some cases we might want to honor the structure of the document itself.\n", + "With this in mind, we might want to specifically honor the structure of the document itself.\n", "\n", - "For example, a markdown file is organized by headers and isolating chunks within header groups is an intuitive idea.\n", + "For example, a markdown file is organized by headers.\n", "\n", - "If we mix chunks across header groups, then we may degrade the retrieval quality.\n", - " \n", - "To address this challenge, we can use `MarkdownHeaderTextSplitter` to split a markdown file by a specified set of headers. \n", + "Creating chunks within specific header groups is an intuitive idea.\n", + "\n", + "To address this challenge, we can use `MarkdownHeaderTextSplitter`.\n", + "\n", + "This will split a markdown file by a specified set of headers. \n", "\n", "For example, if we want to split this markdown:\n", "```\n", @@ -46,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "19c044f0", "metadata": {}, "outputs": [], @@ -56,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "2ae3649b", "metadata": {}, "outputs": [ @@ -90,14 +94,12 @@ "id": "9bd8977a", "metadata": {}, "source": [ - "Within each markdown group we can then apply any splitter we want. \n", - "\n", - "Now, we can ensure that the splits are constrained to common header groups and we can keep the headers in the metadata!" + "Within each markdown group we can then apply any text splitter we want. 
" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "480e0e3a", "metadata": {}, "outputs": [], @@ -131,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "3f5d775e", "metadata": {}, "outputs": [ @@ -141,7 +143,7 @@ "'Markdown[9'" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -152,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "33ab0d5c", "metadata": {}, "outputs": [ @@ -162,7 +164,7 @@ "{'Header 1': 'Intro', 'Header 2': 'History'}" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -170,6 +172,315 @@ "source": [ "all_metadatas[0]" ] + }, + { + "cell_type": "markdown", + "id": "dcf70760", + "metadata": {}, + "source": [ + "### Use case\n", + "\n", + "Let's appy `MarkdownHeaderTextSplitter` to a Notion page [here](https://rlancemartin.notion.site/Auto-Evaluation-of-Metadata-Filtering-18502448c85240828f33716740f9574b?pvs=4) as a test.\n", + "\n", + "The page is downloaded as markdown and stored locally as shown [here](https://python.langchain.com/docs/modules/data_connection/document_loaders/integrations/notion)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "73313d6c", + "metadata": {}, + "outputs": [], + "source": [ + "# Load Notion database as a markdownfile file\n", + "from langchain.document_loaders import NotionDirectoryLoader\n", + "loader = NotionDirectoryLoader(\"../Notion_DB_Metadata\")\n", + "docs = loader.load()\n", + "md_file=docs[0].page_content" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "6fa341d7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'content': 'We previously introduced [auto-evaluator](https://blog.langchain.dev/auto-evaluator-opportunities/), an open-source tool for grading LLM question-answer chains. Here, we extend auto-evaluator with a [lightweight Streamlit app](https://github.com/langchain-ai/auto-evaluator/tree/main/streamlit) that can connect to any existing Pinecone index. We add the ability to test metadata filtering using `SelfQueryRetriever` as well as some other approaches that we’ve found to be useful, as discussed below. \\n[ret_trim.mov](Auto-Evaluation%20of%20Metadata%20Filtering%2018502448c85240828f33716740f9574b/ret_trim.mov)',\n", + " 'metadata': {'Section': 'Evaluation'}}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Let's create groups based on the section headers\n", + "headers_to_split_on = [\n", + " (\"###\", \"Section\"),\n", + "]\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", + "md_header_splits = markdown_splitter.split_text(md_file)\n", + "md_header_splits[3]" + ] + }, + { + "cell_type": "markdown", + "id": "42d8bb9b", + "metadata": {}, + "source": [ + "Now, we split the text in each group and keep the group as metadata." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "a9831de2", + "metadata": {}, + "outputs": [], + "source": [ + "# Define our text splitter\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "chunk_size = 500\n", + "chunk_overlap = 50\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", + " \n", + "# Create splits within each header group\n", + "all_splits=[]\n", + "all_metadatas=[]\n", + "for header_group in md_header_splits:\n", + " _splits = text_splitter.split_text(header_group['content'])\n", + " _metadatas = [header_group['metadata'] for _ in _splits]\n", + " all_splits += _splits\n", + " all_metadatas += _metadatas" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "b5691ee5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'In these cases, semantic search will look for the concept `episode 53` in the chunks, but instead we simply want to filter the chunks for `episode 53` and then perform semantic search to extract those that best summarize the episode. Metadata filtering does this, so long as we 1) we have a metadata filter for episode number and 2) we can extract the value from the query (e.g., `54` or `252`) that we want to extract. The LangChain `SelfQueryRetriever` does the latter (see'" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_splits[6]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "e1dfb405", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Section': 'Motivation'}" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_metadatas[6]" + ] + }, + { + "cell_type": "markdown", + "id": "79868606", + "metadata": {}, + "source": [ + "This sets us up well do perform metadata filtering based on the document structure.\n", + "\n", + "Let's bring this all togther by building a vectorstore first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "143d7347", + "metadata": {}, + "outputs": [], + "source": [ + "! pip install chromadb" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "cbcb917a", + "metadata": {}, + "outputs": [], + "source": [ + "# Build vectorstore\n", + "from langchain.vectorstores import Chroma\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "embeddings = OpenAIEmbeddings()\n", + "vectorstore = Chroma.from_texts(texts=all_splits,metadatas=all_metadatas,embedding=OpenAIEmbeddings())" + ] + }, + { + "cell_type": "markdown", + "id": "3f6031fc", + "metadata": {}, + "source": [ + "Let's create a `SelfQueryRetriever` that can filter based upon metadata we defined." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "5b1b6a75", + "metadata": {}, + "outputs": [], + "source": [ + "# Create retriever \n", + "from langchain.llms import OpenAI\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n", + "\n", + "# Define our metadata\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"Section\",\n", + " description=\"Headers of the markdown document that organize the ideas\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + "]\n", + "document_content_description = \"Headers of the markdown document\"\n", + "\n", + "# Define self query retriver\n", + "llm = OpenAI(temperature=0)\n", + "sq_retriever = SelfQueryRetriever.from_llm(llm, vectorstore, document_content_description, metadata_field_info, verbose=True)" + ] + }, + { + "cell_type": "markdown", + "id": "9d0dbed8", + "metadata": {}, + "source": [ + "Now we can fetch chunks specifically from any section of the doc!" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "6c37fe1b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='Introduction' filter=Comparison(comparator=, attribute='Section', value='Introduction') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='![Untitled](Auto-Evaluation%20of%20Metadata%20Filtering%2018502448c85240828f33716740f9574b/Untitled.png)', metadata={'Section': 'Introduction'}),\n", + " Document(page_content='Q+A systems often use a two-step approach: retrieve relevant text chunks and then synthesize them into an answer. There many ways to approach this. For example, we recently [discussed](https://blog.langchain.dev/auto-evaluation-of-anthropic-100k-context-window/) the Retriever-Less option (at bottom in the below diagram), highlighting the Anthropic 100k context window model. Metadata filtering is an alternative approach that pre-filters chunks based on a user-defined criteria in a VectorDB using', metadata={'Section': 'Introduction'}),\n", + " Document(page_content='on a user-defined criteria in a VectorDB using metadata tags prior to semantic search.', metadata={'Section': 'Introduction'})]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Test\n", + "question=\"Summarize the Introduction section of the document\"\n", + "sq_retriever.get_relevant_documents(question)" + ] + }, + { + "cell_type": "markdown", + "id": "bb0efebd", + "metadata": {}, + "source": [ + "Now, we can create chat or Q+A apps that are aware of the explict document structure. \n", + "\n", + "Of course, semantic search without specific metadata filtering would probably work reasonably well for this simple document.\n", + "\n", + "But, the ability to retain document structure for metadata filtering can be helpful for more complicated or longer documents." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "3b40e24e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='Introduction' filter=Comparison(comparator=, attribute='Section', value='Introduction') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "'The document discusses different approaches to retrieve relevant text chunks and synthesize them into an answer in Q+A systems. 
One of the approaches is metadata filtering, which pre-filters chunks based on user-defined criteria in a VectorDB using metadata tags prior to semantic search. The Retriever-Less option, which uses the Anthropic 100k context window model, is also mentioned as an alternative approach.'" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.chains import RetrievalQA\n", + "from langchain.chat_models import ChatOpenAI\n", + "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n", + "qa_chain = RetrievalQA.from_chain_type(llm,retriever=sq_retriever)\n", + "qa_chain.run(question)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "dfeeb327", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='Testing' filter=Comparison(comparator=, attribute='Section', value='Testing') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "'The Testing section of the document describes how the performance of the SelfQueryRetriever was evaluated using various test cases. The tests were designed to evaluate the ability of the SelfQueryRetriever to correctly infer metadata filters from the query using metadata_field_info. The results of the tests showed that the SelfQueryRetriever performed well in some cases, but failed in others. The document also provides a link to the code for the auto-evaluator and instructions on how to use it. Additionally, the document mentions the use of the Kor library for structured data extraction to explicitly specify transformations that the auto-evaluator can use.'" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "question=\"Summarize the Testing section of the document\"\n", + "qa_chain.run(question)" + ] } ], "metadata": { From ec850e607f480abbfa048f5370dd4635a0627f48 Mon Sep 17 00:00:00 2001 From: Davis Chase <130488702+dev2049@users.noreply.github.com> Date: Sun, 18 Jun 2023 09:20:47 -0700 Subject: [PATCH 05/11] bump 203 (#6372) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4f0bce6a4b..d818ef64a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain" -version = "0.0.202" +version = "0.0.203" description = "Building applications with LLMs through composability" authors = [] license = "MIT" From ebfffaa38f99438df49d4bfe43b7e3b97d0a6fe4 Mon Sep 17 00:00:00 2001 From: rafael Date: Sun, 18 Jun 2023 19:50:20 +0200 Subject: [PATCH 06/11] Guardrails output parser: Pass LLM api for reasking (#6089) Fixes https://github.com/ShreyaR/guardrails/issues/155 Enables guardrails reasking by specifying an LLM api in the output parser. 
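A minimal usage sketch (illustrative only; the rail file, engine, and re-ask count below are placeholder values, not part of this diff):

```python
from langchain.output_parsers import GuardrailsOutputParser
import openai

# Pass the LLM API callable (plus any extra kwargs it needs) so that Guardrails
# can call the model again when output validation fails and a re-ask is needed.
output_parser = GuardrailsOutputParser.from_rail(
    "my_spec.rail",                  # placeholder rail spec
    num_reasks=2,
    api=openai.Completion.create,    # callable Guardrails uses for the re-ask
    engine="text-davinci-003",       # forwarded to the API call via **kwargs
)

# output_parser.parse(llm_output) will now re-ask through the provided API
# instead of failing when the first response does not validate.
```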
--- langchain/output_parsers/rail_parser.py | 37 +++++++++++++++++++++---- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/langchain/output_parsers/rail_parser.py b/langchain/output_parsers/rail_parser.py index 0dab50d9a8..cd02b33733 100644 --- a/langchain/output_parsers/rail_parser.py +++ b/langchain/output_parsers/rail_parser.py @@ -1,19 +1,29 @@ from __future__ import annotations -from typing import Any, Dict +from typing import Any, Callable, Dict, Optional from langchain.schema import BaseOutputParser class GuardrailsOutputParser(BaseOutputParser): guard: Any + api: Optional[Callable] + args: Any + kwargs: Any @property def _type(self) -> str: return "guardrails" @classmethod - def from_rail(cls, rail_file: str, num_reasks: int = 1) -> GuardrailsOutputParser: + def from_rail( + cls, + rail_file: str, + num_reasks: int = 1, + api: Optional[Callable] = None, + *args: Any, + **kwargs: Any, + ) -> GuardrailsOutputParser: try: from guardrails import Guard except ImportError: @@ -21,11 +31,21 @@ class GuardrailsOutputParser(BaseOutputParser): "guardrails-ai package not installed. " "Install it by running `pip install guardrails-ai`." ) - return cls(guard=Guard.from_rail(rail_file, num_reasks=num_reasks)) + return cls( + guard=Guard.from_rail(rail_file, num_reasks=num_reasks), + api=api, + args=args, + kwargs=kwargs, + ) @classmethod def from_rail_string( - cls, rail_str: str, num_reasks: int = 1 + cls, + rail_str: str, + num_reasks: int = 1, + api: Optional[Callable] = None, + *args: Any, + **kwargs: Any, ) -> GuardrailsOutputParser: try: from guardrails import Guard @@ -34,10 +54,15 @@ class GuardrailsOutputParser(BaseOutputParser): "guardrails-ai package not installed. " "Install it by running `pip install guardrails-ai`." ) - return cls(guard=Guard.from_rail_string(rail_str, num_reasks=num_reasks)) + return cls( + guard=Guard.from_rail_string(rail_str, num_reasks=num_reasks), + api=api, + args=args, + kwargs=kwargs, + ) def get_format_instructions(self) -> str: return self.guard.raw_prompt.format_instructions def parse(self, text: str) -> Dict: - return self.guard.parse(text) + return self.guard.parse(text, llm_api=self.api, *self.args, **self.kwargs) From a8cb9ee013f1aea9d700e917cba6f254145f696f Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 18 Jun 2023 11:07:23 -0700 Subject: [PATCH 07/11] Harrison/gdrive enhancements (#6375) Co-authored-by: Matt Robinson --- .../integrations/google_drive.ipynb | 133 +++++++++++++++++- langchain/document_loaders/googledrive.py | 48 ++++--- 2 files changed, 162 insertions(+), 19 deletions(-) diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb index e53461306e..fe59db46ce 100644 --- a/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb +++ b/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "b0ed136e-6983-4893-ae1b-b75753af05f8", "metadata": {}, @@ -78,7 +77,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2721ba8a", "metadata": {}, @@ -99,6 +97,135 @@ " recursive=False\n", ")" ] + }, + { + "cell_type": "markdown", + "id": "d6b80931", + "metadata": {}, + "source": [ + "## Passing in Optional File Loaders\n", + "\n", + "When processing files other than Google Docs and Google Sheets, it can be helpful to pass an optional file loader to 
`GoogleDriveLoader`. If you pass in a file loader, that file loader will be used on documents that do not have a Google Docs or Google Sheets MIME type. Here is an example of how to load an Excel document from Google Drive using a file loader. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "94207e39", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import GoogleDriveLoader\n", + "from langchain.document_loaders import UnstructuredFileIOLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a15fbee0", + "metadata": {}, + "outputs": [], + "source": [ + "file_id=\"1x9WBtFPWMEAdjcJzPScRsjpjQvpSo_kz\"\n", + "loader = GoogleDriveLoader(\n", + " file_ids=[file_id],\n", + " file_loader_cls=UnstructuredFileIOLoader,\n", + " file_loader_kwargs={\"mode\": \"elements\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "98410bda", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e3e72221", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "markdown", + "id": "238cd06f", + "metadata": {}, + "source": [ + "You can also process a folder with a mix of files and Google Docs/Sheets using the following pattern:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0e2d093f", + "metadata": {}, + "outputs": [], + "source": [ + "folder_id=\"1asMOHY1BqBS84JcRbOag5LOJac74gpmD\"\n", + "loader = GoogleDriveLoader(\n", + " folder_id=folder_id,\n", + " file_loader_cls=UnstructuredFileIOLoader,\n", + " file_loader_kwargs={\"mode\": \"elements\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b35ddcc6", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3cc141e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e312268a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -117,7 +244,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/langchain/document_loaders/googledrive.py b/langchain/document_loaders/googledrive.py index c28bf705bc..5e21d401ed 100644 --- a/langchain/document_loaders/googledrive.py +++ b/langchain/document_loaders/googledrive.py @@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel): recursive: bool = False file_types: Optional[Sequence[str]] = None load_trashed_files: bool = False + # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently + # results in pydantic validation errors + file_loader_cls: Any = None + file_loader_kwargs: Dict["str", Any] = {} @root_validator def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]: @@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel): returns.append(self._load_document_from_id(file["id"])) # type: ignore elif file["mimeType"] == "application/vnd.google-apps.spreadsheet": returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore - elif file["mimeType"] == "application/pdf": + elif ( + file["mimeType"] == "application/pdf" + or self.file_loader_cls is not None + ): returns.extend(self._load_file_from_id(file["id"])) # type: ignore else: pass @@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel): done = False while done is False: status, done = downloader.next_chunk() - content = fh.getvalue() - from PyPDF2 import PdfReader + if self.file_loader_cls is not None: + fh.seek(0) + loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs) + docs = loader.load() + for doc in docs: + doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view" + return docs - pdf_reader = PdfReader(BytesIO(content)) - - return [ - Document( - page_content=page.extract_text(), - metadata={ - "source": f"https://drive.google.com/file/d/{id}/view", - "title": f"{file.get('name')}", - "page": i, - }, - ) - for i, page in enumerate(pdf_reader.pages) - ] + else: + from PyPDF2 import PdfReader + + content = fh.getvalue() + pdf_reader = PdfReader(BytesIO(content)) + + return [ + Document( + page_content=page.extract_text(), + metadata={ + "source": f"https://drive.google.com/file/d/{id}/view", + "title": f"{file.get('name')}", + "page": i, + }, + ) + for i, page in enumerate(pdf_reader.pages) + ] def _load_file_from_ids(self) -> List[Document]: """Load files from a list of IDs.""" From e0dea577eeaf30d81788be00c10f1036935a9d72 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome Date: Sun, 18 Jun 2023 20:18:33 +0200 Subject: [PATCH 08/11] Extend `ArgillaCallbackHandler` support (#6153) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hi again @agola11! 🤗 ## What's in this PR? After playing around with different chains we noticed that some chains were using different `output_key`s and we were just handling some, so we've extended the support to any output, either if it's a Python list or a string. Kudos to @dvsrepo for spotting this! 
--------- Co-authored-by: Daniel Vila Suero --- langchain/callbacks/argilla_callback.py | 83 +++++++++++++++---------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/langchain/callbacks/argilla_callback.py b/langchain/callbacks/argilla_callback.py index 8c866d3337..1d550461fa 100644 --- a/langchain/callbacks/argilla_callback.py +++ b/langchain/callbacks/argilla_callback.py @@ -220,7 +220,11 @@ class ArgillaCallbackHandler(BaseCallbackHandler): def on_chain_start( self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any ) -> None: - """Do nothing when LLM chain starts.""" + """If the key `input` is in `inputs`, then save it in `self.prompts` using + either the `parent_run_id` or the `run_id` as the key. This is done so that + we don't log the same input prompt twice, once when the LLM starts and once + when the chain starts. + """ if "input" in inputs: self.prompts.update( { @@ -233,44 +237,55 @@ class ArgillaCallbackHandler(BaseCallbackHandler): ) def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None: - """Do nothing when LLM chain ends.""" - prompts = self.prompts[str(kwargs["parent_run_id"] or kwargs["run_id"])] - if "outputs" in outputs: - # Creates the records and adds them to the `FeedbackDataset` - self.dataset.add_records( - records=[ - { - "fields": { - "prompt": prompt, - "response": output["text"].strip(), - }, - } - for prompt, output in zip(prompts, outputs["outputs"]) - ] - ) - elif "output" in outputs: - # Creates the records and adds them to the `FeedbackDataset` - self.dataset.add_records( - records=[ - { - "fields": { - "prompt": " ".join(prompts), - "response": outputs["output"].strip(), - }, - } - ] - ) - else: - raise ValueError( - "The `outputs` dictionary did not contain the expected keys `outputs` " - "or `output`." - ) + """If either the `parent_run_id` or the `run_id` is in `self.prompts`, then + log the outputs to Argilla, and pop the run from `self.prompts`. The behavior + differs if the output is a list or not. 
+ """ + if not any( + key in self.prompts + for key in [str(kwargs["parent_run_id"]), str(kwargs["run_id"])] + ): + return + prompts = self.prompts.get(str(kwargs["parent_run_id"])) or self.prompts.get( + str(kwargs["run_id"]) + ) + for chain_output_key, chain_output_val in outputs.items(): + if isinstance(chain_output_val, list): + # Creates the records and adds them to the `FeedbackDataset` + self.dataset.add_records( + records=[ + { + "fields": { + "prompt": prompt, + "response": output["text"].strip(), + }, + } + for prompt, output in zip( + prompts, chain_output_val # type: ignore + ) + ] + ) + else: + # Creates the records and adds them to the `FeedbackDataset` + self.dataset.add_records( + records=[ + { + "fields": { + "prompt": " ".join(prompts), # type: ignore + "response": chain_output_val.strip(), + }, + } + ] + ) # Push the records to Argilla self.dataset.push_to_argilla() # Pop current run from `self.runs` - self.prompts.pop(str(kwargs["parent_run_id"] or kwargs["run_id"])) + if str(kwargs["parent_run_id"]) in self.prompts: + self.prompts.pop(str(kwargs["parent_run_id"])) + if str(kwargs["run_id"]) in self.prompts: + self.prompts.pop(str(kwargs["run_id"])) def on_chain_error( self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any From 2b3b4e0f600285d139d4e10a498530810d15b288 Mon Sep 17 00:00:00 2001 From: Vijay Date: Sun, 18 Jun 2023 22:19:56 +0200 Subject: [PATCH 09/11] Add the ability to run the map_reduce chains process results step as async (#6181) This will add the ability to add an AsyncCallbackManager (handler) for the reducer chain, which would be able to stream the tokens via the `async def on_llm_new_token` callback method Fixes # (issue) [5532](https://github.com/hwchase17/langchain/issues/5532) @hwchase17 @agola11 The following code snippet explains how this change would be used to enable `reduce_llm` with streaming support in a `map_reduce` chain I have tested this change and it works for the streaming use-case of reducer responses. I am happy to share more information if this makes solution sense. ``` AsyncHandler .......................... class StreamingLLMCallbackHandler(AsyncCallbackHandler): """Callback handler for streaming LLM responses.""" def __init__(self, websocket): self.websocket = websocket # This callback method is to be executed in async async def on_llm_new_token(self, token: str, **kwargs: Any) -> None: resp = ChatResponse(sender="bot", message=token, type="stream") await self.websocket.send_json(resp.dict()) Chain .......... 
stream_handler = StreamingLLMCallbackHandler(websocket) stream_manager = AsyncCallbackManager([stream_handler]) streaming_llm = ChatOpenAI( streaming=True, callback_manager=stream_manager, verbose=False, temperature=0, ) main_llm = OpenAI( temperature=0, verbose=False, ) doc_chain = load_qa_chain( llm=main_llm, reduce_llm=streaming_llm, chain_type="map_reduce", callback_manager=manager ) qa_chain = ConversationalRetrievalChain( retriever=vectorstore.as_retriever(), combine_docs_chain=doc_chain, question_generator=question_generator, callback_manager=manager, ) # Here `acall` will trigger `acombine_docs` on `map_reduce` which should then call `_aprocess_result` which in turn will call `self.combine_document_chain.arun` hence async callback will be awaited result = await qa_chain.acall( {"question": question, "chat_history": chat_history} ) ``` --- .../chains/combine_documents/map_reduce.py | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/langchain/chains/combine_documents/map_reduce.py b/langchain/chains/combine_documents/map_reduce.py index 06e87e0392..84e49296a5 100644 --- a/langchain/chains/combine_documents/map_reduce.py +++ b/langchain/chains/combine_documents/map_reduce.py @@ -163,16 +163,18 @@ class MapReduceDocumentsChain(BaseCombineDocumentsChain): [{**{self.document_variable_name: d.page_content}, **kwargs} for d in docs], callbacks=callbacks, ) - return self._process_results(results, docs, callbacks=callbacks, **kwargs) + return await self._aprocess_results( + results, docs, callbacks=callbacks, **kwargs + ) - def _process_results( + def _process_results_common( self, results: List[Dict], docs: List[Document], token_max: int = 3000, callbacks: Callbacks = None, **kwargs: Any, - ) -> Tuple[str, dict]: + ) -> Tuple[List[Document], dict]: question_result_key = self.llm_chain.output_key result_docs = [ Document(page_content=r[question_result_key], metadata=docs[i].metadata) @@ -201,11 +203,39 @@ class MapReduceDocumentsChain(BaseCombineDocumentsChain): extra_return_dict = {"intermediate_steps": _results} else: extra_return_dict = {} + return result_docs, extra_return_dict + + def _process_results( + self, + results: List[Dict], + docs: List[Document], + token_max: int = 3000, + callbacks: Callbacks = None, + **kwargs: Any, + ) -> Tuple[str, dict]: + result_docs, extra_return_dict = self._process_results_common( + results, docs, token_max, callbacks=callbacks, **kwargs + ) output = self.combine_document_chain.run( input_documents=result_docs, callbacks=callbacks, **kwargs ) return output, extra_return_dict + async def _aprocess_results( + self, + results: List[Dict], + docs: List[Document], + callbacks: Callbacks = None, + **kwargs: Any, + ) -> Tuple[str, dict]: + result_docs, extra_return_dict = self._process_results_common( + results, docs, callbacks=callbacks, **kwargs + ) + output = await self.combine_document_chain.arun( + input_documents=result_docs, callbacks=callbacks, **kwargs + ) + return output, extra_return_dict + @property def _chain_type(self) -> str: return "map_reduce_documents_chain" From 4fc7939848a600064dc20b44e86c19e2cfa01491 Mon Sep 17 00:00:00 2001 From: xleven Date: Mon, 19 Jun 2023 06:08:12 +0800 Subject: [PATCH 10/11] fix link of callbacks on modules page (#6323) Since [Callbacks](https://python.langchain.com/docs/modules/callbacks/getting_started/) on [Modules](https://python.langchain.com/docs/modules/) went to a "Page Not Found". 
---
 docs/docs_skeleton/docs/modules/index.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs_skeleton/docs/modules/index.mdx b/docs/docs_skeleton/docs/modules/index.mdx
index dfa94de6a8..ac6b36a51c 100644
--- a/docs/docs_skeleton/docs/modules/index.mdx
+++ b/docs/docs_skeleton/docs/modules/index.mdx
@@ -16,5 +16,5 @@ Construct sequences of calls
 Let chains choose which tools to use given high-level directives
 #### [Memory](/docs/modules/memory/)
 Persist application state between runs of a chain
-#### [Callbacks](/docs/modules/callbacks/getting_started/)
+#### [Callbacks](/docs/modules/callbacks/)
 Log and stream intermediate steps of any chain
\ No newline at end of file

From 5be465bd86f940cf831e3a4d2841d92ce8699ffb Mon Sep 17 00:00:00 2001
From: MIDORIBIN
Date: Mon, 19 Jun 2023 08:39:57 +0900
Subject: [PATCH 11/11] Fixed PermissionError on Windows (#6170)

Fixed the PermissionError that occurred when downloading PDF files via HTTP in BasePDFLoader on Windows.

When downloading PDF files via HTTP, BasePDFLoader uses NamedTemporaryFile. On **Windows**, such a file cannot be opened a second time while it is still open ([Python Doc](https://docs.python.org/3.9/library/tempfile.html#tempfile.NamedTemporaryFile)). So we now create a **temporary directory** with TemporaryDirectory and place the downloaded file there. The temporary directory is deleted in the destructor.

Fixes #2698

#### Who can review?

Tag maintainers/contributors who might be interested:
- @eyurtsev
- @hwchase17
---
 langchain/document_loaders/pdf.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py
index 5f204f1f7f..1d334e097a 100644
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@@ -62,15 +62,17 @@ class BasePDFLoader(BaseLoader, ABC):
                 )
 
             self.web_path = self.file_path
-            self.temp_file = tempfile.NamedTemporaryFile()
-            self.temp_file.write(r.content)
-            self.file_path = self.temp_file.name
+            self.temp_dir = tempfile.TemporaryDirectory()
+            temp_pdf = Path(self.temp_dir.name) / "tmp.pdf"
+            with open(temp_pdf, mode="wb") as f:
+                f.write(r.content)
+            self.file_path = str(temp_pdf)
         elif not os.path.isfile(self.file_path):
             raise ValueError("File path %s is not a valid file or url" % self.file_path)
 
     def __del__(self) -> None:
-        if hasattr(self, "temp_file"):
-            self.temp_file.close()
+        if hasattr(self, "temp_dir"):
+            self.temp_dir.cleanup()
 
     @staticmethod
     def _is_valid_url(url: str) -> bool:
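A standalone sketch of the approach this patch takes (illustrative only; the bytes below stand in for the downloaded `r.content`):

```python
import tempfile
from pathlib import Path

# Unlike a still-open NamedTemporaryFile on Windows, a plain file written into a
# TemporaryDirectory can be re-opened by path as many times as needed.
with tempfile.TemporaryDirectory() as tmp_dir:
    temp_pdf = Path(tmp_dir) / "tmp.pdf"
    temp_pdf.write_bytes(b"%PDF-1.4\n% placeholder bytes\n")  # stand-in for r.content
    with open(temp_pdf, "rb") as f:  # re-opening by path works on any platform
        print(f.read(8))
# the directory and the file inside it are removed when the context manager exits
```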