diff --git a/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb b/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb index bff85c09..860a8977 100644 --- a/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb +++ b/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb @@ -50,8 +50,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "19c044f0", + "execution_count": 2, + "id": "ceb3c1fb", "metadata": {}, "outputs": [], "source": [ @@ -65,13 +65,16 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", - "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n", - "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" - ] + "data": { + "text/plain": [ + "[Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n", + " Document(page_content='Hi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n", + " Document(page_content='Hi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -85,83 +88,52 @@ "\n", "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", "md_header_splits = markdown_splitter.split_text(markdown_document)\n", - "for split in md_header_splits:\n", - " print(split)" - ] - }, - { - "cell_type": "markdown", - "id": "9bd8977a", - "metadata": {}, - "source": [ - "Within each markdown group we can then apply any text splitter we want. 
" + "md_header_splits" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "480e0e3a", - "metadata": {}, - "outputs": [], - "source": [ - "markdown_document = \"# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n", - "\n", - "headers_to_split_on = [\n", - " (\"#\", \"Header 1\"),\n", - " (\"##\", \"Header 2\"),\n", - "]\n", - "\n", - "# MD splits\n", - "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", - "md_header_splits = markdown_splitter.split_text(markdown_document)\n", - "\n", - "# Char-level splits\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "chunk_size = 10\n", - "chunk_overlap = 0\n", - "text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", - "\n", - "# Split within each header group\n", - "all_splits=[]\n", - "all_metadatas=[] \n", - "for header_group in md_header_splits:\n", - " _splits = text_splitter.split_text(header_group['content'])\n", - " _metadatas = [header_group['metadata'] for _ in _splits]\n", - " all_splits += _splits\n", - " 
all_metadatas += _metadatas" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "3f5d775e", + "execution_count": 4, + "id": "aac1738c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'Markdown[9'" + "langchain.schema.Document" ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_splits[0]" + "type(md_header_splits[0])" + ] + }, + { + "cell_type": "markdown", + "id": "9bd8977a", + "metadata": {}, + "source": [ + "Within each markdown group we can then apply any text splitter we want. " ] }, { "cell_type": "code", "execution_count": 8, - "id": "33ab0d5c", + "id": "480e0e3a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'Header 1': 'Intro', 'Header 2': 'History'}" + "[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n", + " Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n", + " Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. 
\\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n", + " Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n", + " Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]" ] }, "execution_count": 8, @@ -170,7 +142,26 @@ } ], "source": [ - "all_metadatas[0]" + "markdown_document = \"# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. 
\\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n", + "\n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + "]\n", + "\n", + "# MD splits\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", + "md_header_splits = markdown_splitter.split_text(markdown_document)\n", + "\n", + "# Char-level splits\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "chunk_size = 250\n", + "chunk_overlap = 30\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", + "\n", + "# Split\n", + "splits = text_splitter.split_documents(md_header_splits)\n", + "splits" ] } ], diff --git a/docs/extras/use_cases/question_answering/document-context-aware-QA.ipynb b/docs/extras/use_cases/question_answering/document-context-aware-QA.ipynb index 6f958989..9bdccb23 100644 --- a/docs/extras/use_cases/question_answering/document-context-aware-QA.ipynb +++ b/docs/extras/use_cases/question_answering/document-context-aware-QA.ipynb @@ -9,11 +9,11 @@ "\n", "Text splitting for vector storage often uses sentences or other delimiters [to keep related text together](https://www.pinecone.io/learn/chunking-strategies/). \n", "\n", - "But many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting. \n", + "But many documents (such as `Markdown` files) have structure (headers) that can be explicitly used in splitting. \n", "\n", - "We added a new text splitter for Markdown files that lets a user split based specified headers. \n", + "The `MarkdownHeaderTextSplitter` lets a user split `Markdown` files based on specified headers. 
\n", "\n", - "This results in chunks that retain the header(s) that it came from (e.g., Introduction) in the chunk metadata.\n", + "This results in chunks that retain the header(s) that it came from in the metadata.\n", "\n", "This works nicely w/ `SelfQueryRetriever`.\n", "\n", @@ -30,19 +30,10 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "cda52c2c", + "execution_count": null, + "id": "2e587f65", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/31treehaus/miniconda3/envs/langchain-new/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.4) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "# Load Notion page as a markdownfile file\n", "from langchain.document_loaders import NotionDirectoryLoader\n", @@ -54,22 +45,10 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "730b84f2", + "execution_count": null, + "id": "1cd3fd7e", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'content': 'We previously introduced [auto-evaluator](https://blog.langchain.dev/auto-evaluator-opportunities/), an open-source tool for grading LLM question-answer chains. Here, we extend auto-evaluator with a [lightweight Streamlit app](https://github.com/langchain-ai/auto-evaluator/tree/main/streamlit) that can connect to any existing Pinecone index. We add the ability to test metadata filtering using `SelfQueryRetriever` as well as some other approaches that we’ve found to be useful, as discussed below. 
\\n[ret_trim.mov](Auto-Evaluation%20of%20Metadata%20Filtering%2018502448c85240828f33716740f9574b/ret_trim.mov)',\n", - " 'metadata': {'Section': 'Evaluation'}}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Let's create groups based on the section headers in our page\n", "from langchain.text_splitter import MarkdownHeaderTextSplitter\n", @@ -77,8 +56,7 @@ " (\"###\", \"Section\"),\n", "]\n", "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", - "md_header_splits = markdown_splitter.split_text(md_file)\n", - "md_header_splits[3]" + "md_header_splits = markdown_splitter.split_text(md_file)" ] }, { @@ -86,7 +64,7 @@ "id": "4f73a609", "metadata": {}, "source": [ - "Now, we split the text in each header group and keep the group as metadata." + "Now, perform text splitting on the header grouped documents. " ] }, { @@ -101,57 +79,7 @@ "chunk_size = 500\n", "chunk_overlap = 0\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", - " \n", - "# Create splits within each header group and combine them\n", - "all_splits=[]\n", - "all_metadatas=[]\n", - "for header_group in md_header_splits:\n", - " _splits = text_splitter.split_text(header_group['content'])\n", - " _metadatas = [header_group['metadata'] for _ in _splits]\n", - " all_splits += _splits\n", - " all_metadatas += _metadatas" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "7424f78b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'In these cases, semantic search will look for the concept `episode 53` in the chunks, but instead we simply want to filter the chunks for `episode 53` and then perform semantic search to extract those that best summarize the episode. 
Metadata filtering does this, so long as we 1) we have a metadata filter for episode number and 2) we can extract the value from the query (e.g., `54` or `252`) that we want to extract. The LangChain `SelfQueryRetriever` does the latter (see'" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "all_splits[6]" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "08f5db3a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Section': 'Motivation'}" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "all_metadatas[6]" + "all_splits = text_splitter.split_documents(md_header_splits)" ] }, { @@ -183,7 +111,7 @@ "source": [ "# Build vectorstore and keep the metadata\n", "from langchain.vectorstores import Chroma\n", - "vectorstore = Chroma.from_texts(texts=all_splits,metadatas=all_metadatas,embedding=OpenAIEmbeddings())" + "vectorstore = Chroma.from_documents(documents=all_splits,embedding=OpenAIEmbeddings())" ] }, { diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index 8ffc2565..25fa4bd4 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -288,7 +288,7 @@ class MarkdownHeaderTextSplitter: headers_to_split_on, key=lambda split: len(split[0]), reverse=True ) - def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[LineType]: + def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]: """Combine lines with common metadata into chunks Args: lines: Line of text / associated header metadata @@ -307,9 +307,13 @@ class MarkdownHeaderTextSplitter: else: # Otherwise, append the current line to the aggregated list aggregated_chunks.append(line) - return aggregated_chunks - def split_text(self, text: str) -> List[LineType]: + return [ + Document(page_content=chunk["content"], metadata=chunk["metadata"]) + for chunk in 
aggregated_chunks + ] + + def split_text(self, text: str) -> List[Document]: """Split markdown file Args: text: Markdown file""" @@ -401,7 +405,10 @@ class MarkdownHeaderTextSplitter: if not self.return_each_line: return self.aggregate_lines_to_chunks(lines_with_metadata) else: - return lines_with_metadata + return [ + Document(page_content=chunk["content"], metadata=chunk["metadata"]) + for chunk in lines_with_metadata + ] # should be in newer Python versions (3.10+) diff --git a/tests/unit_tests/test_text_splitter.py b/tests/unit_tests/test_text_splitter.py index 8614a8ef..75312c2c 100644 --- a/tests/unit_tests/test_text_splitter.py +++ b/tests/unit_tests/test_text_splitter.py @@ -694,14 +694,14 @@ def test_md_header_text_splitter_1() -> None: ) output = markdown_splitter.split_text(markdown_document) expected_output = [ - { - "content": "Hi this is Jim \nHi this is Joe", - "metadata": {"Header 1": "Foo", "Header 2": "Bar"}, - }, - { - "content": "Hi this is Molly", - "metadata": {"Header 1": "Foo", "Header 2": "Baz"}, - }, + Document( + page_content="Hi this is Jim \nHi this is Joe", + metadata={"Header 1": "Foo", "Header 2": "Bar"}, + ), + Document( + page_content="Hi this is Molly", + metadata={"Header 1": "Foo", "Header 2": "Baz"}, + ), ] assert output == expected_output @@ -729,18 +729,18 @@ def test_md_header_text_splitter_2() -> None: ) output = markdown_splitter.split_text(markdown_document) expected_output = [ - { - "content": "Hi this is Jim \nHi this is Joe", - "metadata": {"Header 1": "Foo", "Header 2": "Bar"}, - }, - { - "content": "Hi this is Lance", - "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"}, - }, - { - "content": "Hi this is Molly", - "metadata": {"Header 1": "Foo", "Header 2": "Baz"}, - }, + Document( + page_content="Hi this is Jim \nHi this is Joe", + metadata={"Header 1": "Foo", "Header 2": "Bar"}, + ), + Document( + page_content="Hi this is Lance", + metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": 
"Boo"}, + ), + Document( + page_content="Hi this is Molly", + metadata={"Header 1": "Foo", "Header 2": "Baz"}, + ), ] assert output == expected_output @@ -774,27 +774,27 @@ def test_md_header_text_splitter_3() -> None: output = markdown_splitter.split_text(markdown_document) expected_output = [ - { - "content": "Hi this is Jim \nHi this is Joe", - "metadata": {"Header 1": "Foo", "Header 2": "Bar"}, - }, - { - "content": "Hi this is Lance", - "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"}, - }, - { - "content": "Hi this is John", - "metadata": { + Document( + page_content="Hi this is Jim \nHi this is Joe", + metadata={"Header 1": "Foo", "Header 2": "Bar"}, + ), + Document( + page_content="Hi this is Lance", + metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"}, + ), + Document( + page_content="Hi this is John", + metadata={ "Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo", "Header 4": "Bim", }, - }, - { - "content": "Hi this is Molly", - "metadata": {"Header 1": "Foo", "Header 2": "Baz"}, - }, + ), + Document( + page_content="Hi this is Molly", + metadata={"Header 1": "Foo", "Header 2": "Baz"}, + ), ] assert output == expected_output