From 7018806a9266a69a953dcab7d222b60639de4492 Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructuredai.io>
Date: Thu, 9 Mar 2023 13:55:07 -0500
Subject: [PATCH] feat: document loader for markdown files (#1558)

### Summary

Adds a document loader for handling markdown files. This document loader
requires `unstructured>=0.4.16`.

### Testing

```python
from langchain.document_loaders import UnstructuredMarkdownLoader

loader = UnstructuredMarkdownLoader("README.md")
loader.load()
```
---
 .../document_loaders/examples/markdown.ipynb  | 145 ++++++++++++++++++
 langchain/document_loaders/__init__.py        |   2 +
 langchain/document_loaders/markdown.py        |  25 +++
 3 files changed, 172 insertions(+)
 create mode 100644 docs/modules/document_loaders/examples/markdown.ipynb
 create mode 100644 langchain/document_loaders/markdown.py

diff --git a/docs/modules/document_loaders/examples/markdown.ipynb b/docs/modules/document_loaders/examples/markdown.ipynb
new file mode 100644
index 0000000000..99156fe7b9
--- /dev/null
+++ b/docs/modules/document_loaders/examples/markdown.ipynb
@@ -0,0 +1,145 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "39af9ecd",
+   "metadata": {},
+   "source": [
+    "# Markdown\n",
+    "\n",
+    "This covers how to load markdown documents into a document format that we can use downstream."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "721c48aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredMarkdownLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "9d3d0e35",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredMarkdownLoader(\"../../../../README.md\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "06073f91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c9adc5cb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content=\"ð\\x9f¦\\x9cï¸\\x8fð\\x9f”\\x97 LangChain\\n\\nâ\\x9a¡ Building applications with LLMs through composability â\\x9a¡\\n\\nProduction Support: As you move your LangChains into production, we'd love to offer more comprehensive support.\\nPlease fill out this form and we'll set up a dedicated support Slack channel.\\n\\nQuick Install\\n\\npip install langchain\\n\\nð\\x9f¤” What is this?\\n\\nLarge language models (LLMs) are emerging as a transformative technology, enabling\\ndevelopers to build applications that they previously could not.\\nBut using these LLMs in isolation is often not enough to\\ncreate a truly powerful app - the real power comes when you can combine them with other sources of computation or knowledge.\\n\\nThis library is aimed at assisting in the development of those types of applications. Common examples of these types of applications include:\\n\\nâ\\x9d“ Question Answering over specific documents\\n\\nDocumentation\\n\\nEnd-to-end Example: Question Answering over Notion Database\\n\\nð\\x9f’¬ Chatbots\\n\\nDocumentation\\n\\nEnd-to-end Example: Chat-LangChain\\n\\nð\\x9f¤\\x96 Agents\\n\\nDocumentation\\n\\nEnd-to-end Example: GPT+WolframAlpha\\n\\nð\\x9f“\\x96 Documentation\\n\\nPlease see here for full documentation on:\\n\\nGetting started (installation, setting up the environment, simple examples)\\n\\nHow-To examples (demos, integrations, helper functions)\\n\\nReference (full API docs)\\n  Resources (high-level explanation of core concepts)\\n\\nð\\x9f\\x9a\\x80 What can this help with?\\n\\nThere are six main areas that LangChain is designed to help with.\\nThese are, in increasing order of complexity:\\n\\nð\\x9f“\\x83 LLMs and Prompts:\\n\\nThis includes prompt management, prompt optimization, generic interface for all LLMs, and common utilities for working with LLMs.\\n\\nð\\x9f”\\x97 Chains:\\n\\nChains go beyond just a single LLM call, and are sequences of calls (whether to an LLM or a different utility). LangChain provides a standard interface for chains, lots of integrations with other tools, and end-to-end chains for common applications.\\n\\nð\\x9f“\\x9a Data Augmented Generation:\\n\\nData Augmented Generation involves specific types of chains that first interact with an external datasource to fetch data to use in the generation step. Examples of this include summarization of long pieces of text and question/answering over specific data sources.\\n\\nð\\x9f¤\\x96 Agents:\\n\\nAgents involve an LLM making decisions about which Actions to take, taking that Action, seeing an Observation, and repeating that until done. LangChain provides a standard interface for agents, a selection of agents to choose from, and examples of end to end agents.\\n\\nð\\x9f§\\xa0 Memory:\\n\\nMemory is the concept of persisting state between calls of a chain/agent. LangChain provides a standard interface for memory, a collection of memory implementations, and examples of chains/agents that use memory.\\n\\nð\\x9f§\\x90 Evaluation:\\n\\n[BETA] Generative models are notoriously hard to evaluate with traditional metrics. One new way of evaluating them is using language models themselves to do the evaluation. LangChain provides some prompts/chains for assisting in this.\\n\\nFor more information on these concepts, please see our full documentation.\\n\\nð\\x9f’\\x81 Contributing\\n\\nAs an open source project in a rapidly developing field, we are extremely open to contributions, whether it be in the form of a new feature, improved infra, or better documentation.\\n\\nFor detailed information on how to contribute, see here.\", lookup_str='', metadata={'source': '../../../../README.md'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "525d6b67",
+   "metadata": {},
+   "source": [
+    "## Retain Elements\n",
+    "\n",
+    "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "064f9162",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredMarkdownLoader(\"../../../../README.md\", mode=\"elements\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "abefbbdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "a547c534",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='ð\\x9f¦\\x9cï¸\\x8fð\\x9f”\\x97 LangChain', lookup_str='', metadata={'source': '../../../../README.md', 'page_number': 1, 'category': 'UncategorizedText'}, lookup_index=0)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "381d4139",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index d28ed937aa..e6d6cc9b9b 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -19,6 +19,7 @@ from langchain.document_loaders.html import UnstructuredHTMLLoader
 from langchain.document_loaders.ifixit import IFixitLoader
 from langchain.document_loaders.image import UnstructuredImageLoader
 from langchain.document_loaders.imsdb import IMSDbLoader
+from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
 from langchain.document_loaders.notebook import NotebookLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
 from langchain.document_loaders.obsidian import ObsidianLoader
@@ -66,6 +67,7 @@ __all__ = [
     "ObsidianLoader",
     "UnstructuredDocxLoader",
     "UnstructuredEmailLoader",
+    "UnstructuredMarkdownLoader",
     "RoamLoader",
     "YoutubeLoader",
     "S3FileLoader",
diff --git a/langchain/document_loaders/markdown.py b/langchain/document_loaders/markdown.py
new file mode 100644
index 0000000000..c049e8ff6d
--- /dev/null
+++ b/langchain/document_loaders/markdown.py
@@ -0,0 +1,25 @@
+"""Loader that loads Markdown files."""
+from typing import List
+
+from langchain.document_loaders.unstructured import UnstructuredFileLoader
+
+
+class UnstructuredMarkdownLoader(UnstructuredFileLoader):
+    """Loader that uses unstructured to load markdown files."""
+
+    def _get_elements(self) -> List:
+        from unstructured.__version__ import __version__ as __unstructured_version__
+        from unstructured.partition.md import partition_md
+
+        # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
+        # versions of unstructured like 0.4.17-dev1
+        _unstructured_version = __unstructured_version__.split("-")[0]
+        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
+
+        if unstructured_version < (0, 4, 16):
+            raise ValueError(
+                f"You are on unstructured version {__unstructured_version__}. "
+                "Partitioning markdown files is only supported in unstructured>=0.4.16."
+            )
+
+        return partition_md(filename=self.file_path)