Harrison/obsidian (#920)

1 year ago · 637c0d6508
parent 1e56879d38
commit 637c0d6508
15 changed files with 384 additions and 4 deletions
--- a/docs/modules/document_loaders/examples/email.ipynb
+++ b/docs/modules/document_loaders/examples/email.ipynb
@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "9fdbd55d",
+   "metadata": {},
+   "source": [
+    "# Email\n",
+    "\n",
+    "This notebook shows how to load email (`.eml`) files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "40cd9806",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredEmailLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2d20b852",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredEmailLoader('example_data/fake-email.eml')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "579fa702",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "90c1d899",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='This is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ef9a5f4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/document_loaders/examples/example_data/fake-content.html
+++ b/docs/modules/document_loaders/examples/example_data/fake-content.html
@ -0,0 +1,9 @@
+<!DOCTYPE html>
+<html>
+<body>
+
+<h1>My First Heading</h1>
+<p>My first paragraph.</p>
+
+</body>
+</html>
--- a/docs/modules/document_loaders/examples/example_data/fake-email.eml
+++ b/docs/modules/document_loaders/examples/example_data/fake-email.eml
@ -0,0 +1,20 @@
+MIME-Version: 1.0
+Date: Fri, 16 Dec 2022 17:04:16 -0500
+Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
+Subject: Test Email
+From: Matthew Robinson <mrobinson@unstructured.io>
+To: Matthew Robinson <mrobinson@unstructured.io>
+Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
+
+--00000000000095c9b205eff92630
+Content-Type: text/plain; charset="UTF-8"
+This is a test email to use for unit tests.
+Important points:
+   - Roses are red
+   - Violets are blue
+--00000000000095c9b205eff92630
+Content-Type: text/html; charset="UTF-8"
+
+<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
+
+--00000000000095c9b205eff92630--
--- a/docs/modules/document_loaders/examples/example_data/fake-power-point.pptx
+++ b/docs/modules/document_loaders/examples/example_data/fake-power-point.pptx
--- a/docs/modules/document_loaders/examples/example_data/fake.docx
+++ b/docs/modules/document_loaders/examples/example_data/fake.docx
--- a/docs/modules/document_loaders/examples/example_data/layout-parser-paper.pdf
+++ b/docs/modules/document_loaders/examples/example_data/layout-parser-paper.pdf
--- a/docs/modules/document_loaders/examples/microsoft_word.ipynb
+++ b/docs/modules/document_loaders/examples/microsoft_word.ipynb
@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "34c90eed",
+   "metadata": {},
+   "source": [
+    "# Microsoft Word\n",
+    "\n",
+    "This notebook shows how to load text from Microsoft Word documents."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "28ded768",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredDocxLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f1f26035",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredDocxLoader('example_data/fake.docx')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "2c87dde9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "0e4a884c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "61953c83",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/document_loaders/examples/obsidian.ipynb
+++ b/docs/modules/document_loaders/examples/obsidian.ipynb
@ -0,0 +1,66 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1dc7df1d",
+   "metadata": {},
+   "source": [
+    "# Obsidian\n",
+    "This notebook covers how to load documents from an Obsidian database.\n",
+    "\n",
+    "Since Obsidian is just stored on disk as a folder of Markdown files, the loader just takes a path to this directory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "007c5cbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import ObsidianLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1caec59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = ObsidianLoader(\"<path-to-obsidian>\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1c30ff7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/document_loaders/how_to_guides.rst
+++ b/docs/modules/document_loaders/how_to_guides.rst
@ -17,6 +17,14 @@ There are a lot of different document loaders that LangChain supports. Below are

 `PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a powerpoint file.

+`Email <./examples/email.html>`_: A walkthrough of how to load data from an email (``.eml``) file.
+
+`GoogleDrive <./examples/googledrive.html>`_: A walkthrough of how to load data from Google Drive.
+
+`Microsoft Word <./examples/microsoft_word.html>`_: A walkthrough of how to load data from Microsoft Word files.
+
+`Obsidian <./examples/obsidian.html>`_: A walkthrough of how to load data from an Obsidian file dump.
+
 .. toctree::
   :maxdepth: 1
   :glob:
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -1,9 +1,12 @@
 """All different types of document loaders."""

 from langchain.document_loaders.directory import DirectoryLoader
+from langchain.document_loaders.docx import UnstructuredDocxLoader
+from langchain.document_loaders.email import UnstructuredEmailLoader
 from langchain.document_loaders.googledrive import GoogleDriveLoader
 from langchain.document_loaders.html import UnstructuredHTMLLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
+from langchain.document_loaders.obsidian import ObsidianLoader
 from langchain.document_loaders.pdf import UnstructuredPDFLoader
 from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
 from langchain.document_loaders.readthedocs import ReadTheDocsLoader
@ -18,4 +21,7 @@ __all__ = [
    "UnstructuredHTMLLoader",
    "UnstructuredPowerPointLoader",
    "UnstructuredPDFLoader",
+    "ObsidianLoader",
+    "UnstructuredDocxLoader",
+    "UnstructuredEmailLoader",
 ]
--- a/langchain/document_loaders/docx.py
+++ b/langchain/document_loaders/docx.py
@ -0,0 +1,29 @@
+"""Loader that loads Microsoft Word files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class UnstructuredDocxLoader(BaseLoader):
+    """Read a Microsoft Word file into one document using unstructured."""
+
+    def __init__(self, file_path: str):
+        """Remember the file path once unstructured is known to be importable.
+
+        Raises:
+            ValueError: If the ``unstructured`` package cannot be imported.
+        """
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            message = (
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+            raise ValueError(message)
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Partition the file and return its text as a single Document."""
+        from unstructured.partition.docx import partition_docx
+
+        pieces = partition_docx(filename=self.file_path)
+        # Stringify every partitioned element and separate them by a blank line.
+        page_content = "\n\n".join(str(piece) for piece in pieces)
+        document = Document(
+            page_content=page_content,
+            metadata={"source": self.file_path},
+        )
+        return [document]
--- a/langchain/document_loaders/email.py
+++ b/langchain/document_loaders/email.py
@ -0,0 +1,29 @@
+"""Loader that loads email files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class UnstructuredEmailLoader(BaseLoader):
+    """Read an email file into one document using unstructured."""
+
+    def __init__(self, file_path: str):
+        """Remember the file path once unstructured is known to be importable.
+
+        Raises:
+            ValueError: If the ``unstructured`` package cannot be imported.
+        """
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            message = (
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+            raise ValueError(message)
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Partition the email and return its text as a single Document."""
+        from unstructured.partition.email import partition_email
+
+        pieces = partition_email(filename=self.file_path)
+        # Stringify every partitioned element and separate them by a blank line.
+        page_content = "\n\n".join(str(piece) for piece in pieces)
+        document = Document(
+            page_content=page_content,
+            metadata={"source": self.file_path},
+        )
+        return [document]
--- a/langchain/document_loaders/obsidian.py
+++ b/langchain/document_loaders/obsidian.py
@ -0,0 +1,25 @@
+"""Loader that loads Obsidian files from disk."""
+from pathlib import Path
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class ObsidianLoader(BaseLoader):
+    """Loader that loads Obsidian files from disk.
+
+    Every ``*.md`` file found recursively under the given directory becomes
+    one Document whose ``source`` metadata is that file's path.
+    """
+
+    def __init__(self, path: str, encoding: str = "UTF-8"):
+        """Initialize with path.
+
+        Args:
+            path: Directory that is searched recursively for ``*.md`` files.
+            encoding: Text encoding used to read each file. Defaults to UTF-8
+                instead of the platform locale so that results do not vary
+                from one machine to another.
+        """
+        self.file_path = path
+        self.encoding = encoding
+
+    def load(self) -> List[Document]:
+        """Load documents.
+
+        Returns:
+            One Document per Markdown file, holding the file's full text as
+            ``page_content`` and ``{"source": <path>}`` as metadata.
+        """
+        docs = []
+        for p in Path(self.file_path).glob("**/*.md"):
+            # An explicit encoding avoids locale-dependent decoding errors.
+            with open(p, encoding=self.encoding) as f:
+                text = f.read()
+            metadata = {"source": str(p)}
+            docs.append(Document(page_content=text, metadata=metadata))
+        return docs
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@ -1,4 +1,4 @@
-"""Loader that loads PowerPoint files."""
+"""Loader that loads PDF files."""
 from typing import List

 from langchain.docstore.document import Document
@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader


 class UnstructuredPDFLoader(BaseLoader):
-    """Loader that uses unstructured to load PowerPoint files."""
+    """Loader that uses unstructured to load PDF files."""

    def __init__(self, file_path: str):
        """Initialize with file path."""
--- a/langchain/document_loaders/powerpoint.py
+++ b/langchain/document_loaders/powerpoint.py
@ -1,4 +1,4 @@
-"""Loader that loads PDF files."""
+"""Loader that loads PowerPoint files."""
 from typing import List

 from langchain.docstore.document import Document
@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader


 class UnstructuredPowerPointLoader(BaseLoader):
-    """Loader that uses unstructured to load PDF files."""
+    """Loader that uses unstructured to load PowerPoint files."""

    def __init__(self, file_path: str):
        """Initialize with file path."""