diff --git a/docs/modules/document_loaders/examples/email.ipynb b/docs/modules/document_loaders/examples/email.ipynb
new file mode 100644
index 0000000000..ef04f4bb1b
--- /dev/null
+++ b/docs/modules/document_loaders/examples/email.ipynb
@@ -0,0 +1,94 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "9fdbd55d",
+ "metadata": {},
+ "source": [
+ "# Email\n",
+ "\n",
+ "This notebook shows how to load email (`.eml`) files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "40cd9806",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders import UnstructuredEmailLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "2d20b852",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = UnstructuredEmailLoader('example_data/fake-email.eml')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "579fa702",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = loader.load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "90c1d899",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Document(page_content='This is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4ef9a5f4",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/modules/document_loaders/examples/example_data/fake-content.html b/docs/modules/document_loaders/examples/example_data/fake-content.html
new file mode 100644
index 0000000000..9ad19d308e
--- /dev/null
+++ b/docs/modules/document_loaders/examples/example_data/fake-content.html
@@ -0,0 +1,9 @@
+
+
+
+
+My First Heading
+My first paragraph.
+
+
+
diff --git a/docs/modules/document_loaders/examples/example_data/fake-email.eml b/docs/modules/document_loaders/examples/example_data/fake-email.eml
new file mode 100644
index 0000000000..9615367e6e
--- /dev/null
+++ b/docs/modules/document_loaders/examples/example_data/fake-email.eml
@@ -0,0 +1,20 @@
+MIME-Version: 1.0
+Date: Fri, 16 Dec 2022 17:04:16 -0500
+Message-ID:
+Subject: Test Email
+From: Matthew Robinson
+To: Matthew Robinson
+Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"
+
+--00000000000095c9b205eff92630
+Content-Type: text/plain; charset="UTF-8"
+This is a test email to use for unit tests.
+Important points:
+ - Roses are red
+ - Violets are blue
+--00000000000095c9b205eff92630
+Content-Type: text/html; charset="UTF-8"
+
+This is a test email to use for unit tests.
Important points:
- Roses are red
- Violets are blue
+
+--00000000000095c9b205eff92630--
diff --git a/docs/modules/document_loaders/examples/example_data/fake-power-point.pptx b/docs/modules/document_loaders/examples/example_data/fake-power-point.pptx
new file mode 100644
index 0000000000..01d8449489
Binary files /dev/null and b/docs/modules/document_loaders/examples/example_data/fake-power-point.pptx differ
diff --git a/docs/modules/document_loaders/examples/example_data/fake.docx b/docs/modules/document_loaders/examples/example_data/fake.docx
new file mode 100644
index 0000000000..566aa64571
Binary files /dev/null and b/docs/modules/document_loaders/examples/example_data/fake.docx differ
diff --git a/docs/modules/document_loaders/examples/example_data/layout-parser-paper.pdf b/docs/modules/document_loaders/examples/example_data/layout-parser-paper.pdf
new file mode 100644
index 0000000000..c4b6c2ef88
Binary files /dev/null and b/docs/modules/document_loaders/examples/example_data/layout-parser-paper.pdf differ
diff --git a/docs/modules/document_loaders/examples/microsoft_word.ipynb b/docs/modules/document_loaders/examples/microsoft_word.ipynb
new file mode 100644
index 0000000000..ae1c35ab2b
--- /dev/null
+++ b/docs/modules/document_loaders/examples/microsoft_word.ipynb
@@ -0,0 +1,94 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "34c90eed",
+ "metadata": {},
+ "source": [
+ "# Microsoft Word\n",
+ "\n",
+ "This notebook shows how to load text from Microsoft Word documents."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "28ded768",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders import UnstructuredDocxLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "f1f26035",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = UnstructuredDocxLoader('example_data/fake.docx')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "2c87dde9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = loader.load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "0e4a884c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "61953c83",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/modules/document_loaders/examples/obsidian.ipynb b/docs/modules/document_loaders/examples/obsidian.ipynb
new file mode 100644
index 0000000000..e92b9c2b96
--- /dev/null
+++ b/docs/modules/document_loaders/examples/obsidian.ipynb
@@ -0,0 +1,66 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "1dc7df1d",
+ "metadata": {},
+ "source": [
+ "# Obsidian\n",
+ "This notebook covers how to load documents from an Obsidian database.\n",
+ "\n",
+ "Since Obsidian is just stored on disk as a folder of Markdown files, the loader just takes a path to this directory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "007c5cbf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders import ObsidianLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1caec59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loader = ObsidianLoader(\"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b1c30ff7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs = loader.load()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst
index daa8a352ba..7d64326ddd 100644
--- a/docs/modules/document_loaders/how_to_guides.rst
+++ b/docs/modules/document_loaders/how_to_guides.rst
@@ -17,6 +17,14 @@ There are a lot of different document loaders that LangChain supports. Below are
`PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a powerpoint file.
+`Email <./examples/email.html>`_: A walkthrough of how to load data from an email (``.eml``) file.
+
+`GoogleDrive <./examples/googledrive.html>`_: A walkthrough of how to load data from Google Drive.
+
+`Microsoft Word <./examples/microsoft_word.html>`_: A walkthrough of how to load data from Microsoft Word files.
+
+`Obsidian <./examples/obsidian.html>`_: A walkthrough of how to load data from an Obsidian file dump.
+
.. toctree::
:maxdepth: 1
:glob:
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index 328fcdb9a7..0b5b3b6c78 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -1,9 +1,12 @@
"""All different types of document loaders."""
from langchain.document_loaders.directory import DirectoryLoader
+from langchain.document_loaders.docx import UnstructuredDocxLoader
+from langchain.document_loaders.email import UnstructuredEmailLoader
from langchain.document_loaders.googledrive import GoogleDriveLoader
from langchain.document_loaders.html import UnstructuredHTMLLoader
from langchain.document_loaders.notion import NotionDirectoryLoader
+from langchain.document_loaders.obsidian import ObsidianLoader
from langchain.document_loaders.pdf import UnstructuredPDFLoader
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
@@ -18,4 +21,7 @@ __all__ = [
"UnstructuredHTMLLoader",
"UnstructuredPowerPointLoader",
"UnstructuredPDFLoader",
+ "ObsidianLoader",
+ "UnstructuredDocxLoader",
+ "UnstructuredEmailLoader",
]
diff --git a/langchain/document_loaders/docx.py b/langchain/document_loaders/docx.py
new file mode 100644
index 0000000000..0b595ece5a
--- /dev/null
+++ b/langchain/document_loaders/docx.py
@@ -0,0 +1,49 @@
+"""Loader that loads Microsoft Word files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class UnstructuredDocxLoader(BaseLoader):
+    """Loader that reads a Microsoft Word file through the unstructured package.
+
+    The whole file comes back as one Document whose text is the string form
+    of each partitioned element, joined by blank lines.
+    """
+
+    def __init__(self, file_path: str):
+        """Remember the file to load and check that unstructured is installed.
+
+        Args:
+            file_path: Path of the Word file passed to ``partition_docx``.
+
+        Raises:
+            ValueError: If the ``unstructured`` package cannot be imported.
+        """
+        self.file_path = file_path
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+
+    def load(self) -> List[Document]:
+        """Partition the file and wrap its text in a one-element list.
+
+        Returns:
+            A list holding a single Document; its metadata stores the file
+            path under the ``source`` key.
+        """
+        # Imported lazily so this module can be imported without unstructured.
+        from unstructured.partition.docx import partition_docx
+
+        parts = partition_docx(filename=self.file_path)
+        return [
+            Document(
+                page_content="\n\n".join(map(str, parts)),
+                metadata={"source": self.file_path},
+            )
+        ]
diff --git a/langchain/document_loaders/email.py b/langchain/document_loaders/email.py
new file mode 100644
index 0000000000..ec22601fb3
--- /dev/null
+++ b/langchain/document_loaders/email.py
@@ -0,0 +1,49 @@
+"""Loader that loads email files."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class UnstructuredEmailLoader(BaseLoader):
+    """Loader that reads an email file through the unstructured package.
+
+    The whole file comes back as one Document whose text is the string form
+    of each partitioned element, joined by blank lines.
+    """
+
+    def __init__(self, file_path: str):
+        """Remember the file to load and check that unstructured is installed.
+
+        Args:
+            file_path: Path of the email file passed to ``partition_email``.
+
+        Raises:
+            ValueError: If the ``unstructured`` package cannot be imported.
+        """
+        self.file_path = file_path
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+
+    def load(self) -> List[Document]:
+        """Partition the file and wrap its text in a one-element list.
+
+        Returns:
+            A list holding a single Document; its metadata stores the file
+            path under the ``source`` key.
+        """
+        # Imported lazily so this module can be imported without unstructured.
+        from unstructured.partition.email import partition_email
+
+        parts = partition_email(filename=self.file_path)
+        return [
+            Document(
+                page_content="\n\n".join(map(str, parts)),
+                metadata={"source": self.file_path},
+            )
+        ]
diff --git a/langchain/document_loaders/obsidian.py b/langchain/document_loaders/obsidian.py
new file mode 100644
index 0000000000..13403b2fe4
--- /dev/null
+++ b/langchain/document_loaders/obsidian.py
@@ -0,0 +1,41 @@
+"""Loader that loads an Obsidian directory dump."""
+from pathlib import Path
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class ObsidianLoader(BaseLoader):
+    """Loader that loads Obsidian files from disk.
+
+    Every Markdown (``.md``) file found under the given directory, at any
+    depth, becomes one Document.
+    """
+
+    def __init__(self, path: str, encoding: str = "UTF-8"):
+        """Initialize with the directory path.
+
+        Args:
+            path: Directory that is searched recursively for ``.md`` files.
+            encoding: Text encoding used to read each file. Defaults to UTF-8
+                instead of the platform's locale encoding, so the same files
+                load the same way on every system.
+        """
+        self.file_path = path
+        self.encoding = encoding
+
+    def load(self) -> List[Document]:
+        """Load documents.
+
+        Returns:
+            One Document per Markdown file, holding the file's full text with
+            the file's path stored under the ``source`` metadata key.
+        """
+        return [
+            Document(
+                page_content=md_file.read_text(encoding=self.encoding),
+                metadata={"source": str(md_file)},
+            )
+            for md_file in Path(self.file_path).glob("**/*.md")
+        ]
diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py
index 629e0e0bf1..687e010f09 100644
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@@ -1,4 +1,4 @@
-"""Loader that loads PowerPoint files."""
+"""Loader that loads PDF files."""
from typing import List
from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
class UnstructuredPDFLoader(BaseLoader):
- """Loader that uses unstructured to load PowerPoint files."""
+ """Loader that uses unstructured to load PDF files."""
def __init__(self, file_path: str):
"""Initialize with file path."""
diff --git a/langchain/document_loaders/powerpoint.py b/langchain/document_loaders/powerpoint.py
index d75f7c600b..40bb9825aa 100644
--- a/langchain/document_loaders/powerpoint.py
+++ b/langchain/document_loaders/powerpoint.py
@@ -1,4 +1,4 @@
-"""Loader that loads PDF files."""
+"""Loader that loads PowerPoint files."""
from typing import List
from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
class UnstructuredPowerPointLoader(BaseLoader):
- """Loader that uses unstructured to load PDF files."""
+ """Loader that uses unstructured to load PowerPoint files."""
def __init__(self, file_path: str):
"""Initialize with file path."""