diff --git a/docs/modules/document_loaders/examples/email.ipynb b/docs/modules/document_loaders/examples/email.ipynb new file mode 100644 index 0000000000..ef04f4bb1b --- /dev/null +++ b/docs/modules/document_loaders/examples/email.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9fdbd55d", + "metadata": {}, + "source": [ + "# Email\n", + "\n", + "This notebook shows how to load email (`.eml`) files." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "40cd9806", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredEmailLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2d20b852", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredEmailLoader('example_data/fake-email.eml')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "579fa702", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "90c1d899", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='This is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ef9a5f4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + 
"nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/example_data/fake-content.html b/docs/modules/document_loaders/examples/example_data/fake-content.html new file mode 100644 index 0000000000..9ad19d308e --- /dev/null +++ b/docs/modules/document_loaders/examples/example_data/fake-content.html @@ -0,0 +1,9 @@ + + + + +

My First Heading

+

My first paragraph.

+ + + diff --git a/docs/modules/document_loaders/examples/example_data/fake-email.eml b/docs/modules/document_loaders/examples/example_data/fake-email.eml new file mode 100644 index 0000000000..9615367e6e --- /dev/null +++ b/docs/modules/document_loaders/examples/example_data/fake-email.eml @@ -0,0 +1,20 @@ +MIME-Version: 1.0 +Date: Fri, 16 Dec 2022 17:04:16 -0500 +Message-ID: +Subject: Test Email +From: Matthew Robinson +To: Matthew Robinson +Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" + +--00000000000095c9b205eff92630 +Content-Type: text/plain; charset="UTF-8" +This is a test email to use for unit tests. +Important points: + - Roses are red + - Violets are blue +--00000000000095c9b205eff92630 +Content-Type: text/html; charset="UTF-8" + +
This is a test email to use for unit tests.

Important points:
  • Roses are red
  • Violets are blue
+ +--00000000000095c9b205eff92630-- diff --git a/docs/modules/document_loaders/examples/example_data/fake-power-point.pptx b/docs/modules/document_loaders/examples/example_data/fake-power-point.pptx new file mode 100644 index 0000000000..01d8449489 Binary files /dev/null and b/docs/modules/document_loaders/examples/example_data/fake-power-point.pptx differ diff --git a/docs/modules/document_loaders/examples/example_data/fake.docx b/docs/modules/document_loaders/examples/example_data/fake.docx new file mode 100644 index 0000000000..566aa64571 Binary files /dev/null and b/docs/modules/document_loaders/examples/example_data/fake.docx differ diff --git a/docs/modules/document_loaders/examples/example_data/layout-parser-paper.pdf b/docs/modules/document_loaders/examples/example_data/layout-parser-paper.pdf new file mode 100644 index 0000000000..c4b6c2ef88 Binary files /dev/null and b/docs/modules/document_loaders/examples/example_data/layout-parser-paper.pdf differ diff --git a/docs/modules/document_loaders/examples/microsoft_word.ipynb b/docs/modules/document_loaders/examples/microsoft_word.ipynb new file mode 100644 index 0000000000..ae1c35ab2b --- /dev/null +++ b/docs/modules/document_loaders/examples/microsoft_word.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "34c90eed", + "metadata": {}, + "source": [ + "# Microsoft Word\n", + "\n", + "This notebook shows how to load text from Microsoft Word documents." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "28ded768", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredDocxLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f1f26035", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredDocxLoader('example_data/fake.docx')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2c87dde9", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0e4a884c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61953c83", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/examples/obsidian.ipynb b/docs/modules/document_loaders/examples/obsidian.ipynb new file mode 100644 index 0000000000..e92b9c2b96 --- /dev/null +++ b/docs/modules/document_loaders/examples/obsidian.ipynb @@ -0,0 +1,66 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1dc7df1d", + "metadata": {}, + "source": [ + "# Obsidian\n", + "This notebook covers how to load documents from an Obsidian database.\n", + "\n", + "Since Obsidian is 
just stored on disk as a folder of Markdown files, the loader just takes a path to this directory." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "007c5cbf", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import ObsidianLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1caec59", + "metadata": {}, + "outputs": [], + "source": [ + "loader = ObsidianLoader(\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1c30ff7", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst index daa8a352ba..7d64326ddd 100644 --- a/docs/modules/document_loaders/how_to_guides.rst +++ b/docs/modules/document_loaders/how_to_guides.rst @@ -17,6 +17,14 @@ There are a lot of different document loaders that LangChain supports. Below are `PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a powerpoint file. +`Email <./examples/email.html>`_: A walkthrough of how to load data from an email (``.eml``) file. + +`GoogleDrive <./examples/googledrive.html>`_: A walkthrough of how to load data from Google Drive. + +`Microsoft Word <./examples/microsoft_word.html>`_: A walkthrough of how to load data from Microsoft Word files. + +`Obsidian <./examples/obsidian.html>`_: A walkthrough of how to load data from an Obsidian file dump. + .. 
toctree:: :maxdepth: 1 :glob: diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 328fcdb9a7..0b5b3b6c78 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -1,9 +1,12 @@ """All different types of document loaders.""" from langchain.document_loaders.directory import DirectoryLoader +from langchain.document_loaders.docx import UnstructuredDocxLoader +from langchain.document_loaders.email import UnstructuredEmailLoader from langchain.document_loaders.googledrive import GoogleDriveLoader from langchain.document_loaders.html import UnstructuredHTMLLoader from langchain.document_loaders.notion import NotionDirectoryLoader +from langchain.document_loaders.obsidian import ObsidianLoader from langchain.document_loaders.pdf import UnstructuredPDFLoader from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader from langchain.document_loaders.readthedocs import ReadTheDocsLoader @@ -18,4 +21,7 @@ __all__ = [ "UnstructuredHTMLLoader", "UnstructuredPowerPointLoader", "UnstructuredPDFLoader", + "ObsidianLoader", + "UnstructuredDocxLoader", + "UnstructuredEmailLoader", ] diff --git a/langchain/document_loaders/docx.py b/langchain/document_loaders/docx.py new file mode 100644 index 0000000000..0b595ece5a --- /dev/null +++ b/langchain/document_loaders/docx.py @@ -0,0 +1,29 @@ +"""Loader that loads Microsoft Word files.""" +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class UnstructuredDocxLoader(BaseLoader): + """Loader that uses unstructured to load Microsoft Word files.""" + + def __init__(self, file_path: str): + """Initialize with file path.""" + try: + import unstructured # noqa:F401 + except ImportError: + raise ValueError( + "unstructured package not found, please install it with " + "`pip install unstructured`" + ) + self.file_path = file_path + + def load(self) -> 
List[Document]: + """Load file.""" + from unstructured.partition.docx import partition_docx + + elements = partition_docx(filename=self.file_path) + text = "\n\n".join([str(el) for el in elements]) + metadata = {"source": self.file_path} + return [Document(page_content=text, metadata=metadata)] diff --git a/langchain/document_loaders/email.py b/langchain/document_loaders/email.py new file mode 100644 index 0000000000..ec22601fb3 --- /dev/null +++ b/langchain/document_loaders/email.py @@ -0,0 +1,29 @@ +"""Loader that loads email files.""" +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class UnstructuredEmailLoader(BaseLoader): + """Loader that uses unstructured to load email files.""" + + def __init__(self, file_path: str): + """Initialize with file path.""" + try: + import unstructured # noqa:F401 + except ImportError: + raise ValueError( + "unstructured package not found, please install it with " + "`pip install unstructured`" + ) + self.file_path = file_path + + def load(self) -> List[Document]: + """Load file.""" + from unstructured.partition.email import partition_email + + elements = partition_email(filename=self.file_path) + text = "\n\n".join([str(el) for el in elements]) + metadata = {"source": self.file_path} + return [Document(page_content=text, metadata=metadata)] diff --git a/langchain/document_loaders/obsidian.py b/langchain/document_loaders/obsidian.py new file mode 100644 index 0000000000..13403b2fe4 --- /dev/null +++ b/langchain/document_loaders/obsidian.py @@ -0,0 +1,25 @@ +"""Loader that loads Obsidian files from disk.""" +from pathlib import Path +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class ObsidianLoader(BaseLoader): + """Loader that loads Obsidian files from disk.""" + + def __init__(self, path: str): + """Initialize with path.""" + self.file_path = path + + 
def load(self) -> List[Document]: + """Load documents.""" + ps = list(Path(self.file_path).glob("**/*.md")) + docs = [] + for p in ps: + with open(p) as f: + text = f.read() + metadata = {"source": str(p)} + docs.append(Document(page_content=text, metadata=metadata)) + return docs diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py index 629e0e0bf1..687e010f09 100644 --- a/langchain/document_loaders/pdf.py +++ b/langchain/document_loaders/pdf.py @@ -1,4 +1,4 @@ -"""Loader that loads PowerPoint files.""" +"""Loader that loads PDF files.""" from typing import List from langchain.docstore.document import Document @@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader class UnstructuredPDFLoader(BaseLoader): - """Loader that uses unstructured to load PowerPoint files.""" + """Loader that uses unstructured to load PDF files.""" def __init__(self, file_path: str): """Initialize with file path.""" diff --git a/langchain/document_loaders/powerpoint.py b/langchain/document_loaders/powerpoint.py index d75f7c600b..40bb9825aa 100644 --- a/langchain/document_loaders/powerpoint.py +++ b/langchain/document_loaders/powerpoint.py @@ -1,4 +1,4 @@ -"""Loader that loads PDF files.""" +"""Loader that loads PowerPoint files.""" from typing import List from langchain.docstore.document import Document @@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader class UnstructuredPowerPointLoader(BaseLoader): - """Loader that uses unstructured to load PDF files.""" + """Loader that uses unstructured to load PowerPoint files.""" def __init__(self, file_path: str): """Initialize with file path."""