diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/example_data/README.org b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/README.org new file mode 100644 index 0000000000..5b9f472804 --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/README.org @@ -0,0 +1,27 @@ +* Example Docs + +The sample docs directory contains the following files: + +- ~example-10k.html~ - A 10-K SEC filing in HTML format +- ~layout-parser-paper.pdf~ - A PDF copy of the layout parser paper +- ~factbook.xml~ / ~factbook.xsl~ - Example XML/XSL files that you + can use to test stylesheets + +These documents can be used to test out the parsers in the library. In +addition, here are instructions for pulling in some sample docs that are +too big to store in the repo. + +** XBRL 10-K + +You can get an example 10-K in inline XBRL format using the following +~curl~. Note, you need to have the user agent set in the header or the +SEC site will reject your request. + +#+BEGIN_SRC bash + + curl -O \ + -A '${organization} ${email}' \ + https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt +#+END_SRC + +You can parse this document using the HTML parser. diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/org_mode.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/org_mode.ipynb new file mode 100644 index 0000000000..b749e6be1a --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/org_mode.ipynb @@ -0,0 +1,88 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Org-mode\n", + "\n", + ">An [Org Mode document](https://en.wikipedia.org/wiki/Org-mode) is a document editing, formatting, and organizing mode, designed for notes, planning, and authoring within the free software text editor Emacs." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `UnstructuredOrgModeLoader`\n", + "\n", + "You can load data from Org-mode files with `UnstructuredOrgModeLoader` using the following workflow." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredOrgModeLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredOrgModeLoader(\n", + " file_path=\"example_data/README.org\", mode=\"elements\"\n", + ")\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='Example Docs' metadata={'source': 'example_data/README.org', 'filename': 'README.org', 'file_directory': 'example_data', 'filetype': 'text/org', 'page_number': 1, 'category': 'Title'}\n" + ] + } + ], + "source": [ + "print(docs[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 2bf8f76e76..6bb6b7b0b8 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -78,6 +78,7 @@ from langchain.document_loaders.odt import UnstructuredODTLoader from langchain.document_loaders.onedrive import OneDriveLoader from 
class UnstructuredOrgModeLoader(UnstructuredFileLoader):
    """Document loader for Org-Mode files, backed by ``unstructured``.

    The file is partitioned with ``unstructured.partition.org.partition_org``;
    everything else is inherited from ``UnstructuredFileLoader``.
    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """Set up the loader for a single Org-Mode file.

        Args:
            file_path: Path of the Org-Mode file to load.
            mode: Loading mode handed through to ``UnstructuredFileLoader``
                (defaults to ``"single"``).
            **unstructured_kwargs: Extra keyword arguments handed through to
                the base loader and later forwarded to ``partition_org``.
        """
        # Run the version gate first, before the base class stores any state.
        validate_unstructured_version(min_unstructured_version="0.7.9")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Return the elements ``partition_org`` extracts from the file."""
        # Imported lazily so ``unstructured`` is only needed when actually used.
        from unstructured.partition.org import partition_org

        elements = partition_org(filename=self.file_path, **self.unstructured_kwargs)
        return elements
def test_unstructured_org_mode_loader() -> None:
    """Load the example README.org and expect exactly one document."""
    org_file = os.path.join(EXAMPLE_DIRECTORY, "README.org")
    documents = UnstructuredOrgModeLoader(org_file).load()

    assert len(documents) == 1