diff --git a/docs/modules/document_loaders/examples/word_document.ipynb b/docs/modules/document_loaders/examples/word_document.ipynb new file mode 100644 index 0000000000..daf70e8400 --- /dev/null +++ b/docs/modules/document_loaders/examples/word_document.ipynb @@ -0,0 +1,137 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "39af9ecd", + "metadata": {}, + "source": [ + "# Word Documents\n", + "\n", + "This covers how to load Word documents into a document format that we can use downstream." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "721c48aa", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredWordDocumentLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9d3d0e35", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredWordDocumentLoader(\"fake.docx\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "06073f91", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c9adc5cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "markdown", + "id": "525d6b67", + "metadata": {}, + "source": [ + "## Retain Elements\n", + "\n", + "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "064f9162", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredWordDocumentLoader(\"fake.docx\", mode=\"elements\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "abefbbdb", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a547c534", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx', 'filename': 'fake.docx', 'category': 'Title'}, lookup_index=0)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index d54ed8e187..5de8ae02ca 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -36,6 +36,7 @@ from langchain.document_loaders.unstructured import ( ) from langchain.document_loaders.url import UnstructuredURLLoader from langchain.document_loaders.web_base import WebBaseLoader +from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader from langchain.document_loaders.youtube import YoutubeLoader __all__ = [ @@ -48,6 +49,7 @@ __all__ = [ "GoogleDriveLoader", "UnstructuredHTMLLoader", "UnstructuredPowerPointLoader", + "UnstructuredWordDocumentLoader", "UnstructuredPDFLoader", "ObsidianLoader", "UnstructuredDocxLoader", diff --git a/langchain/document_loaders/word_document.py b/langchain/document_loaders/word_document.py new file mode 100644 index 0000000000..139f4d30c2 --- /dev/null +++ b/langchain/document_loaders/word_document.py @@ -0,0 +1,43 @@ +"""Loader that loads word documents.""" +import os +from typing import List + +from langchain.document_loaders.unstructured import UnstructuredFileLoader + + +class UnstructuredWordDocumentLoader(UnstructuredFileLoader): + """Loader that uses unstructured to load word documents.""" + + def _get_elements(self) -> List: + from unstructured.__version__ import __version__ as __unstructured_version__ + from unstructured.file_utils.filetype import FileType, detect_filetype + + unstructured_version = tuple( + [int(x) for x in __unstructured_version__.split(".")] + ) + # NOTE(MthwRobinson) - magic will raise an import error if the libmagic + # system dependency isn't installed. If it's not installed, we'll just + # check the file extension + try: + import magic # noqa: F401 + + is_doc = detect_filetype(self.file_path) == FileType.DOC + except ImportError: + _, extension = os.path.splitext(self.file_path) + is_doc = extension == ".doc" + + if is_doc and unstructured_version < (0, 4, 11): + raise ValueError( + f"You are on unstructured version {__unstructured_version__}. " + "Partitioning .doc files is only supported in unstructured>=0.4.11. " + "Please upgrade the unstructured package and try again." + ) + + if is_doc: + from unstructured.partition.doc import partition_doc + + return partition_doc(filename=self.file_path) + else: + from unstructured.partition.docx import partition_docx + + return partition_docx(filename=self.file_path)