"""Loader that loads word documents.""" import os from typing import List from langchain.document_loaders.unstructured import UnstructuredFileLoader class UnstructuredWordDocumentLoader(UnstructuredFileLoader): """Loader that uses unstructured to load word documents.""" def _get_elements(self) -> List: from unstructured.__version__ import __version__ as __unstructured_version__ from unstructured.file_utils.filetype import FileType, detect_filetype unstructured_version = tuple( [int(x) for x in __unstructured_version__.split(".")] ) # NOTE(MthwRobinson) - magic will raise an import error if the libmagic # system dependency isn't installed. If it's not installed, we'll just # check the file extension try: import magic # noqa: F401 is_doc = detect_filetype(self.file_path) == FileType.DOC except ImportError: _, extension = os.path.splitext(self.file_path) is_doc = extension == ".doc" if is_doc and unstructured_version < (0, 4, 11): raise ValueError( f"You are on unstructured version {__unstructured_version__}. " "Partitioning .doc files is only supported in unstructured>=0.4.11. " "Please upgrade the unstructured package and try again." ) if is_doc: from unstructured.partition.doc import partition_doc return partition_doc(filename=self.file_path) else: from unstructured.partition.docx import partition_docx return partition_docx(filename=self.file_path)