You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/langchain/document_loaders/word_document.py

44 lines
1.6 KiB
Python

"""Loader that loads word documents."""
import os
from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load word documents."""
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype
unstructured_version = tuple(
[int(x) for x in __unstructured_version__.split(".")]
)
# NOTE(MthwRobinson) - magic will raise an import error if the libmagic
# system dependency isn't installed. If it's not installed, we'll just
# check the file extension
try:
import magic # noqa: F401
is_doc = detect_filetype(self.file_path) == FileType.DOC
except ImportError:
_, extension = os.path.splitext(self.file_path)
is_doc = extension == ".doc"
if is_doc and unstructured_version < (0, 4, 11):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning .doc files is only supported in unstructured>=0.4.11. "
"Please upgrade the unstructured package and try again."
)
if is_doc:
from unstructured.partition.doc import partition_doc
return partition_doc(filename=self.file_path)
else:
from unstructured.partition.docx import partition_docx
return partition_docx(filename=self.file_path)