From 2bee8d4941e13701d69f05f76ddbad39c8230f62 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 17 Feb 2023 16:03:25 -0500 Subject: [PATCH] feat: add support for `.ppt` files in `UnstructuredPowerPointLoader` (#1124) ### Summary Adds support for older `.ppt` files in the PowerPoint loader. ### Testing The following should work on `unstructured==0.4.11` using the example docs from the `unstructured` repo. ```python from langchain.document_loaders import UnstructuredPowerPointLoader filename = "../unstructured/example-docs/fake-power-point.pptx" loader = UnstructuredPowerPointLoader(filename) loader.load() filename = "../unstructured/example-docs/fake-power-point.ppt" loader = UnstructuredPowerPointLoader(filename) loader.load() ``` Now downgrade `unstructured` to version `0.4.10`. The following should work: ```python from langchain.document_loaders import UnstructuredPowerPointLoader filename = "../unstructured/example-docs/fake-power-point.pptx" loader = UnstructuredPowerPointLoader(filename) loader.load() ``` The following, however, should raise a `ValueError` that invites you to upgrade `unstructured`: 
```python from langchain.document_loaders import UnstructuredPowerPointLoader filename = "../unstructured/example-docs/fake-power-point.ppt" loader = UnstructuredPowerPointLoader(filename) loader.load() ``` --- langchain/document_loaders/powerpoint.py | 33 ++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/langchain/document_loaders/powerpoint.py b/langchain/document_loaders/powerpoint.py index d8709b9b..d3443ec7 100644 --- a/langchain/document_loaders/powerpoint.py +++ b/langchain/document_loaders/powerpoint.py @@ -1,13 +1,42 @@ """Loader that loads powerpoint files.""" +import os from typing import List +from unstructured.__version__ import __version__ as __unstructured_version__ +from unstructured.file_utils.filetype import FileType, detect_filetype + from langchain.document_loaders.unstructured import UnstructuredFileLoader +unstructured_version = tuple([int(x) for x in __unstructured_version__.split(".")]) + class UnstructuredPowerPointLoader(UnstructuredFileLoader): """Loader that uses unstructured to load powerpoint files.""" def _get_elements(self) -> List: - from unstructured.partition.pptx import partition_pptx + # NOTE(MthwRobinson) - magic will raise an import error if the libmagic + # system dependency isn't installed. If it's not installed, we'll just + # check the file extension + try: + import magic # noqa: F401 + + is_ppt = detect_filetype(self.file_path) == FileType.PPT + except ImportError: + _, extension = os.path.splitext(self.file_path) + is_ppt = extension == ".ppt" + + if is_ppt and unstructured_version < (0, 4, 11): + raise ValueError( + f"You are on unstructured version {__unstructured_version__}. " + "Partitioning .ppt files is only supported in unstructured>=0.4.11. " + "Please upgrade the unstructured package and try again." 
+ ) + + if is_ppt: + from unstructured.partition.ppt import partition_ppt + + return partition_ppt(filename=self.file_path) + else: + from unstructured.partition.pptx import partition_pptx - return partition_pptx(filename=self.file_path) + return partition_pptx(filename=self.file_path)