mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
feat: add support for .ppt
files in UnstructuredPowerPointLoader
(#1124)
### Summary

Adds support for older `.ppt` files in the PowerPoint loader.

### Testing

The following should work on `unstructured==0.4.11` using the example docs from the `unstructured` repo.

```python
from langchain.document_loaders import UnstructuredPowerPointLoader

filename = "../unstructured/example-docs/fake-power-point.pptx"
loader = UnstructuredPowerPointLoader(filename)
loader.load()

filename = "../unstructured/example-docs/fake-power-point.ppt"
loader = UnstructuredPowerPointLoader(filename)
loader.load()
```

Now downgrade `unstructured` to version `0.4.10`. The following should work:

```python
from langchain.document_loaders import UnstructuredPowerPointLoader

filename = "../unstructured/example-docs/fake-power-point.pptx"
loader = UnstructuredPowerPointLoader(filename)
loader.load()
```

and the following should raise a `ValueError` that invites you to upgrade `unstructured`.

```python
from langchain.document_loaders import UnstructuredPowerPointLoader

filename = "../unstructured/example-docs/fake-power-point.ppt"
loader = UnstructuredPowerPointLoader(filename)
loader.load()
```
This commit is contained in:
parent
b956070f08
commit
2bee8d4941
@ -1,13 +1,42 @@
|
||||
"""Loader that loads powerpoint files."""
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||
from unstructured.file_utils.filetype import FileType, detect_filetype
|
||||
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
# Installed unstructured version as a tuple of ints, used for minimum-version
# checks. Pre-release builds carry a suffix after a dash (e.g. "0.4.12-dev1")
# that int() cannot parse, so only the numeric release segment before the
# first "-" is kept.
unstructured_version = tuple(
    int(x) for x in __unstructured_version__.split("-")[0].split(".")
)
|
||||
|
||||
|
||||
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load powerpoint files.

    Both ``.pptx`` and legacy ``.ppt`` files are handled. Partitioning
    ``.ppt`` files requires ``unstructured>=0.4.11``.
    """

    def _get_elements(self) -> List:
        """Partition the file at ``self.file_path`` into unstructured elements.

        Returns:
            The result of ``partition_ppt`` when the file is detected as a
            ``.ppt`` file, otherwise the result of ``partition_pptx``.

        Raises:
            ValueError: If the file is a ``.ppt`` file and the installed
                ``unstructured`` version is older than 0.4.11.
        """
        # NOTE(MthwRobinson) - magic will raise an import error if the libmagic
        # system dependency isn't installed. If it's not installed, we'll just
        # check the file extension
        try:
            import magic  # noqa: F401

            is_ppt = detect_filetype(self.file_path) == FileType.PPT
        except ImportError:
            _, extension = os.path.splitext(self.file_path)
            # Compare case-insensitively so that e.g. "deck.PPT" is still
            # routed to the .ppt partitioner.
            is_ppt = extension.lower() == ".ppt"

        if is_ppt and unstructured_version < (0, 4, 11):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning .ppt files is only supported in unstructured>=0.4.11. "
                "Please upgrade the unstructured package and try again."
            )

        # The partitioners are imported lazily so that the .ppt module is only
        # touched once the version check above has passed.
        if is_ppt:
            from unstructured.partition.ppt import partition_ppt

            return partition_ppt(filename=self.file_path)
        else:
            from unstructured.partition.pptx import partition_pptx

            return partition_pptx(filename=self.file_path)
|
||||
|
Loading…
Reference in New Issue
Block a user