feat: add support for .ppt files in UnstructuredPowerPointLoader (#1124)

###  Summary

Adds support for older `.ppt` file in the PowerPoint loader. 

### Testing

The following should work on `unstructured==0.4.11` using the example
docs from the `unstructured` repo.

```python
from langchain.document_loaders import UnstructuredPowerPointLoader

filename = "../unstructured/example-docs/fake-power-point.pptx"
loader = UnstructuredPowerPointLoader(filename)
loader.load()

filename = "../unstructured/example-docs/fake-power-point.ppt"
loader = UnstructuredPowerPointLoader(filename)
loader.load()
```

Now downgrade `unstructured` to version `0.4.10`. The following should
work:

```python
from langchain.document_loaders import UnstructuredPowerPointLoader

filename = "../unstructured/example-docs/fake-power-point.pptx"
loader = UnstructuredPowerPointLoader(filename)
loader.load()
```

and the following should give you a `ValueError` and invite you to
upgrade `unstructured`.


```python
from langchain.document_loaders import UnstructuredPowerPointLoader

filename = "../unstructured/example-docs/fake-power-point.ppt"
loader = UnstructuredPowerPointLoader(filename)
loader.load()
```
This commit is contained in:
Matt Robinson 2023-02-17 16:03:25 -05:00 committed by GitHub
parent b956070f08
commit 2bee8d4941
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,13 +1,42 @@
"""Loader that loads powerpoint files."""
import os
from typing import List
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype
from langchain.document_loaders.unstructured import UnstructuredFileLoader
unstructured_version = tuple([int(x) for x in __unstructured_version__.split(".")])
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load powerpoint files."""
def _get_elements(self) -> List:
from unstructured.partition.pptx import partition_pptx
# NOTE(MthwRobinson) - magic will raise an import error if the libmagic
# system dependency isn't installed. If it's not installed, we'll just
# check the file extension
try:
import magic # noqa: F401
return partition_pptx(filename=self.file_path)
is_ppt = detect_filetype(self.file_path) == FileType.PPT
except ImportError:
_, extension = os.path.splitext(self.file_path)
is_ppt = extension == ".ppt"
if is_ppt and unstructured_version < (0, 4, 11):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning .ppt files is only supported in unstructured>=0.4.11. "
"Please upgrade the unstructured package and try again."
)
if is_ppt:
from unstructured.partition.ppt import partition_ppt
return partition_ppt(filename=self.file_path)
else:
from unstructured.partition.pptx import partition_pptx
return partition_pptx(filename=self.file_path)