Fix PyMuPDFLoader kwargs (#11434)

- **Description:** Fix `PyMuPDFLoader` so that it accepts the keyword
arguments supplied through the document loader's `loader_kwargs` option.
This provides more flexibility in formatting the output from documents.

- **Issue:** The document loader's `loader_kwargs` are not passed through
to the `load` method, which limits configuration options.

- **Dependencies:**  None

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/11304/head^2
ashish-dahal 10 months ago committed by GitHub
parent e4a46747dc
commit 1655ff2ded
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -298,7 +298,9 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
class PyMuPDFLoader(BasePDFLoader):
"""Load `PDF` files using `PyMuPDF`."""
def __init__(self, file_path: str, *, headers: Optional[Dict] = None) -> None:
def __init__(
self, file_path: str, *, headers: Optional[Dict] = None, **kwargs: Any
) -> None:
"""Initialize with a file path."""
try:
import fitz # noqa:F401
@ -307,13 +309,19 @@ class PyMuPDFLoader(BasePDFLoader):
"`PyMuPDF` package not found, please install it with "
"`pip install pymupdf`"
)
super().__init__(file_path, headers=headers)
self.text_kwargs = kwargs
def load(self, **kwargs: Optional[Any]) -> List[Document]:
def load(self, **kwargs: Any) -> List[Document]:
"""Load file."""
if kwargs:
logger.warning(
f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
f" is deprecated. Please pass arguments during initialization instead."
)
parser = PyMuPDFParser(text_kwargs=kwargs)
text_kwargs = {**self.text_kwargs, **kwargs}
parser = PyMuPDFParser(text_kwargs=text_kwargs)
blob = Blob.from_path(self.file_path)
return parser.parse(blob)

Loading…
Cancel
Save