From 1655ff2dedd5920bbbeffdd7151247ef0a55db73 Mon Sep 17 00:00:00 2001 From: ashish-dahal <35001414+ashish-dahal@users.noreply.github.com> Date: Thu, 5 Oct 2023 23:25:19 +0300 Subject: [PATCH] Fix PyMuPDFLoader kwargs (#11434) - **Description:** Fix the `PyMuPDFLoader` to accept `loader_kwargs` from the document loader's `loader_kwargs` option. This provides more flexibility in formatting the output from documents. - **Issue:** The `loader_kwargs` is not passed into the `load` method from the document loader, which limits configuration options. - **Dependencies:** None --------- Co-authored-by: Bagatur --- libs/langchain/langchain/document_loaders/pdf.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index 67743effd9..b137e13821 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -298,7 +298,9 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader): class PyMuPDFLoader(BasePDFLoader): """Load `PDF` files using `PyMuPDF`.""" - def __init__(self, file_path: str, *, headers: Optional[Dict] = None) -> None: + def __init__( + self, file_path: str, *, headers: Optional[Dict] = None, **kwargs: Any + ) -> None: """Initialize with a file path.""" try: import fitz # noqa:F401 @@ -307,13 +309,19 @@ class PyMuPDFLoader(BasePDFLoader): "`PyMuPDF` package not found, please install it with " "`pip install pymupdf`" ) - super().__init__(file_path, headers=headers) + self.text_kwargs = kwargs - def load(self, **kwargs: Optional[Any]) -> List[Document]: + def load(self, **kwargs: Any) -> List[Document]: """Load file.""" + if kwargs: + logger.warning( + f"Received runtime arguments {kwargs}. Passing runtime args to `load`" + f" is deprecated. Please pass arguments during initialization instead." + ) - parser = PyMuPDFParser(text_kwargs=kwargs) + text_kwargs = {**self.text_kwargs, **kwargs} + parser = PyMuPDFParser(text_kwargs=text_kwargs) blob = Blob.from_path(self.file_path) return parser.parse(blob)