Add password to PyPDF loader and parser (#6908)

Add password to PyPDF loader and parser

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
pull/6948/head
lucasiscovici 1 year ago committed by GitHub
parent 429f4dbe4d
commit e9950392dd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,5 +1,5 @@
"""Module contains common parsers for PDFs."""
from typing import Any, Iterator, Mapping, Optional
from typing import Any, Iterator, Mapping, Optional, Union
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
@ -9,12 +9,15 @@ from langchain.schema import Document
class PyPDFParser(BaseBlobParser):
"""Loads a PDF with pypdf and chunks at character level."""
def __init__(self, password: Optional[Union[str, bytes]] = None):
# Keep the optional PDF password (str or bytes, None by default) on the
# instance; lazy_parse passes it to pypdf.PdfReader when opening the blob.
self.password = password
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import pypdf
with blob.as_bytes_io() as pdf_file_obj:
pdf_reader = pypdf.PdfReader(pdf_file_obj)
pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
yield from [
Document(
page_content=page.extract_text(),

@ -7,7 +7,7 @@ import time
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import Any, Iterator, List, Mapping, Optional
from typing import Any, Iterator, List, Mapping, Optional, Union
from urllib.parse import urlparse
import requests
@ -100,7 +100,9 @@ class PyPDFLoader(BasePDFLoader):
Loader also stores page numbers in metadatas.
"""
def __init__(self, file_path: str) -> None:
def __init__(
self, file_path: str, password: Optional[Union[str, bytes]] = None
) -> None:
"""Initialize with file path."""
try:
import pypdf # noqa:F401
@ -108,7 +110,7 @@ class PyPDFLoader(BasePDFLoader):
raise ImportError(
"pypdf package not found, please install it with " "`pip install pypdf`"
)
self.parser = PyPDFParser()
self.parser = PyPDFParser(password=password)
super().__init__(file_path)
def load(self) -> List[Document]:

Loading…
Cancel
Save