diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py
index 910075efdf..ff25037064 100644
--- a/libs/langchain/langchain/document_loaders/pdf.py
+++ b/libs/langchain/langchain/document_loaders/pdf.py
@@ -154,8 +154,8 @@ class PyPDFLoader(BasePDFLoader):
             raise ImportError(
                 "pypdf package not found, please install it with " "`pip install pypdf`"
             )
-        self.parser = PyPDFParser(password=password, extract_images=extract_images)
         super().__init__(file_path, headers=headers)
+        self.parser = PyPDFParser(password=password, extract_images=extract_images)
 
     def load(self) -> List[Document]:
         """Load given path as pages."""
@@ -165,5 +165,11 @@ class PyPDFLoader(BasePDFLoader):
         self,
     ) -> Iterator[Document]:
         """Lazy load given path as pages."""
-        blob = Blob.from_path(self.file_path)
+        if self.web_path:
+            # Read the bytes from file_path but record web_path as the blob's
+            # path; the context manager closes the file handle promptly.
+            with open(self.file_path, "rb") as f:
+                blob = Blob.from_data(f.read(), path=self.web_path)
+        else:
+            blob = Blob.from_path(self.file_path)
         yield from self.parser.parse(blob)