From 8bd3ce59cd4ef80db5eb52922bb31588596939e1 Mon Sep 17 00:00:00 2001 From: 123-fake-st <34491334+123-fake-st@users.noreply.github.com> Date: Wed, 1 Nov 2023 10:27:00 -0500 Subject: [PATCH] PyPDFLoader use url in metadata source if file is a web path (#12092) **Description:** Update `langchain.document_loaders.pdf.PyPDFLoader` to store the URL in the metadata (instead of a temporary file path) if the user provides a web path to a PDF - **Issue:** Related to #7034; the reporter on that issue submitted a PR (#7077) updating `PyMuPDFParser` for this behavior, but it has unresolved merge issues as of 20 Oct 2023 - In addition to `PyPDFLoader` and `PyMuPDFParser`, these other classes in `langchain.document_loaders.pdf` exhibit similar behavior and could benefit from an update: `PyPDFium2Loader`, `PDFMinerLoader`, `PDFMinerPDFasHTMLLoader`, `PDFPlumberLoader` (I'm happy to contribute to some/all of that, including assisting with `PyMuPDFParser`, if my work is agreeable) - The root cause is that the underlying PDF parser classes, e.g. `langchain.document_loaders.parsers.pdf.PyPDFParser`, never receive information about the URL; the parsers receive a `langchain.document_loaders.blob_loaders.blob`, which contains the PDF contents and local file path, but not the URL - This update passes the web path directly to the parser since it's minimally invasive and doesn't require further changes to maintain existing behavior for local files... 
bigger picture, I'd consider extending `blob` so that extra information like this can be communicated, but that has much bigger implications on the codebase which I think warrants maintainer input - **Dependencies:** None ```python # old behavior >>> from langchain.document_loaders import PyPDFLoader >>> loader = PyPDFLoader('https://arxiv.org/pdf/1706.03762.pdf') >>> docs = loader.load() >>> docs[0].metadata {'source': '/var/folders/w2/zx77z1cs01s1thx5dhshkd58h3jtrv/T/tmpfgrorsi5/tmp.pdf', 'page': 0} # new behavior >>> from langchain.document_loaders import PyPDFLoader >>> loader = PyPDFLoader('https://arxiv.org/pdf/1706.03762.pdf') >>> docs = loader.load() >>> docs[0].metadata {'source': 'https://arxiv.org/pdf/1706.03762.pdf', 'page': 0} ``` --- libs/langchain/langchain/document_loaders/pdf.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py index 910075efdf..ff25037064 100644 --- a/libs/langchain/langchain/document_loaders/pdf.py +++ b/libs/langchain/langchain/document_loaders/pdf.py @@ -154,8 +154,8 @@ class PyPDFLoader(BasePDFLoader): raise ImportError( "pypdf package not found, please install it with " "`pip install pypdf`" ) - self.parser = PyPDFParser(password=password, extract_images=extract_images) super().__init__(file_path, headers=headers) + self.parser = PyPDFParser(password=password, extract_images=extract_images) def load(self) -> List[Document]: """Load given path as pages.""" @@ -165,7 +165,10 @@ class PyPDFLoader(BasePDFLoader): self, ) -> Iterator[Document]: """Lazy load given path as pages.""" - blob = Blob.from_path(self.file_path) + if self.web_path: + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + else: + blob = Blob.from_path(self.file_path) yield from self.parser.parse(blob)