From 8bd3ce59cd4ef80db5eb52922bb31588596939e1 Mon Sep 17 00:00:00 2001
From: 123-fake-st <34491334+123-fake-st@users.noreply.github.com>
Date: Wed, 1 Nov 2023 10:27:00 -0500
Subject: [PATCH] PyPDFLoader use url in metadata source if file is a web path
 (#12092)

**Description:** Update `langchain.document_loaders.pdf.PyPDFLoader` to
store url in metadata (instead of a temporary file path) if user
provides a web path to a pdf

- **Issue:** Related to #7034; the reporter on that issue submitted a PR
updating `PyMuPDFParser` for this behavior, but it has unresolved merge
issues as of 20 Oct 2023 #7077
- In addition to `PyPDFLoader` and `PyMuPDFParser`, these other classes
in `langchain.document_loaders.pdf` exhibit similar behavior and could
benefit from an update: `PyPDFium2Loader`, `PDFMinerLoader`,
`PDFMinerPDFasHTMLLoader`, `PDFPlumberLoader` (I'm happy to contribute
to some/all of that, including assisting with `PyMuPDFParser`, if my
work is agreeable)
- The root cause is that the underlying pdf parser classes, e.g.
`langchain.document_loaders.parsers.pdf.PyPDFParser`, never receive
information about the url; the parsers receive a
`langchain.document_loaders.blob_loaders.blob`, which contains the pdf
contents and local file path, but not the url
- This update passes the web path directly to the parser since it's
minimally invasive and doesn't require further changes to maintain
existing behavior for local files... bigger picture, I'd consider
extending `blob` so that extra information like this can be
communicated, but that has much bigger implications on the codebase
which I think warrants maintainer input

  - **Dependencies:** None

```python
# old behavior
>>> from langchain.document_loaders import PyPDFLoader
>>> loader = PyPDFLoader('https://arxiv.org/pdf/1706.03762.pdf')
>>> docs = loader.load()
>>> docs[0].metadata
{'source': '/var/folders/w2/zx77z1cs01s1thx5dhshkd58h3jtrv/T/tmpfgrorsi5/tmp.pdf', 'page': 0}

# new behavior
>>> from langchain.document_loaders import PyPDFLoader
>>> loader = PyPDFLoader('https://arxiv.org/pdf/1706.03762.pdf')
>>> docs = loader.load()
>>> docs[0].metadata
{'source': 'https://arxiv.org/pdf/1706.03762.pdf', 'page': 0}
```
---
 libs/langchain/langchain/document_loaders/pdf.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/libs/langchain/langchain/document_loaders/pdf.py b/libs/langchain/langchain/document_loaders/pdf.py
index 910075efdf..ff25037064 100644
--- a/libs/langchain/langchain/document_loaders/pdf.py
+++ b/libs/langchain/langchain/document_loaders/pdf.py
@@ -154,8 +154,8 @@ class PyPDFLoader(BasePDFLoader):
             raise ImportError(
                 "pypdf package not found, please install it with " "`pip install pypdf`"
             )
-        self.parser = PyPDFParser(password=password, extract_images=extract_images)
         super().__init__(file_path, headers=headers)
+        self.parser = PyPDFParser(password=password, extract_images=extract_images)
 
     def load(self) -> List[Document]:
         """Load given path as pages."""
@@ -165,7 +165,10 @@ class PyPDFLoader(BasePDFLoader):
         self,
     ) -> Iterator[Document]:
         """Lazy load given path as pages."""
-        blob = Blob.from_path(self.file_path)
+        if self.web_path:
+            blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
+        else:
+            blob = Blob.from_path(self.file_path)
         yield from self.parser.parse(blob)