From b25dbcb5b3494cc32476115a75e21f0560036ec4 Mon Sep 17 00:00:00 2001 From: Tim Asp <707699+timothyasp@users.noreply.github.com> Date: Tue, 28 Mar 2023 13:22:05 -0700 Subject: [PATCH] add missing `source` field to pymupdf output (#2110) To be consistent with other loaders for use with the `Sources` vector workflows. --- langchain/document_loaders/pdf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py index b7e9cd1c..0ff912b8 100644 --- a/langchain/document_loaders/pdf.py +++ b/langchain/document_loaders/pdf.py @@ -156,6 +156,7 @@ class PyMuPDFLoader(BasePDFLoader): page_content=page.get_text(**kwargs).encode("utf-8"), metadata=dict( { + "source": file_path, "file_path": file_path, "page_number": page.number + 1, "total_pages": len(doc),