From 220a7076ac81662236393577373c102fbb733ea9 Mon Sep 17 00:00:00 2001 From: Davis Chase <130488702+dev2049@users.noreply.github.com> Date: Fri, 28 Apr 2023 20:11:22 -0700 Subject: [PATCH] Add Mathpix pdf loader (#3727) Inspo https://twitter.com/danielgross/status/1651695062307274754?s=46&t=1zHLap5WG4I_kQPPjfW9fA Co-authored-by: Harrison Chase --- .../document_loaders/examples/pdf.ipynb | 50 +++++++- langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/pdf.py | 107 +++++++++++++++++- .../document_loaders/test_pdf.py | 17 +++ 4 files changed, 167 insertions(+), 9 deletions(-) diff --git a/docs/modules/indexes/document_loaders/examples/pdf.ipynb b/docs/modules/indexes/document_loaders/examples/pdf.ipynb index 8d2e4270..e3d3887c 100644 --- a/docs/modules/indexes/document_loaders/examples/pdf.ipynb +++ b/docs/modules/indexes/document_loaders/examples/pdf.ipynb @@ -155,6 +155,46 @@ " print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)" ] }, + { + "cell_type": "markdown", + "id": "6d5c9879", + "metadata": {}, + "source": [ + "## Using MathPix\n", + "\n", + "Inspired by Daniel Gross's [https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "950eb58f", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import MathpixPDFLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb6fd473", + "metadata": {}, + "outputs": [], + "source": [ + "loader = MathpixPDFLoader(\"example_data/layout-parser-paper.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1d41e1a", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, { "cell_type": "markdown", "id": "09d64998", @@ -568,7 +608,7 @@ }, { "cell_type": "markdown", - "id": "f0048206", + "id": "15b57eab", "metadata": {}, "source": [ "## PyPDF Directory\n", @@ -579,7 +619,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "ecd0cb16", + "id": "b9e521d9", "metadata": {}, "outputs": [], "source": [ @@ -589,7 +629,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "96592167", + "id": "4b20590f", "metadata": {}, "outputs": [], "source": [ @@ -599,7 +639,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "c750454c", + "id": "e5ead943", "metadata": {}, "outputs": [], "source": [ @@ -609,7 +649,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ab7f8fdb", + "id": "ea25b03c", "metadata": {}, "outputs": [], "source": [] diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 3bf89a2c..510b3a47 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -51,6 +51,7 @@ from langchain.document_loaders.notion import NotionDirectoryLoader from langchain.document_loaders.notiondb import NotionDBLoader from langchain.document_loaders.obsidian import ObsidianLoader from langchain.document_loaders.pdf import ( + MathpixPDFLoader, OnlinePDFLoader, PDFMinerLoader, PDFMinerPDFasHTMLLoader, @@ -170,4 +171,5 @@ __all__ = [ "WhatsAppChatLoader", "YoutubeLoader", "PyPDFDirectoryLoader", + "MathpixPDFLoader", ] diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py index 9ff57d77..95a41f15 100644 --- a/langchain/document_loaders/pdf.py +++ b/langchain/document_loaders/pdf.py @@ -1,7 +1,9 @@ """Loader that loads PDF files.""" +import json import logging import os import tempfile +import time from abc import ABC from io import StringIO from pathlib import Path @@ -13,6 +15,7 @@ import requests from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader +from langchain.utils import get_from_dict_or_env logger = logging.getLogger(__file__) @@ -33,12 +36,10 @@ class BasePDFLoader(BaseLoader, ABC): to a temporary file, and use that, then clean up the temporary file after completion """ - file_path: str - web_path: Optional[str] = None - def __init__(self, file_path: str): """Initialize with file path.""" self.file_path = file_path + self.web_path = None if "~" in self.file_path: self.file_path = os.path.expanduser(self.file_path) @@ -69,6 +70,10 @@ class BasePDFLoader(BaseLoader, ABC): parsed = urlparse(url) return bool(parsed.netloc) and bool(parsed.scheme) + @property + def source(self) -> str: + return self.web_path if self.web_path is not None else self.file_path + class OnlinePDFLoader(BasePDFLoader): """Loader that loads online PDFs.""" @@ -249,8 +254,102 @@ class PyMuPDFLoader(BasePDFLoader): k: doc.metadata[k] for k in doc.metadata if type(doc.metadata[k]) in [str, int] - } + }, ), ) for page in doc ] + + +# MathpixPDFLoader implementation taken largely from Daniel Gross's: +# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21 +class MathpixPDFLoader(BasePDFLoader): + def __init__( + self, + file_path: str, + processed_file_format: str = "mmd", + max_wait_time_seconds: int = 500, + should_clean_pdf: bool = False, + **kwargs: Any, + ) -> None: + super().__init__(file_path) + self.mathpix_api_key = get_from_dict_or_env( + kwargs, "mathpix_api_key", "MATHPIX_API_KEY" + ) + self.mathpix_api_id = get_from_dict_or_env( + kwargs, "mathpix_api_id", "MATHPIX_API_ID" + ) + self.processed_file_format = processed_file_format + self.max_wait_time_seconds = max_wait_time_seconds + self.should_clean_pdf = should_clean_pdf + + @property + def headers(self) -> dict: + return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key} + + @property + def url(self) -> str: + return "https://api.mathpix.com/v3/pdf" + + @property + def data(self) -> dict: + options = {"conversion_formats": {self.processed_file_format: True}} + return {"options_json": json.dumps(options)} + + def send_pdf(self) -> str: + with open(self.file_path, "rb") as f: + files = {"file": f} + response = requests.post( + self.url, headers=self.headers, files=files, data=self.data + ) + response_data = response.json() + if "pdf_id" in response_data: + pdf_id = response_data["pdf_id"] + return pdf_id + else: + raise ValueError("Unable to send PDF to Mathpix.") + + def wait_for_processing(self, pdf_id: str) -> None: + url = self.url + "/" + pdf_id + for _ in range(0, self.max_wait_time_seconds, 5): + response = requests.get(url, headers=self.headers) + response_data = response.json() + status = response_data.get("status", None) + + if status == "completed": + return + elif status == "error": + raise ValueError("Unable to retrieve PDF from Mathpix") + else: + print(f"Status: {status}, waiting for processing to complete") + time.sleep(5) + raise TimeoutError + + def get_processed_pdf(self, pdf_id: str) -> str: + self.wait_for_processing(pdf_id) + url = f"{self.url}/{pdf_id}.{self.processed_file_format}" + response = requests.get(url, headers=self.headers) + return response.content.decode("utf-8") + + def clean_pdf(self, contents: str) -> str: + contents = "\n".join( + [line for line in contents.split("\n") if not line.startswith("![]")] + ) + # replace \section{Title} with # Title + contents = contents.replace("\\section{", "# ").replace("}", "") + # replace the "\" slash that Mathpix adds to escape $, %, (, etc. + contents = ( + contents.replace("\$", "$") + .replace("\%", "%") + .replace("\(", "(") + .replace("\)", ")") + ) + return contents + + def load(self) -> List[Document]: + pdf_id = self.send_pdf() + contents = self.get_processed_pdf(pdf_id) + if self.should_clean_pdf: + contents = self.clean_pdf(contents) + metadata = {"source": self.source, "file_path": self.source} + return [Document(page_content=contents, metadata=metadata)] diff --git a/tests/integration_tests/document_loaders/test_pdf.py b/tests/integration_tests/document_loaders/test_pdf.py index f7a768a2..8aa7fd7b 100644 --- a/tests/integration_tests/document_loaders/test_pdf.py +++ b/tests/integration_tests/document_loaders/test_pdf.py @@ -6,6 +6,7 @@ from langchain.document_loaders import ( PyMuPDFLoader, UnstructuredPDFLoader, ) +from langchain.document_loaders.pdf import MathpixPDFLoader def test_unstructured_pdf_loader() -> None: @@ -69,3 +70,19 @@ def test_pymupdf_loader() -> None: assert loader.web_path == web_path assert loader.file_path != web_path assert len(docs) == 1 + + +def test_mathpix_loader() -> None: + file_path = Path(__file__).parent.parent / "examples/hello.pdf" + loader = MathpixPDFLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 + print(docs[0].page_content) + + file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf" + loader = MathpixPDFLoader(str(file_path)) + + docs = loader.load() + assert len(docs) == 1 + print(docs[0].page_content)