Add Mathpix pdf loader (#3727)

Inspo https://twitter.com/danielgross/status/1651695062307274754?s=46&t=1zHLap5WG4I_kQPPjfW9fA Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
1 year ago · 220a7076ac
parent 37ed6f2177
commit 220a7076ac
4 changed files with 167 additions and 9 deletions
--- a/docs/modules/indexes/document_loaders/examples/pdf.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/pdf.ipynb
@@ -155,6 +155,46 @@
    "    print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "6d5c9879",
+   "metadata": {},
+   "source": [
+    "## Using Mathpix\n",
+    "\n",
+    "Inspired by Daniel Gross's [gist](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "950eb58f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import MathpixPDFLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb6fd473",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = MathpixPDFLoader(\"example_data/layout-parser-paper.pdf\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1d41e1a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "09d64998",
@@ -568,7 +608,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "f0048206",
+   "id": "15b57eab",
   "metadata": {},
   "source": [
    "## PyPDF Directory\n",
@@ -579,7 +619,7 @@
  {
   "cell_type": "code",
   "execution_count": 1,
-   "id": "ecd0cb16",
+   "id": "b9e521d9",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -589,7 +629,7 @@
  {
   "cell_type": "code",
   "execution_count": 2,
-   "id": "96592167",
+   "id": "4b20590f",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -599,7 +639,7 @@
  {
   "cell_type": "code",
   "execution_count": 3,
-   "id": "c750454c",
+   "id": "e5ead943",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -609,7 +649,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ab7f8fdb",
+   "id": "ea25b03c",
   "metadata": {},
   "outputs": [],
   "source": []
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -51,6 +51,7 @@ from langchain.document_loaders.notion import NotionDirectoryLoader
 from langchain.document_loaders.notiondb import NotionDBLoader
 from langchain.document_loaders.obsidian import ObsidianLoader
 from langchain.document_loaders.pdf import (
+    MathpixPDFLoader,
    OnlinePDFLoader,
    PDFMinerLoader,
    PDFMinerPDFasHTMLLoader,
@@ -170,4 +171,5 @@ __all__ = [
    "WhatsAppChatLoader",
    "YoutubeLoader",
    "PyPDFDirectoryLoader",
+    "MathpixPDFLoader",
 ]
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@@ -1,7 +1,9 @@
 """Loader that loads PDF files."""
+import json
 import logging
 import os
 import tempfile
+import time
 from abc import ABC
 from io import StringIO
 from pathlib import Path
@@ -13,6 +15,7 @@ import requests
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
+from langchain.utils import get_from_dict_or_env

 logger = logging.getLogger(__file__)

@@ -33,12 +36,10 @@ class BasePDFLoader(BaseLoader, ABC):
    to a temporary file, and use that, then clean up the temporary file after completion
    """

-    file_path: str
-    web_path: Optional[str] = None
-
    def __init__(self, file_path: str):
        """Initialize with file path."""
        self.file_path = file_path
+        self.web_path = None
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

@@ -69,6 +70,10 @@ class BasePDFLoader(BaseLoader, ABC):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

+    @property
+    def source(self) -> str:
+        """Return the web path if one was set, otherwise the local file path."""
+        return self.web_path if self.web_path is not None else self.file_path
+

 class OnlinePDFLoader(BasePDFLoader):
    """Loader that loads online PDFs."""
@@ -249,8 +254,102 @@ class PyMuPDFLoader(BasePDFLoader):
                        k: doc.metadata[k]
                        for k in doc.metadata
                        if type(doc.metadata[k]) in [str, int]
-                    }
+                    },
                ),
            )
            for page in doc
        ]
+
+
+# MathpixPDFLoader implementation taken largely from Daniel Gross's:
+# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
+class MathpixPDFLoader(BasePDFLoader):
+    """Loader that sends a PDF to the Mathpix API and loads the converted text.
+
+    The file is uploaded to the Mathpix ``v3/pdf`` endpoint, the conversion
+    status is polled until it is reported as completed, and the converted
+    document is returned as a single ``Document``.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        processed_file_format: str = "mmd",
+        max_wait_time_seconds: int = 500,
+        should_clean_pdf: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize with a file path and Mathpix credentials.
+
+        Args:
+            file_path: Path of the PDF, forwarded to ``BasePDFLoader``.
+            processed_file_format: Conversion format requested from Mathpix;
+                also used as the extension of the downloaded result.
+            max_wait_time_seconds: Approximate polling budget in seconds; the
+                status is checked once per 5-second step.
+            should_clean_pdf: Whether ``load`` passes the converted text
+                through ``clean_pdf``.
+            **kwargs: Looked up for ``mathpix_api_key`` and ``mathpix_api_id``
+                by ``get_from_dict_or_env``, with ``MATHPIX_API_KEY`` and
+                ``MATHPIX_API_ID`` as the environment variable names.
+        """
+        super().__init__(file_path)
+        self.mathpix_api_key = get_from_dict_or_env(
+            kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
+        )
+        self.mathpix_api_id = get_from_dict_or_env(
+            kwargs, "mathpix_api_id", "MATHPIX_API_ID"
+        )
+        self.processed_file_format = processed_file_format
+        self.max_wait_time_seconds = max_wait_time_seconds
+        self.should_clean_pdf = should_clean_pdf
+
+    @property
+    def headers(self) -> dict:
+        """Return the credential headers sent with every Mathpix request."""
+        return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}
+
+    @property
+    def url(self) -> str:
+        """Return the base URL of the Mathpix PDF endpoint."""
+        return "https://api.mathpix.com/v3/pdf"
+
+    @property
+    def data(self) -> dict:
+        """Return the form data that requests the configured conversion format."""
+        options = {"conversion_formats": {self.processed_file_format: True}}
+        return {"options_json": json.dumps(options)}
+
+    def send_pdf(self) -> str:
+        """Upload the PDF to Mathpix and return the ``pdf_id`` it was assigned.
+
+        Raises:
+            ValueError: If the response JSON contains no ``pdf_id``.
+        """
+        with open(self.file_path, "rb") as f:
+            response = requests.post(
+                self.url, headers=self.headers, files={"file": f}, data=self.data
+            )
+        response_data = response.json()
+        if "pdf_id" not in response_data:
+            raise ValueError("Unable to send PDF to Mathpix.")
+        return response_data["pdf_id"]
+
+    def wait_for_processing(self, pdf_id: str) -> None:
+        """Poll Mathpix until the PDF identified by ``pdf_id`` is processed.
+
+        Raises:
+            ValueError: If Mathpix reports the status ``"error"``.
+            TimeoutError: If the status is still not ``"completed"`` once the
+                ``max_wait_time_seconds`` polling budget is used up.
+        """
+        poll_interval_seconds = 5
+        url = self.url + "/" + pdf_id
+        for _ in range(0, self.max_wait_time_seconds, poll_interval_seconds):
+            response = requests.get(url, headers=self.headers)
+            status = response.json().get("status", None)
+
+            if status == "completed":
+                return
+            if status == "error":
+                raise ValueError("Unable to retrieve PDF from Mathpix")
+            # Library code reports progress through the module logger, not stdout.
+            logger.info("Status: %s, waiting for processing to complete", status)
+            time.sleep(poll_interval_seconds)
+        raise TimeoutError(
+            f"Mathpix did not finish processing PDF {pdf_id} within "
+            f"{self.max_wait_time_seconds} seconds."
+        )
+
+    def get_processed_pdf(self, pdf_id: str) -> str:
+        """Wait for processing to finish, then download the converted text."""
+        self.wait_for_processing(pdf_id)
+        url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
+        response = requests.get(url, headers=self.headers)
+        return response.content.decode("utf-8")
+
+    def clean_pdf(self, contents: str) -> str:
+        """Strip image lines and some LaTeX markup from the converted text.
+
+        Lines starting with ``![]`` are dropped, ``\\section{`` is replaced by
+        ``# ``, every ``}`` is removed, and the backslash in front of ``$``,
+        ``%``, ``(`` and ``)`` is removed.
+        """
+        contents = "\n".join(
+            line for line in contents.split("\n") if not line.startswith("![]")
+        )
+        # replace \section{Title} with # Title
+        # NOTE(review): this removes every "}" in the document, not only the
+        # one closing \section{...} -- confirm that this is intended.
+        contents = contents.replace("\\section{", "# ").replace("}", "")
+        # Remove the backslash that Mathpix adds to escape $, %, (, etc. The
+        # backslashes are doubled because "\$" is an invalid escape sequence.
+        for escaped, plain in (("\\$", "$"), ("\\%", "%"), ("\\(", "("), ("\\)", ")")):
+            contents = contents.replace(escaped, plain)
+        return contents
+
+    def load(self) -> List[Document]:
+        """Convert the PDF with Mathpix and return it as a single document."""
+        pdf_id = self.send_pdf()
+        contents = self.get_processed_pdf(pdf_id)
+        if self.should_clean_pdf:
+            contents = self.clean_pdf(contents)
+        metadata = {"source": self.source, "file_path": self.source}
+        return [Document(page_content=contents, metadata=metadata)]
--- a/tests/integration_tests/document_loaders/test_pdf.py
+++ b/tests/integration_tests/document_loaders/test_pdf.py
@@ -6,6 +6,7 @@ from langchain.document_loaders import (
    PyMuPDFLoader,
    UnstructuredPDFLoader,
 )
+from langchain.document_loaders.pdf import MathpixPDFLoader


 def test_unstructured_pdf_loader() -> None:
@@ -69,3 +70,19 @@ def test_pymupdf_loader() -> None:
    assert loader.web_path == web_path
    assert loader.file_path != web_path
    assert len(docs) == 1
+
+
+def test_mathpix_loader() -> None:
+    """Load two example PDFs through Mathpix; each must yield one document."""
+    base_dir = Path(__file__).parent.parent
+    for relative_path in ("examples/hello.pdf", "examples/layout-parser-paper.pdf"):
+        pdf_path = base_dir / relative_path
+        docs = MathpixPDFLoader(str(pdf_path)).load()
+
+        assert len(docs) == 1
+        print(docs[0].page_content)