Add Mathpix pdf loader (#3727)

Inspo https://twitter.com/danielgross/status/1651695062307274754?s=46&t=1zHLap5WG4I_kQPPjfW9fA Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
1 year ago · 220a7076ac
parent 37ed6f2177
commit 220a7076ac
4 changed files with 167 additions and 9 deletions
--- a/docs/modules/indexes/document_loaders/examples/pdf.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/pdf.ipynb
@ -155,6 +155,46 @@
    "    print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d5c9879",
   "metadata": {},
   "source": [
    "## Using Mathpix\n",
    "\n",
    "Inspired by Daniel Gross's gist: [https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "950eb58f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import MathpixPDFLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb6fd473",
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = MathpixPDFLoader(\"example_data/layout-parser-paper.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1d41e1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = loader.load()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09d64998",
@ -568,7 +608,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "f0048206",
+   "id": "15b57eab",
   "metadata": {},
   "source": [
    "## PyPDF Directory\n",
@ -579,7 +619,7 @@
  {
   "cell_type": "code",
   "execution_count": 1,
-   "id": "ecd0cb16",
+   "id": "b9e521d9",
   "metadata": {},
   "outputs": [],
   "source": [
@ -589,7 +629,7 @@
  {
   "cell_type": "code",
   "execution_count": 2,
-   "id": "96592167",
+   "id": "4b20590f",
   "metadata": {},
   "outputs": [],
   "source": [
@ -599,7 +639,7 @@
  {
   "cell_type": "code",
   "execution_count": 3,
-   "id": "c750454c",
+   "id": "e5ead943",
   "metadata": {},
   "outputs": [],
   "source": [
@ -609,7 +649,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ab7f8fdb",
+   "id": "ea25b03c",
   "metadata": {},
   "outputs": [],
   "source": []
--- a/langchain/document_loaders/init.py
+++ b/langchain/document_loaders/init.py
@ -51,6 +51,7 @@ from langchain.document_loaders.notion import NotionDirectoryLoader
 from langchain.document_loaders.notiondb import NotionDBLoader
 from langchain.document_loaders.obsidian import ObsidianLoader
 from langchain.document_loaders.pdf import (
    MathpixPDFLoader,
    OnlinePDFLoader,
    PDFMinerLoader,
    PDFMinerPDFasHTMLLoader,
@ -170,4 +171,5 @@ __all__ = [
    "WhatsAppChatLoader",
    "YoutubeLoader",
    "PyPDFDirectoryLoader",
    "MathpixPDFLoader",
 ]
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@ -1,7 +1,9 @@
 """Loader that loads PDF files."""
 import json
 import logging
 import os
 import tempfile
 import time
 from abc import ABC
 from io import StringIO
 from pathlib import Path
@ -13,6 +15,7 @@ import requests
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 from langchain.utils import get_from_dict_or_env
 logger = logging.getLogger(__file__)
@ -33,12 +36,10 @@ class BasePDFLoader(BaseLoader, ABC):
    to a temporary file, and use that, then clean up the temporary file after completion
    """
    file_path: str
    web_path: Optional[str] = None
    def __init__(self, file_path: str):
        """Initialize with file path."""
        self.file_path = file_path
        self.web_path = None
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)
@ -69,6 +70,10 @@ class BasePDFLoader(BaseLoader, ABC):
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
    @property
    def source(self) -> str:
        """Return where the PDF came from.

        ``web_path`` is returned when one is recorded; otherwise the local
        ``file_path`` is returned.
        """
        remote_origin = self.web_path
        if remote_origin is None:
            return self.file_path
        return remote_origin
 class OnlinePDFLoader(BasePDFLoader):
    """Loader that loads online PDFs."""
@ -249,8 +254,102 @@ class PyMuPDFLoader(BasePDFLoader):
                        k: doc.metadata[k]
                        for k in doc.metadata
                        if type(doc.metadata[k]) in [str, int]
-                    }
+                    },
                ),
            )
            for page in doc
        ]
 # MathpixPDFLoader implementation taken largely from Daniel Gross's:
 # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
 class MathpixPDFLoader(BasePDFLoader):
    def __init__(
        self,
        file_path: str,
        processed_file_format: str = "mmd",
        max_wait_time_seconds: int = 500,
        should_clean_pdf: bool = False,
        **kwargs: Any,
    ) -> None:
        """Set up a loader that converts a PDF through the Mathpix API.

        Args:
            file_path: Location of the PDF, handed to the base loader.
            processed_file_format: Conversion format requested from Mathpix.
            max_wait_time_seconds: Bound, in seconds, used when polling for
                the conversion to finish.
            should_clean_pdf: Whether ``load`` post-processes the text with
                ``clean_pdf``.
            **kwargs: May carry ``mathpix_api_key`` and ``mathpix_api_id``.
                Both are resolved through ``get_from_dict_or_env``, presumably
                falling back to the ``MATHPIX_API_KEY`` / ``MATHPIX_API_ID``
                environment variables (confirm against that helper).
        """
        super().__init__(file_path)
        self.processed_file_format = processed_file_format
        self.max_wait_time_seconds = max_wait_time_seconds
        self.should_clean_pdf = should_clean_pdf
        # Resolve the key before the id, as the lookups may raise.
        api_key = get_from_dict_or_env(kwargs, "mathpix_api_key", "MATHPIX_API_KEY")
        api_id = get_from_dict_or_env(kwargs, "mathpix_api_id", "MATHPIX_API_ID")
        self.mathpix_api_key = api_key
        self.mathpix_api_id = api_id
    @property
    def headers(self) -> dict:
        return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}
    @property
    def url(self) -> str:
        return "https://api.mathpix.com/v3/pdf"
    @property
    def data(self) -> dict:
        options = {"conversion_formats": {self.processed_file_format: True}}
        return {"options_json": json.dumps(options)}
    def send_pdf(self) -> str:
        """Upload the PDF at ``self.file_path`` to Mathpix.

        Returns:
            The ``pdf_id`` found in the decoded Mathpix response.

        Raises:
            ValueError: If the response contains no ``pdf_id``. The decoded
                response is included in the message so that whatever the API
                reported is not lost.
        """
        # Keep the file open only for the duration of the upload.
        with open(self.file_path, "rb") as f:
            response = requests.post(
                self.url, headers=self.headers, files={"file": f}, data=self.data
            )
        response_data = response.json()
        if "pdf_id" not in response_data:
            raise ValueError(
                f"Unable to send PDF to Mathpix. Response: {response_data}"
            )
        return response_data["pdf_id"]
    def wait_for_processing(self, pdf_id: str) -> None:
        url = self.url + "/" + pdf_id
        for _ in range(0, self.max_wait_time_seconds, 5):
            response = requests.get(url, headers=self.headers)
            response_data = response.json()
            status = response_data.get("status", None)
            if status == "completed":
                return
            elif status == "error":
                raise ValueError("Unable to retrieve PDF from Mathpix")
            else:
                print(f"Status: {status}, waiting for processing to complete")
                time.sleep(5)
        raise TimeoutError
    def get_processed_pdf(self, pdf_id: str) -> str:
        """Fetch the converted document once processing has finished.

        Waits via ``wait_for_processing`` and then downloads
        ``<url>/<pdf_id>.<processed_file_format>``.

        Args:
            pdf_id: Identifier of the uploaded PDF.

        Returns:
            The response body decoded as UTF-8.
        """
        self.wait_for_processing(pdf_id)
        result_url = "{}/{}.{}".format(self.url, pdf_id, self.processed_file_format)
        raw_bytes = requests.get(result_url, headers=self.headers).content
        return raw_bytes.decode("utf-8")
    def clean_pdf(self, contents: str) -> str:
        """Strip Mathpix-specific markup from converted text.

        Three passes run in order: lines starting with an image reference
        (``![]``) are dropped, ``\\section{Title}`` becomes ``# Title``, and
        the backslash in front of ``$``, ``%``, ``(`` and ``)`` is removed.

        Args:
            contents: Text to clean.

        Returns:
            The cleaned text.
        """
        import re  # local import: only this method needs it

        kept_lines = [
            line for line in contents.split("\n") if not line.startswith("![]")
        ]
        contents = "\n".join(kept_lines)
        # Replace \section{Title} with "# Title". Only the braces delimiting
        # the title are consumed (one level of nested braces inside the title
        # is allowed); removing every "}" would corrupt LaTeX such as
        # \frac{a}{b} elsewhere in the document.
        contents = re.sub(
            r"\\section\{((?:[^{}\n]|\{[^{}\n]*\})*)\}",
            r"# \1",
            contents,
        )
        # Drop the backslash Mathpix adds to escape $, %, ( and ). The
        # backslashes are written explicitly; "\$" and friends are invalid
        # escape sequences.
        unescape_pairs = (
            ("\\$", "$"),
            ("\\%", "%"),
            ("\\(", "("),
            ("\\)", ")"),
        )
        for escaped, plain in unescape_pairs:
            contents = contents.replace(escaped, plain)
        return contents
    def load(self) -> List[Document]:
        """Convert the PDF with Mathpix and wrap the text in one ``Document``.

        The text is passed through ``clean_pdf`` when ``should_clean_pdf`` is
        set.

        Returns:
            A single-element list; the document's metadata records
            ``self.source`` under both ``source`` and ``file_path``.
        """
        text = self.get_processed_pdf(self.send_pdf())
        if self.should_clean_pdf:
            text = self.clean_pdf(text)
        origin = self.source
        document = Document(
            page_content=text, metadata={"source": origin, "file_path": origin}
        )
        return [document]
--- a/tests/integration_tests/document_loaders/test_pdf.py
+++ b/tests/integration_tests/document_loaders/test_pdf.py
@ -6,6 +6,7 @@ from langchain.document_loaders import (
    PyMuPDFLoader,
    UnstructuredPDFLoader,
 )
 from langchain.document_loaders.pdf import MathpixPDFLoader
 def test_unstructured_pdf_loader() -> None:
@ -69,3 +70,19 @@ def test_pymupdf_loader() -> None:
    assert loader.web_path == web_path
    assert loader.file_path != web_path
    assert len(docs) == 1
 def test_mathpix_loader() -> None:
    """Load two sample PDFs through Mathpix; each must yield one document."""
    tests_root = Path(__file__).parent.parent
    for relative_name in ("examples/hello.pdf", "examples/layout-parser-paper.pdf"):
        file_path = tests_root / relative_name
        docs = MathpixPDFLoader(str(file_path)).load()
        assert len(docs) == 1
        print(docs[0].page_content)