Add Mathpix pdf loader (#3727)

Inspiration:
https://twitter.com/danielgross/status/1651695062307274754?s=46&t=1zHLap5WG4I_kQPPjfW9fA

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
Davis Chase 2023-04-28 20:11:22 -07:00 committed by GitHub
parent 37ed6f2177
commit 220a7076ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 167 additions and 9 deletions

View File

@ -155,6 +155,46 @@
" print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)" " print(str(doc.metadata[\"page\"]) + \":\", doc.page_content)"
] ]
}, },
{
"cell_type": "markdown",
"id": "6d5c9879",
"metadata": {},
"source": [
"## Using MathPix\n",
"\n",
"Inspired by Daniel Gross's [https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21](https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "950eb58f",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import MathpixPDFLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb6fd473",
"metadata": {},
"outputs": [],
"source": [
"loader = MathpixPDFLoader(\"example_data/layout-parser-paper.pdf\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1d41e1a",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "09d64998", "id": "09d64998",
@ -568,7 +608,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "f0048206", "id": "15b57eab",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## PyPDF Directory\n", "## PyPDF Directory\n",
@ -579,7 +619,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 1,
"id": "ecd0cb16", "id": "b9e521d9",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -589,7 +629,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 2,
"id": "96592167", "id": "4b20590f",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -599,7 +639,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 3,
"id": "c750454c", "id": "e5ead943",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -609,7 +649,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "ab7f8fdb", "id": "ea25b03c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []

View File

@ -51,6 +51,7 @@ from langchain.document_loaders.notion import NotionDirectoryLoader
from langchain.document_loaders.notiondb import NotionDBLoader from langchain.document_loaders.notiondb import NotionDBLoader
from langchain.document_loaders.obsidian import ObsidianLoader from langchain.document_loaders.obsidian import ObsidianLoader
from langchain.document_loaders.pdf import ( from langchain.document_loaders.pdf import (
MathpixPDFLoader,
OnlinePDFLoader, OnlinePDFLoader,
PDFMinerLoader, PDFMinerLoader,
PDFMinerPDFasHTMLLoader, PDFMinerPDFasHTMLLoader,
@ -170,4 +171,5 @@ __all__ = [
"WhatsAppChatLoader", "WhatsAppChatLoader",
"YoutubeLoader", "YoutubeLoader",
"PyPDFDirectoryLoader", "PyPDFDirectoryLoader",
"MathpixPDFLoader",
] ]

View File

@ -1,7 +1,9 @@
"""Loader that loads PDF files.""" """Loader that loads PDF files."""
import json
import logging import logging
import os import os
import tempfile import tempfile
import time
from abc import ABC from abc import ABC
from io import StringIO from io import StringIO
from pathlib import Path from pathlib import Path
@ -13,6 +15,7 @@ import requests
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.utils import get_from_dict_or_env
logger = logging.getLogger(__file__) logger = logging.getLogger(__file__)
@ -33,12 +36,10 @@ class BasePDFLoader(BaseLoader, ABC):
to a temporary file, and use that, then clean up the temporary file after completion to a temporary file, and use that, then clean up the temporary file after completion
""" """
file_path: str
web_path: Optional[str] = None
def __init__(self, file_path: str): def __init__(self, file_path: str):
"""Initialize with file path.""" """Initialize with file path."""
self.file_path = file_path self.file_path = file_path
self.web_path = None
if "~" in self.file_path: if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path) self.file_path = os.path.expanduser(self.file_path)
@ -69,6 +70,10 @@ class BasePDFLoader(BaseLoader, ABC):
parsed = urlparse(url) parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme) return bool(parsed.netloc) and bool(parsed.scheme)
@property
def source(self) -> str:
    """Return the origin of the PDF for use in document metadata.

    The web path is reported when one is set; otherwise the file path is
    reported.
    """
    if self.web_path is None:
        return self.file_path
    return self.web_path
class OnlinePDFLoader(BasePDFLoader): class OnlinePDFLoader(BasePDFLoader):
"""Loader that loads online PDFs.""" """Loader that loads online PDFs."""
@ -249,8 +254,102 @@ class PyMuPDFLoader(BasePDFLoader):
k: doc.metadata[k] k: doc.metadata[k]
for k in doc.metadata for k in doc.metadata
if type(doc.metadata[k]) in [str, int] if type(doc.metadata[k]) in [str, int]
} },
), ),
) )
for page in doc for page in doc
] ]
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
class MathpixPDFLoader(BasePDFLoader):
    """Loader that converts a PDF to text with the Mathpix PDF API.

    The PDF is uploaded to Mathpix, the conversion status is polled until it
    is reported as completed, and the converted output is downloaded and
    returned as a single ``Document``.
    """

    def __init__(
        self,
        file_path: str,
        processed_file_format: str = "mmd",
        max_wait_time_seconds: int = 500,
        should_clean_pdf: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path and Mathpix credentials.

        Args:
            file_path: Path of the PDF to load; forwarded to ``BasePDFLoader``.
            processed_file_format: Conversion format requested from Mathpix,
                also used as the extension of the downloaded result.
            max_wait_time_seconds: Upper bound on the time spent polling for
                the conversion to finish.
            should_clean_pdf: Whether ``load`` post-processes the converted
                text with ``clean_pdf``.
            **kwargs: May carry ``mathpix_api_key`` and ``mathpix_api_id``.
                Both are resolved through ``get_from_dict_or_env`` with the
                ``MATHPIX_API_KEY`` / ``MATHPIX_API_ID`` environment
                variable names.
        """
        super().__init__(file_path)
        self.mathpix_api_key = get_from_dict_or_env(
            kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
        )
        self.mathpix_api_id = get_from_dict_or_env(
            kwargs, "mathpix_api_id", "MATHPIX_API_ID"
        )
        self.processed_file_format = processed_file_format
        self.max_wait_time_seconds = max_wait_time_seconds
        self.should_clean_pdf = should_clean_pdf

    @property
    def headers(self) -> dict:
        """Authentication headers sent with every Mathpix request."""
        return {"app_id": self.mathpix_api_id, "app_key": self.mathpix_api_key}

    @property
    def url(self) -> str:
        """Base URL of the Mathpix PDF endpoint."""
        return "https://api.mathpix.com/v3/pdf"

    @property
    def data(self) -> dict:
        """Form data for the upload, requesting the configured output format."""
        options = {"conversion_formats": {self.processed_file_format: True}}
        return {"options_json": json.dumps(options)}

    def send_pdf(self) -> str:
        """Upload the PDF to Mathpix and return the id assigned to it.

        Raises:
            ValueError: If the response does not contain a ``pdf_id``. The
                response payload is included so the cause (for example bad
                credentials) is visible to the caller.
        """
        with open(self.file_path, "rb") as f:
            response = requests.post(
                self.url, headers=self.headers, files={"file": f}, data=self.data
            )
        response_data = response.json()
        if "pdf_id" not in response_data:
            raise ValueError(
                f"Unable to send PDF to Mathpix. Response: {response_data}"
            )
        return response_data["pdf_id"]

    def wait_for_processing(self, pdf_id: str) -> None:
        """Poll Mathpix until the PDF identified by ``pdf_id`` is converted.

        The status endpoint is queried every 5 seconds for at most
        ``max_wait_time_seconds`` seconds.

        Raises:
            ValueError: If Mathpix reports the status ``"error"``.
            TimeoutError: If the conversion does not complete in time.
        """
        poll_interval_seconds = 5
        url = f"{self.url}/{pdf_id}"
        for _ in range(0, self.max_wait_time_seconds, poll_interval_seconds):
            response_data = requests.get(url, headers=self.headers).json()
            status = response_data.get("status")
            if status == "completed":
                return
            if status == "error":
                raise ValueError("Unable to retrieve PDF from Mathpix")
            # Library code reports progress through the module logger instead
            # of writing to stdout.
            logger.info("Status: %s, waiting for processing to complete", status)
            time.sleep(poll_interval_seconds)
        raise TimeoutError(
            f"Mathpix did not finish processing PDF {pdf_id} within "
            f"{self.max_wait_time_seconds} seconds."
        )

    def get_processed_pdf(self, pdf_id: str) -> str:
        """Wait for the conversion and return the converted text.

        Raises:
            ValueError: Propagated from ``wait_for_processing``.
            TimeoutError: Propagated from ``wait_for_processing``.
            requests.HTTPError: If downloading the converted file fails, so
                that an error payload is never returned as document text.
        """
        self.wait_for_processing(pdf_id)
        url = f"{self.url}/{pdf_id}.{self.processed_file_format}"
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        return response.content.decode("utf-8")

    def clean_pdf(self, contents: str) -> str:
        """Strip image lines and Mathpix markup from the converted text.

        Lines starting with ``![]`` are dropped, ``\\section{Title}`` becomes
        ``# Title``, and the backslash escapes in front of ``$``, ``%``,
        ``(`` and ``)`` are removed.
        """
        # Imported locally so this block does not rely on a module-level import.
        import re

        contents = "\n".join(
            line for line in contents.split("\n") if not line.startswith("![]")
        )
        # Replace \section{Title} with "# Title". Only the brace closing the
        # section title is removed; every other "}" (e.g. in LaTeX math such
        # as \frac{a}{b}) is left intact.
        contents = re.sub(r"\\section\{([^{}]*)\}", r"# \1", contents)
        # Remove the backslash that Mathpix adds to escape $, %, ( and ).
        # Raw strings avoid invalid-escape-sequence warnings.
        contents = (
            contents.replace(r"\$", "$")
            .replace(r"\%", "%")
            .replace(r"\(", "(")
            .replace(r"\)", ")")
        )
        return contents

    def load(self) -> List[Document]:
        """Convert the PDF with Mathpix and return it as a single document.

        Returns:
            A one-element list whose ``Document`` holds the converted text
            (cleaned when ``should_clean_pdf`` is set) and metadata with the
            ``source`` and ``file_path`` keys, both set to ``self.source``.
        """
        pdf_id = self.send_pdf()
        contents = self.get_processed_pdf(pdf_id)
        if self.should_clean_pdf:
            contents = self.clean_pdf(contents)
        metadata = {"source": self.source, "file_path": self.source}
        return [Document(page_content=contents, metadata=metadata)]

View File

@ -6,6 +6,7 @@ from langchain.document_loaders import (
PyMuPDFLoader, PyMuPDFLoader,
UnstructuredPDFLoader, UnstructuredPDFLoader,
) )
from langchain.document_loaders.pdf import MathpixPDFLoader
def test_unstructured_pdf_loader() -> None: def test_unstructured_pdf_loader() -> None:
@ -69,3 +70,19 @@ def test_pymupdf_loader() -> None:
assert loader.web_path == web_path assert loader.web_path == web_path
assert loader.file_path != web_path assert loader.file_path != web_path
assert len(docs) == 1 assert len(docs) == 1
def test_mathpix_loader() -> None:
    """Load each example PDF through Mathpix and expect a single document."""
    examples_dir = Path(__file__).parent.parent
    for relative_path in ("examples/hello.pdf", "examples/layout-parser-paper.pdf"):
        file_path = examples_dir / relative_path
        loader = MathpixPDFLoader(str(file_path))

        docs = loader.load()
        assert len(docs) == 1
        print(docs[0].page_content)