forked from Archives/langchain
Add PDF parser implementations (#4356)
# Add PDF parser implementations This PR separates the data loading from the parsing for a number of existing PDF loaders. Parser tests have been designed to encourage developers to create a consistent interface for parsing PDFs. This interface can be made more consistent in the future by adding information to the initializer about the desired behavior with respect to splitting by page, etc. This code is expected to be backwards compatible, with the exception of a bug fix in the PyMuPDF loader, which was returning `bytes` in the page content rather than strings. This PR also changes the `lazy_load` method of the document loader to return an `Iterator` rather than an `Iterable` over documents. ## Before submitting <!-- If you're adding a new integration, include an integration test and an example notebook showing its use! --> ## Who can review? Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested: @ <!-- For a quicker response, figure out the right person to tag with @ @hwchase17 - project lead Tracing / Callbacks - @agola11 Async - @agola11 DataLoader Abstractions - @eyurtsev LLM/Chat Wrappers - @hwchase17 - @agola11 Tools / Toolkits - @vowelparrot -->
This commit is contained in:
parent
ae0c3382dd
commit
2ceb807da2
@ -1,6 +1,6 @@
|
|||||||
"""Abstract interface for document loader implementations."""
|
"""Abstract interface for document loader implementations."""
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Iterable, Iterator, List, Optional
|
from typing import Iterator, List, Optional
|
||||||
|
|
||||||
from langchain.document_loaders.blob_loaders import Blob
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
@ -39,7 +39,7 @@ class BaseLoader(ABC):
|
|||||||
# implemented in all the existing subclasses.
|
# implemented in all the existing subclasses.
|
||||||
def lazy_load(
|
def lazy_load(
|
||||||
self,
|
self,
|
||||||
) -> Iterable[Document]:
|
) -> Iterator[Document]:
|
||||||
"""A lazy loader for document content."""
|
"""A lazy loader for document content."""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
f"{self.__class__.__name__} does not implement lazy_load()"
|
f"{self.__class__.__name__} does not implement lazy_load()"
|
||||||
|
@ -0,0 +1,8 @@
|
|||||||
|
from langchain.document_loaders.parsers.pdf import (
|
||||||
|
PDFMinerParser,
|
||||||
|
PyMuPDFParser,
|
||||||
|
PyPDFium2Parser,
|
||||||
|
PyPDFParser,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = ["PyPDFParser", "PDFMinerParser", "PyMuPDFParser", "PyPDFium2Parser"]
|
101
langchain/document_loaders/parsers/pdf.py
Normal file
101
langchain/document_loaders/parsers/pdf.py
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
"""Module contains common parsers for PDFs."""
|
||||||
|
from typing import Any, Iterator, Mapping, Optional
|
||||||
|
|
||||||
|
from langchain.document_loaders.base import BaseBlobParser
|
||||||
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
|
from langchain.schema import Document
|
||||||
|
|
||||||
|
|
||||||
|
class PyPDFParser(BaseBlobParser):
    """Parse PDFs with pypdf, producing one document per page."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one ``Document`` per PDF page.

        Args:
            blob: Blob holding the PDF; it is read via ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` whose ``page_content`` is the text extracted from
            the page and whose metadata holds ``source`` (``blob.source``) and
            ``page`` (the zero-based index of the page in the reader).
        """
        import pypdf

        with blob.as_bytes_io() as pdf_file_obj:
            pdf_reader = pypdf.PdfReader(pdf_file_obj)
            # Yield page by page rather than building the complete list first,
            # so text is only extracted as the caller consumes the iterator.
            for page_number, page in enumerate(pdf_reader.pages):
                yield Document(
                    page_content=page.extract_text(),
                    metadata={"source": blob.source, "page": page_number},
                )
|
||||||
|
|
||||||
|
|
||||||
|
class PDFMinerParser(BaseBlobParser):
    """Parser that extracts text from PDFs using PDFMiner."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Yield a single document holding the text extracted from the blob.

        The document's metadata contains only ``source`` (``blob.source``).
        """
        from pdfminer.high_level import extract_text

        with blob.as_bytes_io() as stream:
            extracted = extract_text(stream)
            yield Document(
                page_content=extracted,
                metadata={"source": blob.source},
            )
|
||||||
|
|
||||||
|
|
||||||
|
class PyMuPDFParser(BaseBlobParser):
    """Parse PDFs with PyMuPDF, producing one document per page."""

    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
        """Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
        """
        self.text_kwargs = text_kwargs or {}

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one ``Document`` per page.

        Args:
            blob: Blob holding the PDF; it is read via ``blob.as_bytes_io()``.

        Yields:
            A ``Document`` whose ``page_content`` is ``page.get_text()`` called
            with ``text_kwargs``. Its metadata holds ``source`` and
            ``file_path`` (both ``blob.source``), ``page`` (``page.number``),
            ``total_pages``, and every document-level metadata entry whose
            value is a ``str`` or ``int``.
        """
        import fitz

        with blob.as_bytes_io() as file_path:
            doc = fitz.open(file_path)  # open document
            try:
                # These values are the same for every page, so compute them
                # once instead of once per page.
                total_pages = len(doc)
                doc_metadata = {
                    key: value
                    for key, value in doc.metadata.items()
                    if isinstance(value, (str, int))
                }
                for page in doc:
                    yield Document(
                        page_content=page.get_text(**self.text_kwargs),
                        # A fresh dict per page so documents never share
                        # mutable metadata; document-level entries are merged
                        # last, matching the original precedence.
                        metadata={
                            "source": blob.source,
                            "file_path": blob.source,
                            "page": page.number,
                            "total_pages": total_pages,
                            **doc_metadata,
                        },
                    )
            finally:
                # Release the underlying document even if the consumer stops
                # iterating early or text extraction raises.
                doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
class PyPDFium2Parser(BaseBlobParser):
    """Parser that extracts text from PDFs using PyPDFium2."""

    def __init__(self) -> None:
        """Verify at construction time that ``pypdfium2`` can be imported.

        Raises:
            ValueError: If the ``pypdfium2`` package is not installed.
        """
        try:
            import pypdfium2  # noqa:F401
        except ImportError:
            raise ValueError(
                "pypdfium2 package not found, please install it with"
                " `pip install pypdfium2`"
            )

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Yield one document per page of the PDF held by the blob.

        Each document's metadata holds ``source`` (``blob.source``) and
        ``page`` (the zero-based position of the page in the document).
        """
        import pypdfium2

        with blob.as_bytes_io() as stream:
            pdf_document = pypdfium2.PdfDocument(stream)
            for index, pdf_page in enumerate(pdf_document):
                yield Document(
                    page_content=pdf_page.get_textpage().get_text_range(),
                    metadata={"source": blob.source, "page": index},
                )
|
@ -7,13 +7,20 @@ import time
|
|||||||
from abc import ABC
|
from abc import ABC
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, List, Optional
|
from typing import Any, Iterator, List, Optional
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
|
from langchain.document_loaders.parsers.pdf import (
|
||||||
|
PDFMinerParser,
|
||||||
|
PyMuPDFParser,
|
||||||
|
PyPDFium2Parser,
|
||||||
|
PyPDFParser,
|
||||||
|
)
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
from langchain.utils import get_from_dict_or_env
|
from langchain.utils import get_from_dict_or_env
|
||||||
|
|
||||||
@ -90,7 +97,7 @@ class PyPDFLoader(BasePDFLoader):
|
|||||||
Loader also stores page numbers in metadatas.
|
Loader also stores page numbers in metadatas.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str) -> None:
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
try:
|
try:
|
||||||
import pypdf # noqa:F401
|
import pypdf # noqa:F401
|
||||||
@ -98,21 +105,19 @@ class PyPDFLoader(BasePDFLoader):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"pypdf package not found, please install it with " "`pip install pypdf`"
|
"pypdf package not found, please install it with " "`pip install pypdf`"
|
||||||
)
|
)
|
||||||
|
self.parser = PyPDFParser()
|
||||||
super().__init__(file_path)
|
super().__init__(file_path)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load given path as pages."""
|
"""Load given path as pages."""
|
||||||
import pypdf
|
return list(self.lazy_load())
|
||||||
|
|
||||||
with open(self.file_path, "rb") as pdf_file_obj:
|
def lazy_load(
|
||||||
pdf_reader = pypdf.PdfReader(pdf_file_obj)
|
self,
|
||||||
return [
|
) -> Iterator[Document]:
|
||||||
Document(
|
"""Lazy load given path as pages."""
|
||||||
page_content=page.extract_text(),
|
blob = Blob.from_path(self.file_path)
|
||||||
metadata={"source": self.file_path, "page": i},
|
yield from self.parser.parse(blob)
|
||||||
)
|
|
||||||
for i, page in enumerate(pdf_reader.pages)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class PyPDFium2Loader(BasePDFLoader):
|
class PyPDFium2Loader(BasePDFLoader):
|
||||||
@ -120,27 +125,19 @@ class PyPDFium2Loader(BasePDFLoader):
|
|||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
try:
|
|
||||||
import pypdfium2 # noqa:F401
|
|
||||||
except ImportError:
|
|
||||||
raise ValueError(
|
|
||||||
"pypdfium2 package not found, please install it with"
|
|
||||||
" `pip install pypdfium2`"
|
|
||||||
)
|
|
||||||
super().__init__(file_path)
|
super().__init__(file_path)
|
||||||
|
self.parser = PyPDFium2Parser()
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load given path as pages."""
|
"""Load given path as pages."""
|
||||||
import pypdfium2
|
return list(self.lazy_load())
|
||||||
|
|
||||||
with open(self.file_path, "rb") as f:
|
def lazy_load(
|
||||||
pdf_reader = pypdfium2.PdfDocument(f)
|
self,
|
||||||
docs = []
|
) -> Iterator[Document]:
|
||||||
for i, page in enumerate(pdf_reader):
|
"""Lazy load given path as pages."""
|
||||||
content = page.get_textpage().get_text_range()
|
blob = Blob.from_path(self.file_path)
|
||||||
metadata = {"source": self.file_path, "page": i}
|
yield from self.parser.parse(blob)
|
||||||
docs.append(Document(page_content=content, metadata=metadata))
|
|
||||||
return docs
|
|
||||||
|
|
||||||
|
|
||||||
class PyPDFDirectoryLoader(BaseLoader):
|
class PyPDFDirectoryLoader(BaseLoader):
|
||||||
@ -191,7 +188,7 @@ class PyPDFDirectoryLoader(BaseLoader):
|
|||||||
class PDFMinerLoader(BasePDFLoader):
|
class PDFMinerLoader(BasePDFLoader):
|
||||||
"""Loader that uses PDFMiner to load PDF files."""
|
"""Loader that uses PDFMiner to load PDF files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str) -> None:
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
try:
|
try:
|
||||||
from pdfminer.high_level import extract_text # noqa:F401
|
from pdfminer.high_level import extract_text # noqa:F401
|
||||||
@ -202,14 +199,18 @@ class PDFMinerLoader(BasePDFLoader):
|
|||||||
)
|
)
|
||||||
|
|
||||||
super().__init__(file_path)
|
super().__init__(file_path)
|
||||||
|
self.parser = PDFMinerParser()
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load file."""
|
"""Eagerly load the content."""
|
||||||
from pdfminer.high_level import extract_text
|
return list(self.lazy_load())
|
||||||
|
|
||||||
text = extract_text(self.file_path)
|
def lazy_load(
|
||||||
metadata = {"source": self.file_path}
|
self,
|
||||||
return [Document(page_content=text, metadata=metadata)]
|
) -> Iterator[Document]:
|
||||||
|
"""Lazily load documents."""
|
||||||
|
blob = Blob.from_path(self.file_path)
|
||||||
|
yield from self.parser.parse(blob)
|
||||||
|
|
||||||
|
|
||||||
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
||||||
@ -249,7 +250,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
|||||||
class PyMuPDFLoader(BasePDFLoader):
|
class PyMuPDFLoader(BasePDFLoader):
|
||||||
"""Loader that uses PyMuPDF to load PDF files."""
|
"""Loader that uses PyMuPDF to load PDF files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str) -> None:
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
try:
|
try:
|
||||||
import fitz # noqa:F401
|
import fitz # noqa:F401
|
||||||
@ -263,30 +264,10 @@ class PyMuPDFLoader(BasePDFLoader):
|
|||||||
|
|
||||||
def load(self, **kwargs: Optional[Any]) -> List[Document]:
|
def load(self, **kwargs: Optional[Any]) -> List[Document]:
|
||||||
"""Load file."""
|
"""Load file."""
|
||||||
import fitz
|
|
||||||
|
|
||||||
doc = fitz.open(self.file_path) # open document
|
parser = PyMuPDFParser(text_kwargs=kwargs)
|
||||||
file_path = self.file_path if self.web_path is None else self.web_path
|
blob = Blob.from_path(self.file_path)
|
||||||
|
return parser.parse(blob)
|
||||||
return [
|
|
||||||
Document(
|
|
||||||
page_content=page.get_text(**kwargs).encode("utf-8"),
|
|
||||||
metadata=dict(
|
|
||||||
{
|
|
||||||
"source": file_path,
|
|
||||||
"file_path": file_path,
|
|
||||||
"page_number": page.number + 1,
|
|
||||||
"total_pages": len(doc),
|
|
||||||
},
|
|
||||||
**{
|
|
||||||
k: doc.metadata[k]
|
|
||||||
for k in doc.metadata
|
|
||||||
if type(doc.metadata[k]) in [str, int]
|
|
||||||
},
|
|
||||||
),
|
|
||||||
)
|
|
||||||
for page in doc
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
|
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
|
||||||
@ -367,10 +348,10 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
contents = contents.replace("\\section{", "# ").replace("}", "")
|
contents = contents.replace("\\section{", "# ").replace("}", "")
|
||||||
# replace the "\" slash that Mathpix adds to escape $, %, (, etc.
|
# replace the "\" slash that Mathpix adds to escape $, %, (, etc.
|
||||||
contents = (
|
contents = (
|
||||||
contents.replace("\$", "$")
|
contents.replace(r"\$", "$")
|
||||||
.replace("\%", "%")
|
.replace(r"\%", "%")
|
||||||
.replace("\(", "(")
|
.replace(r"\(", "(")
|
||||||
.replace("\)", ")")
|
.replace(r"\)", ")")
|
||||||
)
|
)
|
||||||
return contents
|
return contents
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Union
|
from typing import Iterator, List, Union
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
@ -22,7 +22,7 @@ class TomlLoader(BaseLoader):
|
|||||||
"""Load and return all documents."""
|
"""Load and return all documents."""
|
||||||
return list(self.lazy_load())
|
return list(self.lazy_load())
|
||||||
|
|
||||||
def lazy_load(self) -> Iterable[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Lazily load the TOML documents from the source file or directory."""
|
"""Lazily load the TOML documents from the source file or directory."""
|
||||||
import tomli
|
import tomli
|
||||||
|
|
||||||
|
@ -0,0 +1,80 @@
|
|||||||
|
"""Tests for the various PDF parsers."""
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
from langchain.document_loaders.base import BaseBlobParser
|
||||||
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
|
from langchain.document_loaders.parsers.pdf import (
|
||||||
|
PDFMinerParser,
|
||||||
|
PyMuPDFParser,
|
||||||
|
PyPDFium2Parser,
|
||||||
|
PyPDFParser,
|
||||||
|
)
|
||||||
|
|
||||||
|
# PDFs to test parsers on.
|
||||||
|
HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
|
||||||
|
|
||||||
|
LAYOUT_PARSER_PAPER_PDF = (
|
||||||
|
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
    """Run the shared checks that every PDF parser is expected to pass.

    Args:
        parser (BaseBlobParser): The parser under test.
        splits_by_page (bool): Whether the parser, by default, emits one
            document per page (True) or one document per file (False).
    """
    # Single-page PDF: exactly one document with string content is expected.
    hello_iter = parser.lazy_parse(Blob.from_path(HELLO_PDF))
    assert isinstance(hello_iter, Iterator)
    hello_docs = list(hello_iter)
    assert len(hello_docs) == 1
    hello_text = hello_docs[0].page_content
    assert isinstance(hello_text, str)
    # Parsers differ in how much whitespace they emit, so only the prefix
    # is compared rather than the full content.
    assert hello_text.startswith("Hello world!")

    # Multi-page PDF: the document count depends on the splitting behaviour.
    paper_iter = parser.lazy_parse(Blob.from_path(LAYOUT_PARSER_PAPER_PDF))
    assert isinstance(paper_iter, Iterator)
    paper_docs = list(paper_iter)

    expected_count = 16 if splits_by_page else 1
    assert len(paper_docs) == expected_count

    first_doc = paper_docs[0]
    # Deliberately loose check: each parser yields slightly different text
    # for this page depending on its configuration.
    assert "LayoutParser" in first_doc.page_content

    first_metadata = first_doc.metadata
    assert first_metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF)

    if splits_by_page:
        assert first_metadata["page"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_pymupdf_loader() -> None:
|
||||||
|
"""Test PyMuPDF parser."""
|
||||||
|
_assert_with_parser(PyMuPDFParser())
|
||||||
|
|
||||||
|
|
||||||
|
def test_pypdf_parser() -> None:
|
||||||
|
"""Test PyPDF parser."""
|
||||||
|
_assert_with_parser(PyPDFParser())
|
||||||
|
|
||||||
|
|
||||||
|
def test_pdfminer_parser() -> None:
|
||||||
|
"""Test PDFMiner parser."""
|
||||||
|
# Does not follow defaults to split by page.
|
||||||
|
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
|
||||||
|
|
||||||
|
|
||||||
|
def test_pypdfium2_parser() -> None:
|
||||||
|
"""Test PyPDFium2 parser."""
|
||||||
|
# Follows the default of splitting by page.
|
||||||
|
_assert_with_parser(PyPDFium2Parser())
|
@ -0,0 +1,11 @@
|
|||||||
|
from langchain.document_loaders.parsers import __all__
|
||||||
|
|
||||||
|
|
||||||
|
def test_parsers_public_api_correct() -> None:
|
||||||
|
"""Test public API of parsers for breaking changes."""
|
||||||
|
assert set(__all__) == {
|
||||||
|
"PyPDFParser",
|
||||||
|
"PDFMinerParser",
|
||||||
|
"PyMuPDFParser",
|
||||||
|
"PyPDFium2Parser",
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user