diff --git a/langchain/document_loaders/base.py b/langchain/document_loaders/base.py
index 9d92b641..d4e9fed7 100644
--- a/langchain/document_loaders/base.py
+++ b/langchain/document_loaders/base.py
@@ -1,6 +1,6 @@
 """Abstract interface for document loader implementations."""
 from abc import ABC, abstractmethod
-from typing import Iterable, Iterator, List, Optional
+from typing import Iterator, List, Optional
 
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.schema import Document
@@ -39,7 +39,7 @@ class BaseLoader(ABC):
     # implemented in all the existing subclasses.
     def lazy_load(
         self,
-    ) -> Iterable[Document]:
+    ) -> Iterator[Document]:
         """A lazy loader for document content."""
         raise NotImplementedError(
             f"{self.__class__.__name__} does not implement lazy_load()"
diff --git a/langchain/document_loaders/parsers/__init__.py b/langchain/document_loaders/parsers/__init__.py
index e69de29b..b79b4942 100644
--- a/langchain/document_loaders/parsers/__init__.py
+++ b/langchain/document_loaders/parsers/__init__.py
@@ -0,0 +1,8 @@
+from langchain.document_loaders.parsers.pdf import (
+    PDFMinerParser,
+    PyMuPDFParser,
+    PyPDFium2Parser,
+    PyPDFParser,
+)
+
+__all__ = ["PyPDFParser", "PDFMinerParser", "PyMuPDFParser", "PyPDFium2Parser"]
diff --git a/langchain/document_loaders/parsers/pdf.py b/langchain/document_loaders/parsers/pdf.py
new file mode 100644
index 00000000..dcc729bd
--- /dev/null
+++ b/langchain/document_loaders/parsers/pdf.py
@@ -0,0 +1,101 @@
+"""Module contains common parsers for PDFs."""
+from typing import Any, Iterator, Mapping, Optional
+
+from langchain.document_loaders.base import BaseBlobParser
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.schema import Document
+
+
+class PyPDFParser(BaseBlobParser):
+    """Loads a PDF with pypdf and chunks at character level."""
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+        import pypdf
+
+        with blob.as_bytes_io() as pdf_file_obj:
+            pdf_reader = pypdf.PdfReader(pdf_file_obj)
+            yield from [
+                Document(
+                    page_content=page.extract_text(),
+                    metadata={"source": blob.source, "page": page_number},
+                )
+                for page_number, page in enumerate(pdf_reader.pages)
+            ]
+
+
+class PDFMinerParser(BaseBlobParser):
+    """Parse PDFs with PDFMiner."""
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+        from pdfminer.high_level import extract_text
+
+        with blob.as_bytes_io() as pdf_file_obj:
+            text = extract_text(pdf_file_obj)
+            metadata = {"source": blob.source}
+            yield Document(page_content=text, metadata=metadata)
+
+
+class PyMuPDFParser(BaseBlobParser):
+    """Parse PDFs with PyMuPDF."""
+
+    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
+        """Initialize the parser.
+
+        Args:
+            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
+        """
+        self.text_kwargs = text_kwargs or {}
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+        import fitz
+
+        with blob.as_bytes_io() as file_path:
+            doc = fitz.open(file_path)  # open document
+
+            yield from [
+                Document(
+                    page_content=page.get_text(**self.text_kwargs),
+                    metadata=dict(
+                        {
+                            "source": blob.source,
+                            "file_path": blob.source,
+                            "page": page.number,
+                            "total_pages": len(doc),
+                        },
+                        **{
+                            k: doc.metadata[k]
+                            for k in doc.metadata
+                            if type(doc.metadata[k]) in [str, int]
+                        },
+                    ),
+                )
+                for page in doc
+            ]
+
+
+class PyPDFium2Parser(BaseBlobParser):
+    """Parse PDFs with PyPDFium2."""
+
+    def __init__(self) -> None:
+        """Initialize the parser."""
+        try:
+            import pypdfium2  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "pypdfium2 package not found, please install it with"
+                " `pip install pypdfium2`"
+            )
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob."""
+        import pypdfium2
+
+        with blob.as_bytes_io() as f:
+            pdf_reader = pypdfium2.PdfDocument(f)
+            for page_number, page in enumerate(pdf_reader):
+                content = page.get_textpage().get_text_range()
+                metadata = {"source": blob.source, "page": page_number}
+                yield Document(page_content=content, metadata=metadata)
diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py
index d950f3ff..e2763205 100644
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@@ -7,13 +7,20 @@ import time
 from abc import ABC
 from io import StringIO
 from pathlib import Path
-from typing import Any, List, Optional
+from typing import Any, Iterator, List, Optional
 from urllib.parse import urlparse
 
 import requests
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.document_loaders.parsers.pdf import (
+    PDFMinerParser,
+    PyMuPDFParser,
+    PyPDFium2Parser,
+    PyPDFParser,
+)
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 from langchain.utils import get_from_dict_or_env
 
@@ -90,7 +97,7 @@ class PyPDFLoader(BasePDFLoader):
     Loader also stores page numbers in metadatas.
     """
 
-    def __init__(self, file_path: str):
+    def __init__(self, file_path: str) -> None:
         """Initialize with file path."""
         try:
             import pypdf  # noqa:F401
@@ -98,21 +105,19 @@ class PyPDFLoader(BasePDFLoader):
             raise ValueError(
                 "pypdf package not found, please install it with " "`pip install pypdf`"
             )
+        self.parser = PyPDFParser()
         super().__init__(file_path)
 
     def load(self) -> List[Document]:
         """Load given path as pages."""
-        import pypdf
+        return list(self.lazy_load())
 
-        with open(self.file_path, "rb") as pdf_file_obj:
-            pdf_reader = pypdf.PdfReader(pdf_file_obj)
-            return [
-                Document(
-                    page_content=page.extract_text(),
-                    metadata={"source": self.file_path, "page": i},
-                )
-                for i, page in enumerate(pdf_reader.pages)
-            ]
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazy load given path as pages."""
+        blob = Blob.from_path(self.file_path)
+        yield from self.parser.parse(blob)
 
 
 class PyPDFium2Loader(BasePDFLoader):
@@ -120,27 +125,19 @@ class PyPDFium2Loader(BasePDFLoader):
 
     def __init__(self, file_path: str):
         """Initialize with file path."""
-        try:
-            import pypdfium2  # noqa:F401
-        except ImportError:
-            raise ValueError(
-                "pypdfium2 package not found, please install it with"
-                " `pip install pypdfium2`"
-            )
         super().__init__(file_path)
+        self.parser = PyPDFium2Parser()
 
     def load(self) -> List[Document]:
         """Load given path as pages."""
-        import pypdfium2
+        return list(self.lazy_load())
 
-        with open(self.file_path, "rb") as f:
-            pdf_reader = pypdfium2.PdfDocument(f)
-            docs = []
-            for i, page in enumerate(pdf_reader):
-                content = page.get_textpage().get_text_range()
-                metadata = {"source": self.file_path, "page": i}
-                docs.append(Document(page_content=content, metadata=metadata))
-            return docs
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazy load given path as pages."""
+        blob = Blob.from_path(self.file_path)
+        yield from self.parser.parse(blob)
 
 
 class PyPDFDirectoryLoader(BaseLoader):
@@ -191,7 +188,7 @@ class PyPDFDirectoryLoader(BaseLoader):
 class PDFMinerLoader(BasePDFLoader):
     """Loader that uses PDFMiner to load PDF files."""
 
-    def __init__(self, file_path: str):
+    def __init__(self, file_path: str) -> None:
         """Initialize with file path."""
         try:
             from pdfminer.high_level import extract_text  # noqa:F401
@@ -202,14 +199,18 @@ class PDFMinerLoader(BasePDFLoader):
             )
 
         super().__init__(file_path)
+        self.parser = PDFMinerParser()
 
     def load(self) -> List[Document]:
-        """Load file."""
-        from pdfminer.high_level import extract_text
+        """Eagerly load the content."""
+        return list(self.lazy_load())
 
-        text = extract_text(self.file_path)
-        metadata = {"source": self.file_path}
-        return [Document(page_content=text, metadata=metadata)]
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Lazily load documents."""
+        blob = Blob.from_path(self.file_path)
+        yield from self.parser.parse(blob)
 
 
 class PDFMinerPDFasHTMLLoader(BasePDFLoader):
@@ -249,7 +250,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
 class PyMuPDFLoader(BasePDFLoader):
     """Loader that uses PyMuPDF to load PDF files."""
 
-    def __init__(self, file_path: str):
+    def __init__(self, file_path: str) -> None:
         """Initialize with file path."""
         try:
             import fitz  # noqa:F401
@@ -263,30 +264,10 @@ class PyMuPDFLoader(BasePDFLoader):
 
     def load(self, **kwargs: Optional[Any]) -> List[Document]:
         """Load file."""
-        import fitz
-
-        doc = fitz.open(self.file_path)  # open document
-        file_path = self.file_path if self.web_path is None else self.web_path
-
-        return [
-            Document(
-                page_content=page.get_text(**kwargs).encode("utf-8"),
-                metadata=dict(
-                    {
-                        "source": file_path,
-                        "file_path": file_path,
-                        "page_number": page.number + 1,
-                        "total_pages": len(doc),
-                    },
-                    **{
-                        k: doc.metadata[k]
-                        for k in doc.metadata
-                        if type(doc.metadata[k]) in [str, int]
-                    },
-                ),
-            )
-            for page in doc
-        ]
+        parser = PyMuPDFParser(text_kwargs=kwargs)
+        blob = Blob.from_path(self.file_path)
+        return parser.parse(blob)
 
 
 # MathpixPDFLoader implementation taken largely from Daniel Gross's:
@@ -367,10 +348,10 @@ class MathpixPDFLoader(BasePDFLoader):
         contents = contents.replace("\\section{", "# ").replace("}", "")
         # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
         contents = (
-            contents.replace("\$", "$")
-            .replace("\%", "%")
-            .replace("\(", "(")
-            .replace("\)", ")")
+            contents.replace(r"\$", "$")
+            .replace(r"\%", "%")
+            .replace(r"\(", "(")
+            .replace(r"\)", ")")
         )
         return contents
 
diff --git a/langchain/document_loaders/toml.py b/langchain/document_loaders/toml.py
index 1a36eb6e..0f52d314 100644
--- a/langchain/document_loaders/toml.py
+++ b/langchain/document_loaders/toml.py
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Iterable, List, Union
+from typing import Iterator, List, Union
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@@ -22,7 +22,7 @@ class TomlLoader(BaseLoader):
         """Load and return all documents."""
         return list(self.lazy_load())
 
-    def lazy_load(self) -> Iterable[Document]:
+    def lazy_load(self) -> Iterator[Document]:
         """Lazily load the TOML documents from the source file or directory."""
         import tomli
 
diff --git a/tests/integration_tests/document_loaders/parsers/__init__.py b/tests/integration_tests/document_loaders/parsers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
new file mode 100644
index 00000000..f847fb82
--- /dev/null
+++ b/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -0,0 +1,80 @@
+"""Tests for the various PDF parsers."""
+from pathlib import Path
+from typing import Iterator
+
+from langchain.document_loaders.base import BaseBlobParser
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.document_loaders.parsers.pdf import (
+    PDFMinerParser,
+    PyMuPDFParser,
+    PyPDFium2Parser,
+    PyPDFParser,
+)
+
+# PDFs to test parsers on.
+HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
+
+LAYOUT_PARSER_PAPER_PDF = (
+    Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
+)
+
+
+def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
+    """Standard tests to verify that the given parser works.
+
+    Args:
+        parser (BaseBlobParser): The parser to test.
+        splits_by_page (bool): Whether the parser splits by page by default.
+    """
+    blob = Blob.from_path(HELLO_PDF)
+    doc_generator = parser.lazy_parse(blob)
+    assert isinstance(doc_generator, Iterator)
+    docs = list(doc_generator)
+    assert len(docs) == 1
+    page_content = docs[0].page_content
+    assert isinstance(page_content, str)
+    # The different parsers return different amounts of whitespace, so use
+    # startswith instead of an exact equality check.
+    assert docs[0].page_content.startswith("Hello world!")
+
+    blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
+    doc_generator = parser.lazy_parse(blob)
+    assert isinstance(doc_generator, Iterator)
+    docs = list(doc_generator)
+
+    if splits_by_page:
+        assert len(docs) == 16
+    else:
+        assert len(docs) == 1
+    # Test is imprecise since the parsers yield different parse information depending
+    # on configuration. Each parser seems to yield a slightly different result
+    # for this page!
+    assert "LayoutParser" in docs[0].page_content
+    metadata = docs[0].metadata
+
+    assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF)
+
+    if splits_by_page:
+        assert metadata["page"] == 0
+
+
+def test_pymupdf_parser() -> None:
+    """Test PyMuPDF parser."""
+    _assert_with_parser(PyMuPDFParser())
+
+
+def test_pypdf_parser() -> None:
+    """Test PyPDF parser."""
+    _assert_with_parser(PyPDFParser())
+
+
+def test_pdfminer_parser() -> None:
+    """Test PDFMiner parser."""
+    # PDFMiner does not split by page by default.
+    _assert_with_parser(PDFMinerParser(), splits_by_page=False)
+
+
+def test_pypdfium2_parser() -> None:
+    """Test PyPDFium2 parser."""
+    # PyPDFium2 splits by page by default.
+    _assert_with_parser(PyPDFium2Parser())
diff --git a/tests/integration_tests/document_loaders/parsers/test_public_api.py b/tests/integration_tests/document_loaders/parsers/test_public_api.py
new file mode 100644
index 00000000..52ce7e8e
--- /dev/null
+++ b/tests/integration_tests/document_loaders/parsers/test_public_api.py
@@ -0,0 +1,11 @@
+from langchain.document_loaders.parsers import __all__
+
+
+def test_parsers_public_api_correct() -> None:
+    """Test public API of parsers for breaking changes."""
+    assert set(__all__) == {
+        "PyPDFParser",
+        "PDFMinerParser",
+        "PyMuPDFParser",
+        "PyPDFium2Parser",
+    }
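
Usage sketch (reviewer note, not part of the patch): the snippet below shows how the pieces introduced in this diff fit together, using only names that appear in the patch; the PDF path is hypothetical.

from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers import PyMuPDFParser

# Loaders keep their existing interface; load() now just materializes lazy_load().
loader = PyPDFLoader("example.pdf")  # hypothetical local file
for doc in loader.lazy_load():
    print(doc.metadata["page"], len(doc.page_content))

# Parsers work on Blobs directly, decoupling where the bytes come from
# (filesystem, web, etc.) from how the bytes are parsed.
blob = Blob.from_path("example.pdf")
parser = PyMuPDFParser()
docs = parser.parse(blob)        # eager: list of Documents
pages = parser.lazy_parse(blob)  # lazy: iterator of Documents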