mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
00c6ec8a2d
# Fix Telegram API loader + add tests. I was testing this integration and it was broken with the following error: ```python message_threads = loader._get_message_threads(df) KeyError: False ``` Also, this particular loader didn't have any tests / related group in poetry, so I added those as well. @hwchase17 / @eyurtsev please take a look at this fix PR. --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
80 lines
2.4 KiB
Python
80 lines
2.4 KiB
Python
"""Tests for the various PDF parsers."""
|
|
from typing import Iterator
|
|
|
|
import pytest
|
|
|
|
from langchain.document_loaders.base import BaseBlobParser
|
|
from langchain.document_loaders.blob_loaders import Blob
|
|
from langchain.document_loaders.parsers.pdf import (
|
|
PDFMinerParser,
|
|
PyMuPDFParser,
|
|
PyPDFium2Parser,
|
|
PyPDFParser,
|
|
)
|
|
from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF
|
|
|
|
|
|
def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
    """Run the shared sanity checks against a PDF blob parser.

    The parser is exercised on two fixture PDFs: a small "Hello world!" file
    and the LayoutParser paper.

    Args:
        parser (BaseBlobParser): The parser under test.
        splits_by_page (bool): True when the parser is expected to emit one
            document per page by default, False when it is expected to emit a
            single document for the whole file.
    """
    # --- Hello-world fixture: exactly one document is expected. ---
    hello_blob = Blob.from_path(HELLO_PDF)
    hello_iter = parser.lazy_parse(hello_blob)
    assert isinstance(hello_iter, Iterator)
    hello_docs = list(hello_iter)
    assert len(hello_docs) == 1
    hello_text = hello_docs[0].page_content
    assert isinstance(hello_text, str)
    # Parsers disagree on how much whitespace they emit, so only the prefix
    # is compared rather than the full string.
    assert hello_text.startswith("Hello world!")

    # --- LayoutParser paper fixture. ---
    paper_blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
    paper_iter = parser.lazy_parse(paper_blob)
    assert isinstance(paper_iter, Iterator)
    paper_docs = list(paper_iter)

    # 16 documents when splitting per page, otherwise one for the whole file.
    expected_doc_count = 16 if splits_by_page else 1
    assert len(paper_docs) == expected_doc_count

    first_doc = paper_docs[0]
    # Deliberately loose check: each parser yields slightly different text for
    # this page depending on its configuration.
    assert "LayoutParser" in first_doc.page_content

    doc_metadata = first_doc.metadata
    assert doc_metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF)

    # The page index is only asserted for parsers that split by page.
    if splits_by_page:
        assert doc_metadata["page"] == 0
|
|
|
|
|
|
@pytest.mark.requires("pypdf")
def test_pypdf_parser() -> None:
    """Run the shared parser checks against PyPDFParser."""
    pypdf_parser = PyPDFParser()
    # Uses the helper's default expectation (splits_by_page=True).
    _assert_with_parser(pypdf_parser)
|
|
|
|
|
|
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
    """Run the shared parser checks against PDFMinerParser."""
    pdfminer_parser = PDFMinerParser()
    # This parser does not split by page by default, so the helper is told to
    # expect a single document for the whole file.
    _assert_with_parser(pdfminer_parser, splits_by_page=False)
|
|
|
|
|
|
@pytest.mark.requires("fitz")  # package is PyMuPDF
def test_pymupdf_loader() -> None:
    """Run the shared parser checks against PyMuPDFParser."""
    pymupdf_parser = PyMuPDFParser()
    # Uses the helper's default expectation (splits_by_page=True).
    _assert_with_parser(pymupdf_parser)
|
|
|
|
|
|
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
    """Run the shared parser checks against PyPDFium2Parser."""
    # This test checks the split-by-page expectation (the helper's default).
    # The flag is passed explicitly so the intent is visible at the call site;
    # an earlier comment here claimed the opposite, contradicting the call.
    _assert_with_parser(PyPDFium2Parser(), splits_by_page=True)
|