import os from pathlib import Path from langchain_community.document_loaders import ( DedocAPIFileLoader, DedocFileLoader, DedocPDFLoader, ) EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/") FILE_NAMES = [ "example.html", "example.json", "fake-email-attachment.eml", "layout-parser-paper.pdf", "slack_export.zip", "stanley-cups.csv", "stanley-cups.xlsx", "whatsapp_chat.txt", ] def test_dedoc_file_loader() -> None: for file_name in FILE_NAMES: file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) loader = DedocFileLoader( file_path, split="document", with_tables=False, pdf_with_text_layer="tabby", pages=":1", ) docs = loader.load() assert len(docs) == 1 def test_dedoc_pdf_loader() -> None: file_name = "layout-parser-paper.pdf" for mode in ("true", "tabby"): file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) loader = DedocPDFLoader( file_path, split="document", with_tables=False, pdf_with_text_layer=mode, pages=":1", ) docs = loader.load() assert len(docs) == 1 def test_dedoc_content_html() -> None: file_name = "example.html" file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) loader = DedocFileLoader( file_path, split="line", with_tables=False, ) docs = loader.load() assert docs[0].metadata["file_name"] == "example.html" assert docs[0].metadata["file_type"] == "text/html" assert "Instead of drinking water from the cat bowl" in docs[0].page_content assert "Chase the red dot" not in docs[0].page_content def test_dedoc_content_pdf() -> None: file_name = "layout-parser-paper.pdf" file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name) loader = DedocFileLoader( file_path, split="page", pdf_with_text_layer="tabby", pages=":5" ) docs = loader.load() table_list = [item for item in docs if item.metadata.get("type", "") == "table"] assert len(docs) == 6 assert docs[0].metadata["file_name"] == "layout-parser-paper.pdf" assert docs[0].metadata["file_type"] == "application/pdf" assert "This paper introduces LayoutParser, an open-source" in docs[0].page_content assert "layout detection [38, 22], table detection [26]" in docs[1].page_content assert "LayoutParser: A Uniļ¬ed Toolkit for DL-Based DIA" in docs[2].page_content assert len(table_list) > 0 assert ( '\n
\n