langchain/libs/community/tests/integration_tests/document_loaders/test_dedoc.py

import os
from pathlib import Path

from langchain_community.document_loaders import (
    DedocAPIFileLoader,
    DedocFileLoader,
    DedocPDFLoader,
)

# String path of the ``examples`` folder that sits one level above the
# directory containing this test module.
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent.joinpath("examples/"))

# Sample files (one per format) iterated over by ``test_dedoc_file_loader``.
FILE_NAMES = [
    "example.html",
    "example.json",
    "fake-email-attachment.eml",
    "layout-parser-paper.pdf",
    "slack_export.zip",
    "stanley-cups.csv",
    "stanley-cups.xlsx",
    "whatsapp_chat.txt",
]


def test_dedoc_file_loader() -> None:
    """Load every sample in ``FILE_NAMES`` with ``DedocFileLoader``.

    Each file is loaded with ``split="document"`` and ``with_tables=False``
    and is expected to yield exactly one document.
    """
    for file_name in FILE_NAMES:
        file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
        loader = DedocFileLoader(
            file_path,
            split="document",
            with_tables=False,
            pdf_with_text_layer="tabby",
            pages=":1",
        )
        docs = loader.load()

        # One test covers many fixtures, so name the file that failed.
        assert len(docs) == 1, f"{file_name}: expected 1 document, got {len(docs)}"


def test_dedoc_pdf_loader() -> None:
    """Load the sample PDF with ``DedocPDFLoader`` for each text-layer mode.

    For both ``pdf_with_text_layer`` values (``"true"`` and ``"tabby"``) the
    loader, called with ``split="document"`` and ``with_tables=False``, is
    expected to yield exactly one document.
    """
    # The path does not depend on the mode, so build it once outside the loop.
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    for mode in ("true", "tabby"):
        loader = DedocPDFLoader(
            file_path,
            split="document",
            with_tables=False,
            pdf_with_text_layer=mode,
            pages=":1",
        )
        docs = loader.load()

        # Report which mode failed, since both share this single test.
        assert len(docs) == 1, f"mode={mode!r}: expected 1 document, got {len(docs)}"


def test_dedoc_content_html() -> None:
    """Check metadata and text of the first document loaded from the HTML sample.

    The file is loaded with ``split="line"`` and ``with_tables=False``.
    """
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.html")
    docs = DedocFileLoader(html_path, split="line", with_tables=False).load()

    first_doc = docs[0]
    assert first_doc.metadata["file_name"] == "example.html"
    assert first_doc.metadata["file_type"] == "text/html"
    # The first document must hold the first phrase and not the second one.
    assert "Instead of drinking water from the cat bowl" in first_doc.page_content
    assert "Chase the red dot" not in first_doc.page_content


def test_dedoc_content_pdf() -> None:
    """Check documents produced from the sample PDF with ``split="page"``.

    Verifies the total document count, the first document's metadata, text
    snippets in the first three documents, and that at least one document
    is tagged as a table carrying an HTML rendering in its metadata.
    """
    pdf_name = "layout-parser-paper.pdf"
    loader = DedocFileLoader(
        os.path.join(EXAMPLE_DOCS_DIRECTORY, pdf_name),
        split="page",
        pdf_with_text_layer="tabby",
        pages=":5",
    )
    docs = loader.load()
    # Documents whose metadata marks them as tables.
    tables = [doc for doc in docs if doc.metadata.get("type", "") == "table"]

    assert len(docs) == 6

    first_meta = docs[0].metadata
    assert first_meta["file_name"] == "layout-parser-paper.pdf"
    assert first_meta["file_type"] == "application/pdf"

    assert "This paper introduces LayoutParser, an open-source" in docs[0].page_content
    assert "layout detection [38, 22], table detection [26]" in docs[1].page_content
    assert "LayoutParser: A Uniﬁed Toolkit for DL-Based DIA" in docs[2].page_content

    assert len(tables) > 0
    html_fragment = '\n<tbody>\n<tr>\n<td colspan="1" rowspan="1">'
    assert html_fragment in tables[0].metadata["text_as_html"]


def test_dedoc_content_json() -> None:
    """Check count, metadata and text of documents loaded from the JSON sample.

    The file is loaded with ``split="node"``.
    """
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.json")
    docs = DedocFileLoader(json_path, split="node").load()

    assert len(docs) == 11

    head = docs[0]
    assert head.metadata["file_name"] == "example.json"
    assert head.metadata["file_type"] == "application/json"
    assert "Bye!" in head.page_content


def test_dedoc_content_txt() -> None:
    """Check documents loaded from the plain-text chat sample with ``split="line"``.

    Verifies the document count, the first document's metadata, and one
    expected snippet in each of the first three documents.
    """
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt")
    docs = DedocFileLoader(txt_path, split="line").load()

    assert len(docs) == 10
    assert docs[0].metadata["file_name"] == "whatsapp_chat.txt"
    assert docs[0].metadata["file_type"] == "text/plain"

    # Snippet expected in docs[0], docs[1] and docs[2], in that order.
    expected_snippets = (
        "[05.05.23, 15:48:11] James: Hi here",
        "[11/8/21, 9:41:32 AM] User name: Message 123",
        "1/23/23, 3:19 AM - User 2: Bye!",
    )
    for position, snippet in enumerate(expected_snippets):
        assert snippet in docs[position].page_content


def test_dedoc_table_handling() -> None:
    """Check the documents produced from the CSV sample with ``split="document"``.

    Two documents are expected; the second must be tagged as a table and
    expose both an HTML rendering (in metadata) and tab-separated text.
    """
    csv_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.csv")
    docs = DedocFileLoader(csv_path, split="document").load()

    assert len(docs) == 2

    text_meta = docs[0].metadata
    assert text_meta["file_name"] == "stanley-cups.csv"
    assert text_meta["file_type"] == "text/csv"

    table_doc = docs[1]
    assert table_doc.metadata["type"] == "table"
    assert '<td colspan="1" rowspan="1">1</td>' in table_doc.metadata["text_as_html"]
    assert "Maple Leafs\tTOR\t13" in table_doc.page_content


def test_dedoc_api_file_loader() -> None:
    """Check ``DedocAPIFileLoader`` on the plain-text chat sample.

    The loader is created with ``split="line"`` and the URL
    ``https://dedoc-readme.hf.space``; the assertions cover the document
    count, the first document's metadata, and one expected snippet in each
    of the first three documents.
    """
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt")
    api_loader = DedocAPIFileLoader(
        txt_path,
        split="line",
        url="https://dedoc-readme.hf.space",
    )
    docs = api_loader.load()

    assert len(docs) == 10
    assert docs[0].metadata["file_name"] == "whatsapp_chat.txt"
    assert docs[0].metadata["file_type"] == "text/plain"

    # Snippet expected in docs[0], docs[1] and docs[2], in that order.
    expected_snippets = (
        "[05.05.23, 15:48:11] James: Hi here",
        "[11/8/21, 9:41:32 AM] User name: Message 123",
        "1/23/23, 3:19 AM - User 2: Bye!",
    )
    for position, snippet in enumerate(expected_snippets):
        assert snippet in docs[position].page_content
community[minor]: added new document loaders based on dedoc library (#24303) ### Description This pull request adds new document loaders to load documents of various formats using [Dedoc](https://github.com/ispras/dedoc): - `DedocFileLoader` (determines file types automatically and parses them) - `DedocPDFLoader` (for parsing `PDF` files and images) - `DedocAPIFileLoader` (determines file types automatically and parses them using the Dedoc API, without installing the library) [Dedoc](https://dedoc.readthedocs.io) is an open-source library/service that extracts texts, tables, attached files and document structure (e.g., titles, list items, etc.) from files of various formats. The library is actively developed and maintained by a group of developers. `Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more. The full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1). For `PDF` documents, `Dedoc` can determine whether the textual layer is correct and split the document into paragraphs. ### Issue This pull request extends the variety of document loaders supported by `langchain_community`, allowing users to choose the most suitable option for parsing raw documents. ### Dependencies The PR adds a new (optional) dependency `dedoc>=2.2.5` ([library documentation](https://dedoc.readthedocs.io)) to `extended_testing_deps.txt`. ### Twitter handle None ### Add tests and docs 1. Test for the integration: `libs/community/tests/integration_tests/document_loaders/test_dedoc.py` 2. Example notebook: `docs/docs/integrations/document_loaders/dedoc.ipynb` 3. Information about the library: `docs/docs/integrations/providers/dedoc.mdx` ### Lint and test Done locally: - `make format` - `make lint` - `make integration_tests` - `make docs_build` (from the project root) --------- Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru> 2 months ago			`import os`
			`from pathlib import Path`

			`from langchain_community.document_loaders import (`
			`DedocAPIFileLoader,`
			`DedocFileLoader,`
			`DedocPDFLoader,`
			`)`

			`EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")`

			`FILE_NAMES = [`
			`"example.html",`
			`"example.json",`
			`"fake-email-attachment.eml",`
			`"layout-parser-paper.pdf",`
			`"slack_export.zip",`
			`"stanley-cups.csv",`
			`"stanley-cups.xlsx",`
			`"whatsapp_chat.txt",`
			`]`


			`def test_dedoc_file_loader() -> None:`
			`for file_name in FILE_NAMES:`
			`file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)`
			`loader = DedocFileLoader(`
			`file_path,`
			`split="document",`
			`with_tables=False,`
			`pdf_with_text_layer="tabby",`
			`pages=":1",`
			`)`
			`docs = loader.load()`

			`assert len(docs) == 1`


			`def test_dedoc_pdf_loader() -> None:`
			`file_name = "layout-parser-paper.pdf"`
			`for mode in ("true", "tabby"):`
			`file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)`
			`loader = DedocPDFLoader(`
			`file_path,`
			`split="document",`
			`with_tables=False,`
			`pdf_with_text_layer=mode,`
			`pages=":1",`
			`)`
			`docs = loader.load()`

			`assert len(docs) == 1`


			`def test_dedoc_content_html() -> None:`
			`file_name = "example.html"`
			`file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)`
			`loader = DedocFileLoader(`
			`file_path,`
			`split="line",`
			`with_tables=False,`
			`)`
			`docs = loader.load()`

			`assert docs[0].metadata["file_name"] == "example.html"`
			`assert docs[0].metadata["file_type"] == "text/html"`
			`assert "Instead of drinking water from the cat bowl" in docs[0].page_content`
			`assert "Chase the red dot" not in docs[0].page_content`


			`def test_dedoc_content_pdf() -> None:`
			`file_name = "layout-parser-paper.pdf"`
			`file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)`
			`loader = DedocFileLoader(`
			`file_path, split="page", pdf_with_text_layer="tabby", pages=":5"`
			`)`
			`docs = loader.load()`
			`table_list = [item for item in docs if item.metadata.get("type", "") == "table"]`

			`assert len(docs) == 6`
			`assert docs[0].metadata["file_name"] == "layout-parser-paper.pdf"`
			`assert docs[0].metadata["file_type"] == "application/pdf"`
			`assert "This paper introduces LayoutParser, an open-source" in docs[0].page_content`
			`assert "layout detection [38, 22], table detection [26]" in docs[1].page_content`
			`assert "LayoutParser: A Uniﬁed Toolkit for DL-Based DIA" in docs[2].page_content`
			`assert len(table_list) > 0`
			`assert (`
			`'\n<tbody>\n<tr>\n<td colspan="1" rowspan="1">'`
			`in table_list[0].metadata["text_as_html"]`
			`)`


			`def test_dedoc_content_json() -> None:`
			`file_name = "example.json"`
			`file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)`
			`loader = DedocFileLoader(file_path, split="node")`
			`docs = loader.load()`

			`assert len(docs) == 11`
			`assert docs[0].metadata["file_name"] == "example.json"`
			`assert docs[0].metadata["file_type"] == "application/json"`
			`assert "Bye!" in docs[0].page_content`


			`def test_dedoc_content_txt() -> None:`
			`file_name = "whatsapp_chat.txt"`
			`file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)`
			`loader = DedocFileLoader(file_path, split="line")`
			`docs = loader.load()`

			`assert len(docs) == 10`
			`assert docs[0].metadata["file_name"] == "whatsapp_chat.txt"`
			`assert docs[0].metadata["file_type"] == "text/plain"`
			`assert "[05.05.23, 15:48:11] James: Hi here" in docs[0].page_content`
			`assert "[11/8/21, 9:41:32 AM] User name: Message 123" in docs[1].page_content`
			`assert "1/23/23, 3:19 AM - User 2: Bye!" in docs[2].page_content`


			`def test_dedoc_table_handling() -> None:`
			`file_name = "stanley-cups.csv"`
			`file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)`
			`loader = DedocFileLoader(file_path, split="document")`
			`docs = loader.load()`

			`assert len(docs) == 2`
			`assert docs[0].metadata["file_name"] == "stanley-cups.csv"`
			`assert docs[0].metadata["file_type"] == "text/csv"`
			`assert docs[1].metadata["type"] == "table"`
			`assert '<td colspan="1" rowspan="1">1</td>' in docs[1].metadata["text_as_html"]`
			`assert "Maple Leafs\tTOR\t13" in docs[1].page_content`


			`def test_dedoc_api_file_loader() -> None:`
			`file_name = "whatsapp_chat.txt"`
			`file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)`
			`loader = DedocAPIFileLoader(`
			`file_path, split="line", url="https://dedoc-readme.hf.space"`
			`)`
			`docs = loader.load()`

			`assert len(docs) == 10`
			`assert docs[0].metadata["file_name"] == "whatsapp_chat.txt"`
			`assert docs[0].metadata["file_type"] == "text/plain"`
			`assert "[05.05.23, 15:48:11] James: Hi here" in docs[0].page_content`
			`assert "[11/8/21, 9:41:32 AM] User name: Message 123" in docs[1].page_content`
			`assert "1/23/23, 3:19 AM - User 2: Bye!" in docs[2].page_content`