You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
langchain/libs/community/tests/integration_tests/document_loaders/test_dedoc.py

147 lines
4.8 KiB
Python

community[minor]: added new document loaders based on dedoc library (#24303)

### Description

This pull request added new document loaders to load documents of various formats using [Dedoc](https://github.com/ispras/dedoc):

- `DedocFileLoader` (determines file types automatically and parses them)
- `DedocPDFLoader` (for parsing `PDF` files and images)
- `DedocAPIFileLoader` (determines file types automatically and parses them using the Dedoc API, without installing the library)

[Dedoc](https://dedoc.readthedocs.io) is an open-source library/service that extracts texts, tables, attached files and document structure (e.g., titles, list items, etc.) from files of various formats. The library is actively developed and maintained by a group of developers.

`Dedoc` supports `DOCX`, `XLSX`, `PPTX`, `EML`, `HTML`, `PDF`, images and more. The full list of supported formats can be found [here](https://dedoc.readthedocs.io/en/latest/#id1). For `PDF` documents, `Dedoc` makes it possible to determine the correctness of the textual layer and to split the document into paragraphs.

### Issue

This pull request extends the variety of document loaders supported by `langchain_community`, allowing users to choose the most suitable option for raw document parsing.

### Dependencies

The PR added a new (optional) dependency `dedoc>=2.2.5` ([library documentation](https://dedoc.readthedocs.io)) to the `extended_testing_deps.txt`

### Twitter handle

None

### Add tests and docs

1. Test for the integration: `libs/community/tests/integration_tests/document_loaders/test_dedoc.py`
2. Example notebook: `docs/docs/integrations/document_loaders/dedoc.ipynb`
3. Information about the library: `docs/docs/integrations/providers/dedoc.mdx`

### Lint and test

Done locally:

- `make format`
- `make lint`
- `make integration_tests`
- `make docs_build` (from the project root)

---------

Co-authored-by: Nasty <bogatenkova.anastasiya@mail.ru>
2 months ago
import os
from pathlib import Path
from langchain_community.document_loaders import (
DedocAPIFileLoader,
DedocFileLoader,
DedocPDFLoader,
)
# Sample documents live in the "examples" folder two levels above this file's
# own location (i.e. a sibling of this file's parent directory).
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent.joinpath("examples/"))

# Sample files iterated over by the generic `test_dedoc_file_loader` smoke test.
FILE_NAMES = [
    "example.html",
    "example.json",
    "fake-email-attachment.eml",
    "layout-parser-paper.pdf",
    "slack_export.zip",
    "stanley-cups.csv",
    "stanley-cups.xlsx",
    "whatsapp_chat.txt",
]
def test_dedoc_file_loader() -> None:
    """Smoke-test `DedocFileLoader` on every sample file listed in `FILE_NAMES`.

    Each file is loaded with ``split="document"`` and ``with_tables=False``, and
    the loader is expected to return exactly one document per file.
    """
    for file_name in FILE_NAMES:
        file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
        loader = DedocFileLoader(
            file_path,
            split="document",
            with_tables=False,
            pdf_with_text_layer="tabby",
            pages=":1",
        )
        docs = loader.load()
        # Name the offending file: a bare assert inside the loop would not say
        # which sample produced the unexpected number of documents.
        assert len(docs) == 1, f"{file_name}: expected 1 document, got {len(docs)}"
def test_dedoc_pdf_loader() -> None:
    """Smoke-test `DedocPDFLoader` on the sample PDF for two text-layer modes.

    The same file is loaded once with ``pdf_with_text_layer="true"`` and once
    with ``"tabby"``; each run is expected to return exactly one document.
    """
    file_name = "layout-parser-paper.pdf"
    # The path does not depend on the mode, so build it once outside the loop.
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, file_name)
    for mode in ("true", "tabby"):
        loader = DedocPDFLoader(
            file_path,
            split="document",
            with_tables=False,
            pdf_with_text_layer=mode,
            pages=":1",
        )
        docs = loader.load()
        # Include the mode so a failure identifies which configuration broke.
        assert len(docs) == 1, (
            f"pdf_with_text_layer={mode!r}: expected 1 document, got {len(docs)}"
        )
def test_dedoc_content_html() -> None:
    """Check metadata and text of the first line-split document of example.html."""
    html_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.html")
    docs = DedocFileLoader(html_path, split="line", with_tables=False).load()

    first_doc = docs[0]
    metadata = first_doc.metadata
    assert metadata["file_name"] == "example.html"
    assert metadata["file_type"] == "text/html"

    text = first_doc.page_content
    assert "Instead of drinking water from the cat bowl" in text
    assert "Chase the red dot" not in text
def test_dedoc_content_pdf() -> None:
    """Check page-split output of the sample PDF, including extracted tables."""
    pdf_name = "layout-parser-paper.pdf"
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, pdf_name)
    docs = DedocFileLoader(
        pdf_path, split="page", pdf_with_text_layer="tabby", pages=":5"
    ).load()

    # Documents whose metadata marks them as tables.
    tables = [doc for doc in docs if doc.metadata.get("type", "") == "table"]

    assert len(docs) == 6

    first_meta = docs[0].metadata
    assert first_meta["file_name"] == "layout-parser-paper.pdf"
    assert first_meta["file_type"] == "application/pdf"

    assert "This paper introduces LayoutParser, an open-source" in docs[0].page_content
    assert "layout detection [38, 22], table detection [26]" in docs[1].page_content
    assert "LayoutParser: A Unified Toolkit for DL-Based DIA" in docs[2].page_content

    assert tables
    expected_fragment = '\n<tbody>\n<tr>\n<td colspan="1" rowspan="1">'
    assert expected_fragment in tables[0].metadata["text_as_html"]
def test_dedoc_content_json() -> None:
    """Check document count, metadata and text for node-split example.json."""
    json_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.json")
    docs = DedocFileLoader(json_path, split="node").load()

    assert len(docs) == 11

    head = docs[0]
    assert head.metadata["file_name"] == "example.json"
    assert head.metadata["file_type"] == "application/json"
    assert "Bye!" in head.page_content
def test_dedoc_content_txt() -> None:
    """Check document count, metadata and leading lines of whatsapp_chat.txt."""
    txt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt")
    docs = DedocFileLoader(txt_path, split="line").load()

    assert len(docs) == 10

    first_meta = docs[0].metadata
    assert first_meta["file_name"] == "whatsapp_chat.txt"
    assert first_meta["file_type"] == "text/plain"

    # Expected text fragments of the first three documents, in order.
    expected_fragments = (
        "[05.05.23, 15:48:11] James: Hi here",
        "[11/8/21, 9:41:32 AM] User name: Message 123",
        "1/23/23, 3:19 AM - User 2: Bye!",
    )
    for doc, fragment in zip(docs, expected_fragments):
        assert fragment in doc.page_content
def test_dedoc_table_handling() -> None:
    """Check that stanley-cups.csv loads as a document plus a table document."""
    csv_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.csv")
    docs = DedocFileLoader(csv_path, split="document").load()

    assert len(docs) == 2

    main_meta = docs[0].metadata
    assert main_meta["file_name"] == "stanley-cups.csv"
    assert main_meta["file_type"] == "text/csv"

    # The second document is expected to carry the table itself.
    table_doc = docs[1]
    assert table_doc.metadata["type"] == "table"
    expected_cell = '<td colspan="1" rowspan="1">1</td>'
    assert expected_cell in table_doc.metadata["text_as_html"]
    assert "Maple Leafs\tTOR\t13" in table_doc.page_content
def test_dedoc_api_file_loader() -> None:
    """Check `DedocAPIFileLoader` output for whatsapp_chat.txt split by line.

    The loader is pointed at the ``https://dedoc-readme.hf.space`` URL.
    NOTE(review): presumably this needs network access to that service - confirm.
    """
    chat_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt")
    api_loader = DedocAPIFileLoader(
        chat_path, split="line", url="https://dedoc-readme.hf.space"
    )
    docs = api_loader.load()

    assert len(docs) == 10

    head = docs[0]
    assert head.metadata["file_name"] == "whatsapp_chat.txt"
    assert head.metadata["file_type"] == "text/plain"

    # Expected text fragments keyed by the index of the document they appear in.
    expected_by_index = {
        0: "[05.05.23, 15:48:11] James: Hi here",
        1: "[11/8/21, 9:41:32 AM] User name: Message 123",
        2: "1/23/23, 3:19 AM - User 2: Bye!",
    }
    for index, fragment in expected_by_index.items():
        assert fragment in docs[index].page_content