mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
87802c86d9
MHTML is a very interesting format since it's used both for emails and for archived webpages. Some scraping projects want to store pages on disk to process them later; MHTML is perfect for that use case. This is heavily inspired by the BeautifulSoup HTML loader, but extracts the HTML part from the MHTML file. --------- Co-authored-by: rlm <pexpresss31@gmail.com>
26 lines
654 B
Python
26 lines
654 B
Python
from pathlib import Path
|
||
|
||
import pytest
|
||
|
||
from langchain.document_loaders.mhtml import MHTMLLoader
|
||
|
||
# Directory that contains this test module.
HERE = Path(__file__).parent
# Example fixtures: two directories above HERE, then integration_tests/examples.
EXAMPLES = HERE.parent.parent.joinpath("integration_tests", "examples")
|
||
|
||
|
||
@pytest.mark.requires("bs4", "lxml")
def test_mhtml_loader() -> None:
    """Load the example ``.mht`` file and check the single resulting document."""
    file_path = EXAMPLES / "example.mht"

    # The loader is given the path as a plain string.
    docs = MHTMLLoader(str(file_path)).load()
    assert len(docs) == 1

    document = docs[0]
    metadata = document.metadata
    content = document.page_content

    # Metadata carries the page title and the path the loader was given.
    assert metadata["title"] == "LangChain"
    assert metadata["source"] == str(file_path)
    # The extracted text must include the page's headline.
    assert "LANG CHAIN 🦜️🔗Official Home Page" in content
|