"""Unit tests for html2text document transformer.""" import pytest from langchain_core.documents import Document from langchain_community.document_transformers import Html2TextTransformer @pytest.mark.requires("html2text") def test_transform_empty_html() -> None: html2text_transformer = Html2TextTransformer() empty_html = "" documents = [Document(page_content=empty_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "\n\n" @pytest.mark.requires("html2text") def test_extract_paragraphs() -> None: html2text_transformer = Html2TextTransformer() paragraphs_html = ( "
First paragraph.
" "Second paragraph.
First paragraph.
" "Middle of html tag" "Second paragraph.
" "End of html tag" "" ) documents = [Document(page_content=paragraphs_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "Begin of html tag\n\n" "# Header\n\n" "First paragraph.\n\n" "Middle of html tag\n\n" "Second paragraph.\n\n" "End of html tag\n\n" ) @pytest.mark.requires("html2text") def test_remove_style() -> None: html2text_transformer = Html2TextTransformer() with_style_html = ( "First paragraph.
" ) documents = [Document(page_content=with_style_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "First paragraph.\n\n" @pytest.mark.requires("html2text") def test_ignore_links() -> None: html2text_transformer = Html2TextTransformer(ignore_links=False) multiple_tags_html = ( "First paragraph with an example
" ) documents = [Document(page_content=multiple_tags_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "# First heading.\n\n" "First paragraph with an [example](http://example.com)\n\n" ) html2text_transformer = Html2TextTransformer(ignore_links=True) docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "# First heading.\n\n" "First paragraph with an example\n\n" ) @pytest.mark.requires("html2text") def test_ignore_images() -> None: html2text_transformer = Html2TextTransformer(ignore_images=False) multiple_tags_html = ( "First paragraph with an " "
" ) documents = [Document(page_content=multiple_tags_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "# First heading.\n\n" "First paragraph with an ![Example image](example.jpg)\n\n" ) html2text_transformer = Html2TextTransformer(ignore_images=True) docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "# First heading.\n\n" "First paragraph with an\n\n" )