"""Unit tests for html2text document transformer.""" import pytest from langchain_core.documents import Document from langchain_community.document_transformers import Html2TextTransformer @pytest.mark.requires("html2text") def test_transform_empty_html() -> None: html2text_transformer = Html2TextTransformer() empty_html = "" documents = [Document(page_content=empty_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "\n\n" @pytest.mark.requires("html2text") def test_extract_paragraphs() -> None: html2text_transformer = Html2TextTransformer() paragraphs_html = ( "

Header

First paragraph.

" "

Second paragraph.

Ignore at end

" ) documents = [Document(page_content=paragraphs_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "# Header\n\n" "First paragraph.\n\n" "Second paragraph.\n\n" "# Ignore at end\n\n" ) @pytest.mark.requires("html2text") def test_extract_html() -> None: html2text_transformer = Html2TextTransformer() paragraphs_html = ( "Begin of html tag" "

Header

" "

First paragraph.

" "Middle of html tag" "

Second paragraph.

" "End of html tag" "" ) documents = [Document(page_content=paragraphs_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "Begin of html tag\n\n" "# Header\n\n" "First paragraph.\n\n" "Middle of html tag\n\n" "Second paragraph.\n\n" "End of html tag\n\n" ) @pytest.mark.requires("html2text") def test_remove_style() -> None: html2text_transformer = Html2TextTransformer() with_style_html = ( "

First paragraph.

" ) documents = [Document(page_content=with_style_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "First paragraph.\n\n" @pytest.mark.requires("html2text") def test_ignore_links() -> None: html2text_transformer = Html2TextTransformer(ignore_links=False) multiple_tags_html = ( "

First heading.

" "

First paragraph with an example

" ) documents = [Document(page_content=multiple_tags_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "# First heading.\n\n" "First paragraph with an [example](http://example.com)\n\n" ) html2text_transformer = Html2TextTransformer(ignore_links=True) docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "# First heading.\n\n" "First paragraph with an example\n\n" ) @pytest.mark.requires("html2text") def test_ignore_images() -> None: html2text_transformer = Html2TextTransformer(ignore_images=False) multiple_tags_html = ( "

First heading.

" "

First paragraph with an " "Example image

" ) documents = [Document(page_content=multiple_tags_html)] docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "# First heading.\n\n" "First paragraph with an ![Example image](example.jpg)\n\n" ) html2text_transformer = Html2TextTransformer(ignore_images=True) docs_transformed = html2text_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "# First heading.\n\n" "First paragraph with an\n\n" )