"""Unit tests for beautiful soup document transformer.""" import pytest from langchain_core.documents import Document from langchain_community.document_transformers import BeautifulSoupTransformer @pytest.mark.requires("bs4") def test_transform_empty_html() -> None: bs_transformer = BeautifulSoupTransformer() empty_html = "" documents = [Document(page_content=empty_html)] docs_transformed = bs_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "" @pytest.mark.requires("bs4") def test_extract_paragraphs() -> None: bs_transformer = BeautifulSoupTransformer() paragraphs_html = ( "

Header

First paragraph.

" "

Second paragraph.

Ignore at end

" ) documents = [Document(page_content=paragraphs_html)] docs_transformed = bs_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "First paragraph. Second paragraph." @pytest.mark.requires("bs4") def test_strip_whitespace() -> None: bs_transformer = BeautifulSoupTransformer() paragraphs_html = ( "

Header

First paragraph.

" "

Second paragraph.

" ) documents = [Document(page_content=paragraphs_html)] docs_transformed = bs_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "First paragraph. Second paragraph." @pytest.mark.requires("bs4") def test_extract_html() -> None: bs_transformer = BeautifulSoupTransformer() paragraphs_html = ( "Begin of html tag" "

Header

" "

First paragraph.

" "Middle of html tag" "

Second paragraph.

" "End of html tag" "" ) documents = [Document(page_content=paragraphs_html)] docs_transformed = bs_transformer.transform_documents( documents, tags_to_extract=["html", "p"] ) assert docs_transformed[0].page_content == ( "Begin of html tag " "Header First paragraph. " "Middle of html tag " "Second paragraph. " "End of html tag" ) @pytest.mark.requires("bs4") def test_remove_style() -> None: bs_transformer = BeautifulSoupTransformer() with_style_html = ( "

First paragraph.

" ) documents = [Document(page_content=with_style_html)] docs_transformed = bs_transformer.transform_documents( documents, tags_to_extract=["html"] ) assert docs_transformed[0].page_content == "First paragraph." @pytest.mark.requires("bs4") def test_remove_nested_tags() -> None: """ If a tag_to_extract is _inside_ an unwanted_tag, it should be removed (e.g. a

inside a if
is unwanted).) If an unwanted tag is _inside_ a tag_to_extract, it should be removed, but the rest of the tag_to_extract should stay. This means that "unwanted_tags" have a higher "priority" than "tags_to_extract". """ bs_transformer = BeautifulSoupTransformer() with_style_html = ( "" "
First paragraph, inside a table.
" "

Second paragraph
with a cell
.

" "" ) documents = [Document(page_content=with_style_html)] docs_transformed = bs_transformer.transform_documents( documents, unwanted_tags=["script", "style", "table"] ) assert docs_transformed[0].page_content == "Second paragraph." @pytest.mark.requires("bs4") def test_remove_unwanted_lines() -> None: bs_transformer = BeautifulSoupTransformer() with_lines_html = "\n\n

First \n\n paragraph.

\n\n\n" documents = [Document(page_content=with_lines_html)] docs_transformed = bs_transformer.transform_documents(documents, remove_lines=True) assert docs_transformed[0].page_content == "First paragraph." @pytest.mark.requires("bs4") def test_do_not_remove_repeated_content() -> None: bs_transformer = BeautifulSoupTransformer() with_lines_html = "

1\n1\n1\n1

" documents = [Document(page_content=with_lines_html)] docs_transformed = bs_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "1 1 1 1" @pytest.mark.requires("bs4") def test_extract_nested_tags() -> None: bs_transformer = BeautifulSoupTransformer() nested_html = ( "

" "

First paragraph.

" "

Second

paragraph.

" "

Third paragraph.

" "

" ) documents = [Document(page_content=nested_html)] docs_transformed = bs_transformer.transform_documents(documents) assert ( docs_transformed[0].page_content == "First paragraph. Second paragraph. Third paragraph." ) @pytest.mark.requires("bs4") def test_extract_more_nested_tags() -> None: bs_transformer = BeautifulSoupTransformer() nested_html = ( "

" "

First paragraph.

" "

Second paragraph.

" "

Third paragraph with a list:" "

First list item.
Second list item.

" "

Fourth paragraph.

" "

" ) documents = [Document(page_content=nested_html)] docs_transformed = bs_transformer.transform_documents(documents) assert docs_transformed[0].page_content == ( "First paragraph. Second paragraph. " "Third paragraph with a list: " "First list item. Second list item. " "Fourth paragraph." ) @pytest.mark.requires("bs4") def test_transform_keeps_order() -> None: bs_transformer = BeautifulSoupTransformer() multiple_tags_html = ( "

First heading.

" "

First paragraph.

" "

Second heading.

" "

Second paragraph.

" ) documents = [Document(page_content=multiple_tags_html)] # Order of "p" and "h1" in the "tags_to_extract" parameter is NOT important here: # it will keep the order of the original HTML. docs_transformed_p_then_h1 = bs_transformer.transform_documents( documents, tags_to_extract=["p", "h1"] ) assert ( docs_transformed_p_then_h1[0].page_content == "First heading. First paragraph. Second heading. Second paragraph." ) # Recreating `documents` because transform_documents() modifies it. documents = [Document(page_content=multiple_tags_html)] # changing the order of "h1" and "p" in "tags_to_extract" does NOT flip the order # of the extracted tags: docs_transformed_h1_then_p = bs_transformer.transform_documents( documents, tags_to_extract=["h1", "p"] ) assert ( docs_transformed_h1_then_p[0].page_content == "First heading. First paragraph. Second heading. Second paragraph." ) @pytest.mark.requires("bs4") def test_extracts_href() -> None: bs_transformer = BeautifulSoupTransformer() multiple_tags_html = ( "

First heading.

" "

First paragraph with an example

" "

Second paragraph with an a tag without href

" ) documents = [Document(page_content=multiple_tags_html)] docs_transformed = bs_transformer.transform_documents( documents, tags_to_extract=["p"] ) assert docs_transformed[0].page_content == ( "First paragraph with an example (http://example.com) " "Second paragraph with an a tag without href" ) @pytest.mark.requires("bs4") def test_invalid_html() -> None: bs_transformer = BeautifulSoupTransformer() invalid_html_1 = "

First heading." invalid_html_2 = "