"""Unit tests for beautiful soup document transformer.""" import pytest from langchain_core.documents import Document from langchain_community.document_transformers import BeautifulSoupTransformer @pytest.mark.requires("bs4") def test_transform_empty_html() -> None: bs_transformer = BeautifulSoupTransformer() empty_html = "" documents = [Document(page_content=empty_html)] docs_transformed = bs_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "" @pytest.mark.requires("bs4") def test_extract_paragraphs() -> None: bs_transformer = BeautifulSoupTransformer() paragraphs_html = ( "
First paragraph.
" "Second paragraph.
First paragraph.
" "Second paragraph.
" ) documents = [Document(page_content=paragraphs_html)] docs_transformed = bs_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "First paragraph. Second paragraph." @pytest.mark.requires("bs4") def test_extract_html() -> None: bs_transformer = BeautifulSoupTransformer() paragraphs_html = ( "Begin of html tag" "First paragraph.
" "Middle of html tag" "Second paragraph.
" "End of html tag" "" ) documents = [Document(page_content=paragraphs_html)] docs_transformed = bs_transformer.transform_documents( documents, tags_to_extract=["html", "p"] ) assert docs_transformed[0].page_content == ( "Begin of html tag " "Header First paragraph. " "Middle of html tag " "Second paragraph. " "End of html tag" ) @pytest.mark.requires("bs4") def test_remove_style() -> None: bs_transformer = BeautifulSoupTransformer() with_style_html = ( "First paragraph.
" ) documents = [Document(page_content=with_style_html)] docs_transformed = bs_transformer.transform_documents( documents, tags_to_extract=["html"] ) assert docs_transformed[0].page_content == "First paragraph." @pytest.mark.requires("bs4") def test_remove_nested_tags() -> None: """ If a tag_to_extract is _inside_ an unwanted_tag, it should be removed (e.g. ainside a
First paragraph, inside a table. |
Second paragraph
with a cell |
First \n\n paragraph.
\n\n\n" documents = [Document(page_content=with_lines_html)] docs_transformed = bs_transformer.transform_documents(documents, remove_lines=True) assert docs_transformed[0].page_content == "First paragraph." @pytest.mark.requires("bs4") def test_do_not_remove_repeated_content() -> None: bs_transformer = BeautifulSoupTransformer() with_lines_html = "1\n1\n1\n1
" documents = [Document(page_content=with_lines_html)] docs_transformed = bs_transformer.transform_documents(documents) assert docs_transformed[0].page_content == "1 1 1 1" @pytest.mark.requires("bs4") def test_extract_nested_tags() -> None: bs_transformer = BeautifulSoupTransformer() nested_html = ( "First paragraph.
" "Second
Third paragraph.
" "First paragraph.
" "Second paragraph.
" "Third paragraph with a list:" "
Fourth paragraph.
" "First paragraph.
" "Second paragraph.
" ) documents = [Document(page_content=multiple_tags_html)] # Order of "p" and "h1" in the "tags_to_extract" parameter is NOT important here: # it will keep the order of the original HTML. docs_transformed_p_then_h1 = bs_transformer.transform_documents( documents, tags_to_extract=["p", "h1"] ) assert ( docs_transformed_p_then_h1[0].page_content == "First heading. First paragraph. Second heading. Second paragraph." ) # Recreating `documents` because transform_documents() modifies it. documents = [Document(page_content=multiple_tags_html)] # changing the order of "h1" and "p" in "tags_to_extract" does NOT flip the order # of the extracted tags: docs_transformed_h1_then_p = bs_transformer.transform_documents( documents, tags_to_extract=["h1", "p"] ) assert ( docs_transformed_h1_then_p[0].page_content == "First heading. First paragraph. Second heading. Second paragraph." ) @pytest.mark.requires("bs4") def test_extracts_href() -> None: bs_transformer = BeautifulSoupTransformer() multiple_tags_html = ( "First paragraph with an example
" "Second paragraph with an a tag without href
" ) documents = [Document(page_content=multiple_tags_html)] docs_transformed = bs_transformer.transform_documents( documents, tags_to_extract=["p"] ) assert docs_transformed[0].page_content == ( "First paragraph with an example (http://example.com) " "Second paragraph with an a tag without href" ) @pytest.mark.requires("bs4") def test_invalid_html() -> None: bs_transformer = BeautifulSoupTransformer() invalid_html_1 = "