From 3468c038baecec2f55acc0765a7bf4422bcc98f6 Mon Sep 17 00:00:00 2001 From: Peter Vandenabeele Date: Mon, 30 Oct 2023 00:24:47 +0100 Subject: [PATCH] Add unit tests for document_transformers/beautiful_soup_transformer.py (#12520) - **Description:** * Add unit tests for document_transformers/beautiful_soup_transformer.py * Basic functionality is tested (extract tags, remove tags, drop lines) * add a FIXME comment about the order of tags that is not preserved (and a passing test, but with the expected tags now out-of-order) - **Issue:** None - **Dependencies:** None - **Tag maintainer:** @rlancemartin - **Twitter handle:** `peter_v` Please make sure your PR is passing linting and testing before submitting. => OK: I ran `make format`, `make test` (passing after install of beautifulsoup4) and `make lint`. --- .../test_beautiful_soup_transformer.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py diff --git a/libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py b/libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py new file mode 100644 index 0000000000..d3ebe1d8c7 --- /dev/null +++ b/libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py @@ -0,0 +1,88 @@ +"""Unit tests for beautiful soup document transformer.""" +import pytest + +from langchain.document_transformers import BeautifulSoupTransformer +from langchain.schema.document import Document + + +@pytest.mark.requires("bs4") +def test_transform_empty_html() -> None: + bs_transformer = BeautifulSoupTransformer() + empty_html = "" + documents = [Document(page_content=empty_html)] + docs_transformed = bs_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == "" + + +@pytest.mark.requires("bs4") +def test_extract_paragraph() -> None: + bs_transformer = BeautifulSoupTransformer() + paragraphs_html = "

First paragraph.

Second paragraph.

" + documents = [Document(page_content=paragraphs_html)] + docs_transformed = bs_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == "First paragraph. Second paragraph." + + +@pytest.mark.requires("bs4") +def test_remove_style() -> None: + bs_transformer = BeautifulSoupTransformer() + with_style_html = ( + "

First paragraph.

" + ) + documents = [Document(page_content=with_style_html)] + docs_transformed = bs_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == "First paragraph." + + +@pytest.mark.requires("bs4") +def test_remove_unwanted_lines() -> None: + bs_transformer = BeautifulSoupTransformer() + with_lines_html = "\n\n

First \n\n paragraph.

\n\n\n" + documents = [Document(page_content=with_lines_html)] + docs_transformed = bs_transformer.transform_documents(documents) + assert docs_transformed[0].page_content == "First paragraph." + + +# FIXME: This test proves that the order of the tags is NOT preserved. +# Documenting the current behavior here, but this should be fixed. +@pytest.mark.requires("bs4") +def test_transform_keeps_order() -> None: + bs_transformer = BeautifulSoupTransformer() + multiple_tags_html = ( + "

First heading.

" + "

First paragraph.

" + "

Second heading.

" + "

Second paragraph.

" + ) + documents = [Document(page_content=multiple_tags_html)] + + # order of "p" and "h1" in the "tags_to_extract" parameter is important here: + # it will first extract all "p" tags, then all "h1" tags, breaking the order + # of the HTML. + docs_transformed_p_then_h1 = bs_transformer.transform_documents( + documents, tags_to_extract=["p", "h1"] + ) + assert ( + docs_transformed_p_then_h1[0].page_content + == "First paragraph. Second paragraph. First heading. Second heading." + ) + + # Recreating `documents` because transform_documents() modifies it. + documents = [Document(page_content=multiple_tags_html)] + + # changing the order of "h1" and "p" in "tags_to_extract" flips the order of + # the extracted tags: + docs_transformed_h1_then_p = bs_transformer.transform_documents( + documents, tags_to_extract=["h1", "p"] + ) + assert ( + docs_transformed_h1_then_p[0].page_content + == "First heading. Second heading. First paragraph. Second paragraph." + ) + + # The correct result should be: + # + # "First heading. First paragraph. Second heading. Second paragraph." + # + # That is the order in the original HTML, that should be preserved to preserve + # the semantic "meaning" of the text.