Fix BeautifulSoupTransformer: no more duplicates and correct order of tags + tests (#12596)

pull/13052/head^2
Peter Vandenabeele authored 11 months ago; committed by GitHub
parent 937d7c41f3
commit 7f1964b264

@@ -1,4 +1,4 @@
from typing import Any, List, Sequence
from typing import Any, Iterator, List, Sequence, cast
from langchain.schema import BaseDocumentTransformer, Document
@@ -98,18 +98,15 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")
text_parts = []
for tag in tags:
elements = soup.find_all(tag)
for element in elements:
if tag == "a":
href = element.get("href")
if href:
text_parts.append(f"{element.get_text()} ({href})")
else:
text_parts.append(element.get_text())
else:
text_parts.append(element.get_text())
text_parts: List[str] = []
for element in soup.find_all():
if element.name in tags:
# Extract all navigable strings recursively from this element.
text_parts += get_navigable_strings(element)
# To avoid duplicate text, remove this element and all its descendants from the soup.
element.decompose()
return " ".join(text_parts)
@staticmethod
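To see the technique in isolation: a minimal standalone sketch using plain bs4 (the sample HTML, tag list, and variable names are illustrative and not part of this commit) of how a single pass over soup.find_all() plus element.decompose() keeps the original document order and prevents nested matches from being emitted twice:

from bs4 import BeautifulSoup

html = (
    "<html><div>Intro text. <p>Nested paragraph.</p></div>"
    "<p>Standalone paragraph.</p></html>"
)
tags = ["div", "p"]  # overlapping: the first <p> is nested inside a <div>

soup = BeautifulSoup(html, "html.parser")
parts = []
for element in soup.find_all():  # walks every tag in document order
    if element.name in tags:
        # get_text() already includes the text of all nested descendants ...
        parts.append(element.get_text(" ", strip=True))
        # ... so decompose() drops the whole subtree, and the nested <p>
        # is not emitted a second time when the loop reaches it.
        element.decompose()

print(" ".join(parts))
# Intro text. Nested paragraph. Standalone paragraph.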
@@ -126,13 +123,7 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
lines = content.split("\n")
stripped_lines = [line.strip() for line in lines]
non_empty_lines = [line for line in stripped_lines if line]
seen = set()
deduped_lines = []
for line in non_empty_lines:
if line not in seen:
seen.add(line)
deduped_lines.append(line)
cleaned_content = " ".join(deduped_lines)
cleaned_content = " ".join(non_empty_lines)
return cleaned_content
async def atransform_documents(
@@ -141,3 +132,16 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
**kwargs: Any,
) -> Sequence[Document]:
raise NotImplementedError
def get_navigable_strings(element: Any) -> Iterator[str]:
from bs4 import NavigableString, Tag
for child in cast(Tag, element).children:
if isinstance(child, Tag):
yield from get_navigable_strings(child)
elif isinstance(child, NavigableString):
if (element.name == "a") and (href := element.get("href")):
yield f"{child.strip()} ({href})"
else:
yield child.strip()
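Replaying the new helper on a small input makes the href handling concrete. This is a self-contained sketch that re-declares the helper; the sample HTML and printed output are illustrative, not taken from this commit:

from typing import Any, Iterator, cast

from bs4 import BeautifulSoup, NavigableString, Tag

def get_navigable_strings(element: Any) -> Iterator[str]:
    # Same logic as the helper added above: recurse into child tags, yield each
    # string in document order, and append "(href)" for text inside <a> tags.
    for child in cast(Tag, element).children:
        if isinstance(child, Tag):
            yield from get_navigable_strings(child)
        elif isinstance(child, NavigableString):
            if (element.name == "a") and (href := element.get("href")):
                yield f"{child.strip()} ({href})"
            else:
                yield child.strip()

soup = BeautifulSoup(
    "<p>See the <a href='https://example.com'>docs</a> for details.</p>",
    "html.parser",
)
print(" ".join(get_navigable_strings(soup.p)))
# See the docs (https://example.com) for details.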

@@ -15,14 +15,54 @@ def test_transform_empty_html() -> None:
@pytest.mark.requires("bs4")
def test_extract_paragraph() -> None:
def test_extract_paragraphs() -> None:
bs_transformer = BeautifulSoupTransformer()
paragraphs_html = "<html><p>First paragraph.</p><p>Second paragraph.</p><html>"
paragraphs_html = (
"<html><h1>Header</h1><p>First paragraph.</p>"
"<p>Second paragraph.</p><h1>Ignore at end</h1></html>"
)
documents = [Document(page_content=paragraphs_html)]
docs_transformed = bs_transformer.transform_documents(documents)
assert docs_transformed[0].page_content == "First paragraph. Second paragraph."
@pytest.mark.requires("bs4")
def test_strip_whitespace() -> None:
bs_transformer = BeautifulSoupTransformer()
paragraphs_html = (
"<html><h1>Header</h1><p><span>First</span> paragraph.</p>"
"<p>Second paragraph. </p></html>"
)
documents = [Document(page_content=paragraphs_html)]
docs_transformed = bs_transformer.transform_documents(documents)
assert docs_transformed[0].page_content == "First paragraph. Second paragraph."
@pytest.mark.requires("bs4")
def test_extract_html() -> None:
bs_transformer = BeautifulSoupTransformer()
paragraphs_html = (
"<html>Begin of html tag"
"<h1>Header</h1>"
"<p>First paragraph.</p>"
"Middle of html tag"
"<p>Second paragraph.</p>"
"End of html tag"
"</html>"
)
documents = [Document(page_content=paragraphs_html)]
docs_transformed = bs_transformer.transform_documents(
documents, tags_to_extract=["html", "p"]
)
assert docs_transformed[0].page_content == (
"Begin of html tag "
"Header First paragraph. "
"Middle of html tag "
"Second paragraph. "
"End of html tag"
)
@pytest.mark.requires("bs4")
def test_remove_style() -> None:
bs_transformer = BeautifulSoupTransformer()
@@ -30,21 +70,97 @@ def test_remove_style() -> None:
"<html><style>my_funky_style</style><p>First paragraph.</p></html>"
)
documents = [Document(page_content=with_style_html)]
docs_transformed = bs_transformer.transform_documents(documents)
docs_transformed = bs_transformer.transform_documents(
documents, tags_to_extract=["html"]
)
assert docs_transformed[0].page_content == "First paragraph."
@pytest.mark.requires("bs4")
def test_remove_nested_tags() -> None:
"""
If a tag_to_extract is _inside_ an unwanted_tag, it should be removed
(e.g. a <p> inside a <table> if <table> is unwanted).
If an unwanted tag is _inside_ a tag_to_extract, it should be removed,
but the rest of the tag_to_extract should stay.
This means that "unwanted_tags" have a higher "priority" than "tags_to_extract".
"""
bs_transformer = BeautifulSoupTransformer()
with_style_html = (
"<html><style>my_funky_style</style>"
"<table><td><p>First paragraph, inside a table.</p></td></table>"
"<p>Second paragraph<table><td> with a cell </td></table>.</p>"
"</html>"
)
documents = [Document(page_content=with_style_html)]
docs_transformed = bs_transformer.transform_documents(
documents, unwanted_tags=["script", "style", "table"]
)
assert docs_transformed[0].page_content == "Second paragraph."
@pytest.mark.requires("bs4")
def test_remove_unwanted_lines() -> None:
bs_transformer = BeautifulSoupTransformer()
with_lines_html = "<html>\n\n<p>First \n\n paragraph.</p>\n</html>\n\n"
documents = [Document(page_content=with_lines_html)]
docs_transformed = bs_transformer.transform_documents(documents)
docs_transformed = bs_transformer.transform_documents(documents, remove_lines=True)
assert docs_transformed[0].page_content == "First paragraph."
# FIXME: This test proves that the order of the tags is NOT preserved.
# Documenting the current behavior here, but this should be fixed.
@pytest.mark.requires("bs4")
def test_do_not_remove_repeated_content() -> None:
bs_transformer = BeautifulSoupTransformer()
with_lines_html = "<p>1\n1\n1\n1</p>"
documents = [Document(page_content=with_lines_html)]
docs_transformed = bs_transformer.transform_documents(documents)
assert docs_transformed[0].page_content == "1 1 1 1"
@pytest.mark.requires("bs4")
def test_extract_nested_tags() -> None:
bs_transformer = BeautifulSoupTransformer()
nested_html = (
"<html><div class='some_style'>"
"<p><span>First</span> paragraph.</p>"
"<p>Second <div>paragraph.</div></p>"
"<p><p>Third paragraph.</p></p>"
"</div></html>"
)
documents = [Document(page_content=nested_html)]
docs_transformed = bs_transformer.transform_documents(documents)
assert (
docs_transformed[0].page_content
== "First paragraph. Second paragraph. Third paragraph."
)
@pytest.mark.requires("bs4")
def test_extract_more_nested_tags() -> None:
bs_transformer = BeautifulSoupTransformer()
nested_html = (
"<html><div class='some_style'>"
"<p><span>First</span> paragraph.</p>"
"<p>Second paragraph.</p>"
"<p>Third paragraph with a list:"
"<ul>"
"<li>First list item.</li>"
"<li>Second list item.</li>"
"</ul>"
"</p>"
"<p>Fourth paragraph.</p>"
"</div></html>"
)
documents = [Document(page_content=nested_html)]
docs_transformed = bs_transformer.transform_documents(documents)
assert docs_transformed[0].page_content == (
"First paragraph. Second paragraph. "
"Third paragraph with a list: "
"First list item. Second list item. "
"Fourth paragraph."
)
@pytest.mark.requires("bs4")
def test_transform_keeps_order() -> None:
bs_transformer = BeautifulSoupTransformer()
@@ -56,33 +172,61 @@ def test_transform_keeps_order() -> None:
)
documents = [Document(page_content=multiple_tags_html)]
# order of "p" and "h1" in the "tags_to_extract" parameter is important here:
# it will first extract all "p" tags, then all "h1" tags, breaking the order
# of the HTML.
# Order of "p" and "h1" in the "tags_to_extract" parameter is NOT important here:
# it will keep the order of the original HTML.
docs_transformed_p_then_h1 = bs_transformer.transform_documents(
documents, tags_to_extract=["p", "h1"]
)
assert (
docs_transformed_p_then_h1[0].page_content
== "First paragraph. Second paragraph. First heading. Second heading."
== "First heading. First paragraph. Second heading. Second paragraph."
)
# Recreating `documents` because transform_documents() modifies it.
documents = [Document(page_content=multiple_tags_html)]
# changing the order of "h1" and "p" in "tags_to_extract" flips the order of
# the extracted tags:
# changing the order of "h1" and "p" in "tags_to_extract" does NOT flip the order
# of the extracted tags:
docs_transformed_h1_then_p = bs_transformer.transform_documents(
documents, tags_to_extract=["h1", "p"]
)
assert (
docs_transformed_h1_then_p[0].page_content
== "First heading. Second heading. First paragraph. Second paragraph."
== "First heading. First paragraph. Second heading. Second paragraph."
)
# The correct result should be:
#
# "First heading. First paragraph. Second heading. Second paragraph."
#
# That is the order in the original HTML, that should be preserved to preserve
# the semantic "meaning" of the text.
@pytest.mark.requires("bs4")
def test_extracts_href() -> None:
bs_transformer = BeautifulSoupTransformer()
multiple_tags_html = (
"<h1>First heading.</h1>"
"<p>First paragraph with an <a href='http://example.com'>example</a></p>"
"<p>Second paragraph with an <a>a tag without href</a></p>"
)
documents = [Document(page_content=multiple_tags_html)]
docs_transformed = bs_transformer.transform_documents(
documents, tags_to_extract=["p"]
)
assert docs_transformed[0].page_content == (
"First paragraph with an example (http://example.com) "
"Second paragraph with an a tag without href"
)
@pytest.mark.requires("bs4")
def test_invalid_html() -> None:
bs_transformer = BeautifulSoupTransformer()
invalid_html_1 = "<html><h1>First heading."
invalid_html_2 = "<html 1234 xyz"
documents = [
Document(page_content=invalid_html_1),
Document(page_content=invalid_html_2),
]
docs_transformed = bs_transformer.transform_documents(
documents, tags_to_extract=["h1"]
)
assert docs_transformed[0].page_content == "First heading."
assert docs_transformed[1].page_content == ""
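For context, an end-to-end usage sketch of the transformer these tests exercise. The import path is assumed for the LangChain version this commit targets; the HTML input and printed output are illustrative:

from langchain.document_transformers import BeautifulSoupTransformer
from langchain.schema import Document

html = (
    "<html><style>.x{}</style><h1>Title</h1>"
    "<p>Read the <a href='https://example.com'>docs</a> here.</p>"
    "<p>Closing paragraph.</p></html>"
)

bs_transformer = BeautifulSoupTransformer()
docs = bs_transformer.transform_documents(
    [Document(page_content=html)],
    tags_to_extract=["h1", "p"],        # kept in original document order
    unwanted_tags=["script", "style"],  # stripped before extraction
    remove_lines=True,                  # collapse blank / whitespace-only lines
)

print(docs[0].page_content)
# Title Read the docs (https://example.com) here. Closing paragraph.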
