diff --git a/libs/community/langchain_community/document_transformers/beautiful_soup_transformer.py b/libs/community/langchain_community/document_transformers/beautiful_soup_transformer.py
index 45723dfff2..b70af98789 100644
--- a/libs/community/langchain_community/document_transformers/beautiful_soup_transformer.py
+++ b/libs/community/langchain_community/document_transformers/beautiful_soup_transformer.py
@@ -36,6 +36,8 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
         unwanted_tags: List[str] = ["script", "style"],
         tags_to_extract: List[str] = ["p", "li", "div", "a"],
         remove_lines: bool = True,
+        *,
+        remove_comments: bool = False,
         **kwargs: Any,
     ) -> Sequence[Document]:
         """
@@ -45,8 +47,8 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
             documents: A sequence of Document objects containing HTML content.
             unwanted_tags: A list of tags to be removed from the HTML.
             tags_to_extract: A list of tags whose content will be extracted.
-            remove_lines: If set to True, unnecessary lines will be
-            removed from the HTML content.
+            remove_lines: If set to True, unnecessary lines will be removed.
+            remove_comments: If set to True, comments will be removed.
 
         Returns:
             A sequence of Document objects with transformed content.
@@ -56,7 +58,9 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
 
             cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)
 
-            cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)
+            cleaned_content = self.extract_tags(
+                cleaned_content, tags_to_extract, remove_comments=remove_comments
+            )
 
             if remove_lines:
                 cleaned_content = self.remove_unnecessary_lines(cleaned_content)
@@ -86,7 +90,9 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
         return str(soup)
 
     @staticmethod
-    def extract_tags(html_content: str, tags: List[str]) -> str:
+    def extract_tags(
+        html_content: str, tags: List[str], *, remove_comments: bool = False
+    ) -> str:
         """
         Extract specific tags from a given HTML content.
 
@@ -104,7 +110,9 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
         for element in soup.find_all():
             if element.name in tags:
                 # Extract all navigable strings recursively from this element.
-                text_parts += get_navigable_strings(element)
+                text_parts += get_navigable_strings(
+                    element, remove_comments=remove_comments
+                )
 
                 # To avoid duplicate text, remove all descendants from the soup.
                 element.decompose()
@@ -136,7 +144,9 @@ class BeautifulSoupTransformer(BaseDocumentTransformer):
         raise NotImplementedError
 
 
-def get_navigable_strings(element: Any) -> Iterator[str]:
+def get_navigable_strings(
+    element: Any, *, remove_comments: bool = False
+) -> Iterator[str]:
     """Get all navigable strings from a BeautifulSoup element.
 
     Args:
@@ -146,11 +156,13 @@ def get_navigable_strings(element: Any) -> Iterator[str]:
     Returns:
         A generator of strings.
""" - from bs4 import NavigableString, Tag + from bs4 import Comment, NavigableString, Tag for child in cast(Tag, element).children: + if isinstance(child, Comment) and remove_comments: + continue if isinstance(child, Tag): - yield from get_navigable_strings(child) + yield from get_navigable_strings(child, remove_comments=remove_comments) elif isinstance(child, NavigableString): if (element.name == "a") and (href := element.get("href")): yield f"{child.strip()} ({href})" diff --git a/libs/community/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py b/libs/community/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py index cc3076854e..199ab5e4e0 100644 --- a/libs/community/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py +++ b/libs/community/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py @@ -230,3 +230,36 @@ def test_invalid_html() -> None: ) assert docs_transformed[0].page_content == "First heading." assert docs_transformed[1].page_content == "" + + +@pytest.mark.requires("bs4") +def test_remove_comments() -> None: + bs_transformer = BeautifulSoupTransformer() + html_with_comments = ( + "

First paragraph." + ) + documents = [ + Document(page_content=html_with_comments), + ] + + docs_transformed = bs_transformer.transform_documents( + documents, tags_to_extract=["html"], remove_comments=True + ) + assert docs_transformed[0].page_content == "First paragraph." + + +@pytest.mark.requires("bs4") +def test_do_not_remove_comments() -> None: + bs_transformer = BeautifulSoupTransformer() + html_with_comments = ( + "

First paragraph." + ) + documents = [ + Document(page_content=html_with_comments), + ] + + docs_transformed = bs_transformer.transform_documents( + documents, + tags_to_extract=["html"], + ) + assert docs_transformed[0].page_content == "Google tag (gtag.js) First paragraph."