From e830a4e731a9c41eed5d773406f7de13b587d85d Mon Sep 17 00:00:00 2001 From: Peter Vandenabeele Date: Tue, 2 Apr 2024 02:19:12 +0200 Subject: [PATCH] community[patch]: Add remove_comments option (default True): do not extract html comments (#13259) - **Description:** add `remove_comments` option (default: True): do not extract html _comments_, - **Issue:** None, - **Dependencies:** None, - **Tag maintainer:** @nfcampos , - **Twitter handle:** peter_v I ran `make format`, `make lint` and `make test`. Discussion: In my use case, I prefer to not have the comments in the extracted text: * e.g. from a Google tag that is added in the html as a comment * e.g. content that the authors have temporarily hidden to make it invisible to the regular reader Removing the comments makes the extracted text closer to the text intended to be seen by the reader. **Choice to make:** do we prefer to make the default for this `remove_comments` option to be True or False? I have changed it to True in a second commit, since that is how I would prefer to use it by default. This gives cleaned text (without technical Google tags etc.) that is also closer to the actually visible and intended content. I am not sure what is best aligned with the conventions of langchain in general ... INITIAL VERSION (new version above): ~**Choice to make:** do we prefer to make the default for this `ignore_comments` option to be True or False? I have set it to False now to be backwards compatible. On the other hand, I would use it mostly with True. 
I am not sure what is best aligned with the conventions of langchain in general ...~ --------- Co-authored-by: Bagatur --- .../beautiful_soup_transformer.py | 28 +++++++++++----- .../test_beautiful_soup_transformer.py | 33 +++++++++++++++++++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/libs/community/langchain_community/document_transformers/beautiful_soup_transformer.py b/libs/community/langchain_community/document_transformers/beautiful_soup_transformer.py index 45723dfff2..b70af98789 100644 --- a/libs/community/langchain_community/document_transformers/beautiful_soup_transformer.py +++ b/libs/community/langchain_community/document_transformers/beautiful_soup_transformer.py @@ -36,6 +36,8 @@ class BeautifulSoupTransformer(BaseDocumentTransformer): unwanted_tags: List[str] = ["script", "style"], tags_to_extract: List[str] = ["p", "li", "div", "a"], remove_lines: bool = True, + *, + remove_comments: bool = False, **kwargs: Any, ) -> Sequence[Document]: """ @@ -45,8 +47,8 @@ class BeautifulSoupTransformer(BaseDocumentTransformer): documents: A sequence of Document objects containing HTML content. unwanted_tags: A list of tags to be removed from the HTML. tags_to_extract: A list of tags whose content will be extracted. - remove_lines: If set to True, unnecessary lines will be - removed from the HTML content. + remove_lines: If set to True, unnecessary lines will be removed. + remove_comments: If set to True, comments will be removed. Returns: A sequence of Document objects with transformed content. 
@@ -56,7 +58,9 @@ class BeautifulSoupTransformer(BaseDocumentTransformer): cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags) - cleaned_content = self.extract_tags(cleaned_content, tags_to_extract) + cleaned_content = self.extract_tags( + cleaned_content, tags_to_extract, remove_comments=remove_comments + ) if remove_lines: cleaned_content = self.remove_unnecessary_lines(cleaned_content) @@ -86,7 +90,9 @@ class BeautifulSoupTransformer(BaseDocumentTransformer): return str(soup) @staticmethod - def extract_tags(html_content: str, tags: List[str]) -> str: + def extract_tags( + html_content: str, tags: List[str], *, remove_comments: bool = False + ) -> str: """ Extract specific tags from a given HTML content. @@ -104,7 +110,9 @@ class BeautifulSoupTransformer(BaseDocumentTransformer): for element in soup.find_all(): if element.name in tags: # Extract all navigable strings recursively from this element. - text_parts += get_navigable_strings(element) + text_parts += get_navigable_strings( + element, remove_comments=remove_comments + ) # To avoid duplicate text, remove all descendants from the soup. element.decompose() @@ -136,7 +144,9 @@ class BeautifulSoupTransformer(BaseDocumentTransformer): raise NotImplementedError -def get_navigable_strings(element: Any) -> Iterator[str]: +def get_navigable_strings( + element: Any, *, remove_comments: bool = False +) -> Iterator[str]: """Get all navigable strings from a BeautifulSoup element. Args: @@ -146,11 +156,13 @@ def get_navigable_strings(element: Any) -> Iterator[str]: A generator of strings. 
""" - from bs4 import NavigableString, Tag + from bs4 import Comment, NavigableString, Tag for child in cast(Tag, element).children: + if isinstance(child, Comment) and remove_comments: + continue if isinstance(child, Tag): - yield from get_navigable_strings(child) + yield from get_navigable_strings(child, remove_comments=remove_comments) elif isinstance(child, NavigableString): if (element.name == "a") and (href := element.get("href")): yield f"{child.strip()} ({href})" diff --git a/libs/community/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py b/libs/community/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py index cc3076854e..199ab5e4e0 100644 --- a/libs/community/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py +++ b/libs/community/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py @@ -230,3 +230,36 @@ def test_invalid_html() -> None: ) assert docs_transformed[0].page_content == "First heading." assert docs_transformed[1].page_content == "" + + +@pytest.mark.requires("bs4") +def test_remove_comments() -> None: + bs_transformer = BeautifulSoupTransformer() + html_with_comments = ( + "

First paragraph." + ) + documents = [ + Document(page_content=html_with_comments), + ] + + docs_transformed = bs_transformer.transform_documents( + documents, tags_to_extract=["html"], remove_comments=True + ) + assert docs_transformed[0].page_content == "First paragraph." + + +@pytest.mark.requires("bs4") +def test_do_not_remove_comments() -> None: + bs_transformer = BeautifulSoupTransformer() + html_with_comments = ( + "

First paragraph." + ) + documents = [ + Document(page_content=html_with_comments), + ] + + docs_transformed = bs_transformer.transform_documents( + documents, + tags_to_extract=["html"], + ) + assert docs_transformed[0].page_content == "Google tag (gtag.js) First paragraph."