From 64c4a698a823c1efefb3774a3388319b6dd97f8d Mon Sep 17 00:00:00 2001
From: Andrew Zhou <44193474+adrwz@users.noreply.github.com>
Date: Sun, 29 Oct 2023 16:26:53 -0700
Subject: [PATCH] More comprehensive readthedocs document loader (#12382)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## **Description:**
When building our own readthedocs.io scraper, we noticed a couple
interesting things:

1. Text lines with a lot of nested <span> tags would give unclean text
with a bunch of newlines. For example, for [Langchain's
documentation](https://api.python.langchain.com/en/latest/document_loaders/langchain.document_loaders.readthedocs.ReadTheDocsLoader.html#langchain.document_loaders.readthedocs.ReadTheDocsLoader),
a single line is represented in a complicated nested HTML structure, and
the naive `soup.get_text()` call currently being made will create a
newline for each nested HTML element. Therefore, the document loader
would give a messy, newline-separated blob of text. This would be true
in a lot of cases.

<img width="945" alt="Screenshot 2023-10-26 at 6 15 39 PM"
src="https://github.com/langchain-ai/langchain/assets/44193474/eca85d1f-d2bf-4487-a18a-e1e732fadf19">
<img width="1031" alt="Screenshot 2023-10-26 at 6 16 00 PM"
src="https://github.com/langchain-ai/langchain/assets/44193474/035938a0-9892-4f6a-83cd-0d7b409b00a3">

Additionally, content from iframes, code from scripts, CSS from styles,
etc. will be scraped whenever those elements are descendants of the
selected element (which happens more often than you'd think). For
example, [this page](https://pydeck.gl/gallery/contour_layer.html#)
yields 1.5 million characters of content that looks like this:

<img width="1372" alt="Screenshot 2023-10-26 at 6 32 55 PM"
src="https://github.com/langchain-ai/langchain/assets/44193474/dbd89e39-9478-4a18-9e84-f0eb91954eac">

Therefore, I wrote a module-level `_get_clean_text(element)` function,
backed by the recursive `_process_element` helper, that (a) skips all
irrelevant elements, and (b) only adds newlines when necessary.

2. Index pages (like [this
one](https://api.python.langchain.com/en/latest/api_reference.html))
would be loaded, chunked, and eventually embedded. This is really bad
not just because the user will be embedding irrelevant information - but
because index pages are very likely to show up in retrieved content,
making retrieval less effective (in our tests). Therefore, I added a
float parameter `exclude_links_ratio` defaulted to 1.0 (which keeps the
current behavior — although I'd petition to default this to 0.5) that
will drop the content of any page where link text takes up more than
that fraction of the page's text. Through manual testing, 0.5 seems to
be the best threshold.



## Other Information:
  - **Issue:** n/a
  - **Dependencies:** n/a
  - **Tag maintainer:** n/a
  - **Twitter handle:** @andrewthezhou

---------

Co-authored-by: Andrew Zhou <andrew@heykona.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
---
 .../langchain/document_loaders/readthedocs.py | 147 ++++++++++++++++--
 .../readthedocs/index_page/test.html          |  10 ++
 .../nested_html_structure/test.html           |   5 +
 .../document_loaders/test_readthedoc.py       |  14 ++
 4 files changed, 161 insertions(+), 15 deletions(-)
 create mode 100644 libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/index_page/test.html
 create mode 100644 libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/nested_html_structure/test.html
diff --git a/libs/langchain/langchain/document_loaders/readthedocs.py b/libs/langchain/langchain/document_loaders/readthedocs.py
index 6aa3ddfd91..2eed19931f 100644
--- a/libs/langchain/langchain/document_loaders/readthedocs.py
+++ b/libs/langchain/langchain/document_loaders/readthedocs.py
@@ -1,9 +1,15 @@
+from __future__ import annotations
+
 from pathlib import Path
-from typing import Any, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Sequence, Tuple, Union
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 
+if TYPE_CHECKING:
+    from bs4 import NavigableString
+    from bs4.element import Comment, Tag
+
 
 class ReadTheDocsLoader(BaseLoader):
     """Load `ReadTheDocs` documentation directory."""
@@ -15,7 +21,8 @@ class ReadTheDocsLoader(BaseLoader):
         errors: Optional[str] = None,
         custom_html_tag: Optional[Tuple[str, dict]] = None,
         patterns: Sequence[str] = ("*.htm", "*.html"),
-        **kwargs: Optional[Any]
+        exclude_links_ratio: float = 1.0,
+        **kwargs: Optional[Any],
     ):
         """
         Initialize ReadTheDocsLoader
@@ -36,6 +43,9 @@ class ReadTheDocsLoader(BaseLoader):
             custom_html_tag: Optional custom html tag to retrieve the content from
                 files.
             patterns: The file patterns to load, passed to `glob.rglob`.
+            exclude_links_ratio: The ratio of links:content to exclude pages from.
+                This is to reduce the frequency at which index pages make their
+                way into retrieved results. Recommended: 0.5
             kwargs: named arguments passed to `bs4.BeautifulSoup`.
         """
         try:
@@ -48,7 +58,9 @@ class ReadTheDocsLoader(BaseLoader):
 
         try:
             _ = BeautifulSoup(
-                "<html><body>Parser builder library test.</body></html>", **kwargs
+                "<html><body>Parser builder library test.</body></html>",
+                "html.parser",
+                **kwargs,
             )
         except Exception as e:
             raise ValueError("Parsing kwargs do not appear valid") from e
@@ -59,24 +71,26 @@ class ReadTheDocsLoader(BaseLoader):
         self.custom_html_tag = custom_html_tag
         self.patterns = patterns
         self.bs_kwargs = kwargs
+        self.exclude_links_ratio = exclude_links_ratio
 
-    def load(self) -> List[Document]:
-        """Load documents."""
-        docs = []
+    def lazy_load(self) -> Iterator[Document]:
+        """A lazy loader for Documents."""
         for file_pattern in self.patterns:
             for p in self.file_path.rglob(file_pattern):
                 if p.is_dir():
                     continue
                 with open(p, encoding=self.encoding, errors=self.errors) as f:
                     text = self._clean_data(f.read())
-                metadata = {"source": str(p)}
-                docs.append(Document(page_content=text, metadata=metadata))
-        return docs
+                yield Document(page_content=text, metadata={"source": str(p)})
+
+    def load(self) -> List[Document]:
+        """Load documents."""
+        return list(self.lazy_load())
 
     def _clean_data(self, data: str) -> str:
         from bs4 import BeautifulSoup
 
-        soup = BeautifulSoup(data, **self.bs_kwargs)
+        soup = BeautifulSoup(data, "html.parser", **self.bs_kwargs)
 
         # default tags
         html_tags = [
@@ -87,18 +101,121 @@ class ReadTheDocsLoader(BaseLoader):
         if self.custom_html_tag is not None:
             html_tags.append(self.custom_html_tag)
 
-        text = None
+        element = None
 
         # reversed order. check the custom one first
         for tag, attrs in html_tags[::-1]:
-            text = soup.find(tag, attrs)
+            element = soup.find(tag, attrs)
             # if found, break
-            if text is not None:
+            if element is not None:
                 break
 
-        if text is not None:
-            text = text.get_text()
+        if element is not None and _get_link_ratio(element) <= self.exclude_links_ratio:
+            text = _get_clean_text(element)
         else:
             text = ""
         # trim empty lines
         return "\n".join([t for t in text.split("\n") if t])
+
+
+def _get_clean_text(element: Tag) -> str:
+    """Returns cleaned text with newlines preserved and irrelevant elements removed."""
+    elements_to_skip = [
+        "script",
+        "noscript",
+        "canvas",
+        "meta",
+        "svg",
+        "map",
+        "area",
+        "audio",
+        "source",
+        "track",
+        "video",
+        "embed",
+        "object",
+        "param",
+        "picture",
+        "iframe",
+        "frame",
+        "frameset",
+        "noframes",
+        "applet",
+        "form",
+        "button",
+        "select",
+        "base",
+        "style",
+        "img",
+    ]
+
+    newline_elements = [
+        "p",
+        "div",
+        "ul",
+        "ol",
+        "li",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "pre",
+        "table",
+        "tr",
+    ]
+
+    text = _process_element(element, elements_to_skip, newline_elements)
+    return text.strip()
+
+
+def _get_link_ratio(section: Tag) -> float:
+    links = section.find_all("a")
+    total_text = "".join(str(s) for s in section.stripped_strings)
+    if len(total_text) == 0:
+        return 0
+
+    link_text = "".join(
+        str(string.string.strip())
+        for link in links
+        for string in link.strings
+        if string
+    )
+    return len(link_text) / len(total_text)
+
+
+def _process_element(
+    element: Union[Tag, NavigableString, Comment],
+    elements_to_skip: List[str],
+    newline_elements: List[str],
+) -> str:
+    """
+    Traverse through HTML tree recursively to preserve newline and skip
+    unwanted (code/binary) elements
+    """
+    from bs4 import NavigableString
+    from bs4.element import Comment, Tag
+
+    tag_name = getattr(element, "name", None)
+    if isinstance(element, Comment) or tag_name in elements_to_skip:
+        return ""
+    elif isinstance(element, NavigableString):
+        return element
+    elif tag_name == "br":
+        return "\n"
+    elif tag_name in newline_elements:
+        return (
+            "".join(
+                _process_element(child, elements_to_skip, newline_elements)
+                for child in element.children
+                if isinstance(child, (Tag, NavigableString, Comment))
+            )
+            + "\n"
+        )
+    else:
+        return "".join(
+            _process_element(child, elements_to_skip, newline_elements)
+            for child in element.children
+            if isinstance(child, (Tag, NavigableString, Comment))
+        )
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/index_page/test.html b/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/index_page/test.html
new file mode 100644
index 0000000000..29aaaa6e6e
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/index_page/test.html
@@ -0,0 +1,10 @@
+<html>
+  <main id="main-content">
+    Websites:
+    <a href="https://langchain.com">Langchain</a>
+    <a href="https://docs.langchain.com">Langchain Docs</a>
+    <a href="https://api.python.langchain.com/en/latest/api_reference.html"
+      >Langchain API Reference</a
+    >
+  </main>
+</html>
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/nested_html_structure/test.html b/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/nested_html_structure/test.html
new file mode 100644
index 0000000000..89c864c231
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/nested_html_structure/test.html
@@ -0,0 +1,5 @@
+<html>
+  <main id="main-content">
+    Hello <span><em>World</em>!</span>
+  </main>
+</html>
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py b/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py
index 9bcaae2fef..087bbf3480 100644
--- a/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py
@@ -31,6 +31,20 @@ def test_custom() -> None:
     assert len(documents[0].page_content) != 0
 
 
+@pytest.mark.requires("bs4")
+def test_nested_html_structure() -> None:
+    loader = ReadTheDocsLoader(PARENT_DIR / "nested_html_structure")
+    documents = loader.load()
+    assert documents[0].page_content == "Hello World!"
+
+
+@pytest.mark.requires("bs4")
+def test_index_page() -> None:
+    loader = ReadTheDocsLoader(PARENT_DIR / "index_page", exclude_links_ratio=0.5)
+    documents = loader.load()
+    assert len(documents[0].page_content) == 0
+
+
 @pytest.mark.requires("bs4")
 def test_empty() -> None:
     loader = ReadTheDocsLoader(