langchain/libs/community/langchain_community/document_loaders/readthedocs.py

from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Sequence, Tuple, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

if TYPE_CHECKING:
    from bs4 import NavigableString
    from bs4.element import Comment, Tag


class ReadTheDocsLoader(BaseLoader):
    """Load `ReadTheDocs` documentation directory."""

    def __init__(
        self,
        path: Union[str, Path],
        encoding: Optional[str] = None,
        errors: Optional[str] = None,
        custom_html_tag: Optional[Tuple[str, dict]] = None,
        patterns: Sequence[str] = ("*.htm", "*.html"),
        exclude_links_ratio: float = 1.0,
        **kwargs: Optional[Any],
    ):
        """
        Initialize ReadTheDocsLoader

        The loader loops over all files under `path` and extracts the actual content of
        the files by retrieving main html tags. Default main html tags include
        `<main id="main-content>`, <`div role="main>`, and `<article role="main">`. You
        can also define your own html tags by passing custom_html_tag, e.g.
        `("div", "class=main")`. The loader iterates html tags with the order of
        custom html tags (if exists) and default html tags. If any of the tags is not
        empty, the loop will break and retrieve the content out of that tag.

        Args:
            path: The location of pulled readthedocs folder.
            encoding: The encoding with which to open the documents.
            errors: Specify how encoding and decoding errors are to be handled—this
                cannot be used in binary mode.
            custom_html_tag: Optional custom html tag to retrieve the content from
                files.
            patterns: The file patterns to load, passed to `glob.rglob`.
            exclude_links_ratio: The ratio of links:content to exclude pages from.
                This is to reduce the frequency at which index pages make their
                way into retrieved results. Recommended: 0.5
            kwargs: named arguments passed to `bs4.BeautifulSoup`.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise ImportError(
                "Could not import python packages. "
                "Please install it with `pip install beautifulsoup4`. "
            )

        try:
            _ = BeautifulSoup(
                "<html><body>Parser builder library test.</body></html>",
                "html.parser",
                **kwargs,
            )
        except Exception as e:
            raise ValueError("Parsing kwargs do not appear valid") from e

        self.file_path = Path(path)
        self.encoding = encoding
        self.errors = errors
        self.custom_html_tag = custom_html_tag
        self.patterns = patterns
        self.bs_kwargs = kwargs
        self.exclude_links_ratio = exclude_links_ratio

    def lazy_load(self) -> Iterator[Document]:
        """A lazy loader for Documents."""
        for file_pattern in self.patterns:
            for p in self.file_path.rglob(file_pattern):
                if p.is_dir():
                    continue
                with open(p, encoding=self.encoding, errors=self.errors) as f:
                    text = self._clean_data(f.read())
                yield Document(page_content=text, metadata={"source": str(p)})

    def _clean_data(self, data: str) -> str:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(data, "html.parser", **self.bs_kwargs)

        # default tags
        html_tags = [
            ("div", {"role": "main"}),
            ("main", {"id": "main-content"}),
        ]

        if self.custom_html_tag is not None:
            html_tags.append(self.custom_html_tag)

        element = None

        # reversed order. check the custom one first
        for tag, attrs in html_tags[::-1]:
            element = soup.find(tag, attrs)
            # if found, break
            if element is not None:
                break

        if element is not None and _get_link_ratio(element) <= self.exclude_links_ratio:
            text = _get_clean_text(element)
        else:
            text = ""
        # trim empty lines
        return "\n".join([t for t in text.split("\n") if t])


def _get_clean_text(element: Tag) -> str:
    """Returns cleaned text with newlines preserved and irrelevant elements removed."""
    elements_to_skip = [
        "script",
        "noscript",
        "canvas",
        "meta",
        "svg",
        "map",
        "area",
        "audio",
        "source",
        "track",
        "video",
        "embed",
        "object",
        "param",
        "picture",
        "iframe",
        "frame",
        "frameset",
        "noframes",
        "applet",
        "form",
        "button",
        "select",
        "base",
        "style",
        "img",
    ]

    newline_elements = [
        "p",
        "div",
        "ul",
        "ol",
        "li",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "pre",
        "table",
        "tr",
    ]

    text = _process_element(element, elements_to_skip, newline_elements)
    return text.strip()


def _get_link_ratio(section: Tag) -> float:
    links = section.find_all("a")
    total_text = "".join(str(s) for s in section.stripped_strings)
    if len(total_text) == 0:
        return 0

    link_text = "".join(
        str(string.string.strip())
        for link in links
        for string in link.strings
        if string
    )
    return len(link_text) / len(total_text)


def _process_element(
    element: Union[Tag, NavigableString, Comment],
    elements_to_skip: List[str],
    newline_elements: List[str],
) -> str:
    """
    Traverse through HTML tree recursively to preserve newline and skip
    unwanted (code/binary) elements
    """
    from bs4 import NavigableString
    from bs4.element import Comment, Tag

    tag_name = getattr(element, "name", None)
    if isinstance(element, Comment) or tag_name in elements_to_skip:
        return ""
    elif isinstance(element, NavigableString):
        return element
    elif tag_name == "br":
        return "\n"
    elif tag_name in newline_elements:
        return (
            "".join(
                _process_element(child, elements_to_skip, newline_elements)
                for child in element.children
                if isinstance(child, (Tag, NavigableString, Comment))
            )
            + "\n"
        )
    else:
        return "".join(
            _process_element(child, elements_to_skip, newline_elements)
            for child in element.children
            if isinstance(child, (Tag, NavigableString, Comment))
        )
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`from __future__ import annotations`

			`from pathlib import Path`
			`from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Sequence, Tuple, Union`

			`from langchain_core.documents import Document`

			`from langchain_community.document_loaders.base import BaseLoader`

			`if TYPE_CHECKING:`
			`from bs4 import NavigableString`
			`from bs4.element import Comment, Tag`


			`class ReadTheDocsLoader(BaseLoader):`
			"""Load `ReadTheDocs` documentation directory."""

			`def __init__(`
			`self,`
			`path: Union[str, Path],`
			`encoding: Optional[str] = None,`
			`errors: Optional[str] = None,`
			`custom_html_tag: Optional[Tuple[str, dict]] = None,`
			`patterns: Sequence[str] = (".htm", ".html"),`
			`exclude_links_ratio: float = 1.0,`
			`**kwargs: Optional[Any],`
			`):`
			`"""`
			`Initialize ReadTheDocsLoader`

			The loader loops over all files under `path` and extracts the actual content of
			`the files by retrieving main html tags. Default main html tags include`
			`<main id="main-content>`, <`div role="main>`, and `<article role="main">`. You
			`can also define your own html tags by passing custom_html_tag, e.g.`
			`("div", "class=main")`. The loader iterates html tags with the order of
			`custom html tags (if exists) and default html tags. If any of the tags is not`
			`empty, the loop will break and retrieve the content out of that tag.`

			`Args:`
			`path: The location of pulled readthedocs folder.`
			`encoding: The encoding with which to open the documents.`
			`errors: Specify how encoding and decoding errors are to be handled—this`
			`cannot be used in binary mode.`
			`custom_html_tag: Optional custom html tag to retrieve the content from`
			`files.`
			patterns: The file patterns to load, passed to `glob.rglob`.
			`exclude_links_ratio: The ratio of links:content to exclude pages from.`
			`This is to reduce the frequency at which index pages make their`
			`way into retrieved results. Recommended: 0.5`
			kwargs: named arguments passed to `bs4.BeautifulSoup`.
			`"""`
			`try:`
			`from bs4 import BeautifulSoup`
			`except ImportError:`
			`raise ImportError(`
			`"Could not import python packages. "`
			"Please install it with `pip install beautifulsoup4`. "
			`)`

			`try:`
			`_ = BeautifulSoup(`
			`"<html><body>Parser builder library test.</body></html>",`
			`"html.parser",`
			`**kwargs,`
			`)`
			`except Exception as e:`
			`raise ValueError("Parsing kwargs do not appear valid") from e`

			`self.file_path = Path(path)`
			`self.encoding = encoding`
			`self.errors = errors`
			`self.custom_html_tag = custom_html_tag`
			`self.patterns = patterns`
			`self.bs_kwargs = kwargs`
			`self.exclude_links_ratio = exclude_links_ratio`

			`def lazy_load(self) -> Iterator[Document]:`
			`"""A lazy loader for Documents."""`
			`for file_pattern in self.patterns:`
			`for p in self.file_path.rglob(file_pattern):`
			`if p.is_dir():`
			`continue`
			`with open(p, encoding=self.encoding, errors=self.errors) as f:`
			`text = self._clean_data(f.read())`
			`yield Document(page_content=text, metadata={"source": str(p)})`

			`def _clean_data(self, data: str) -> str:`
			`from bs4 import BeautifulSoup`

			`soup = BeautifulSoup(data, "html.parser", **self.bs_kwargs)`

			`# default tags`
			`html_tags = [`
			`("div", {"role": "main"}),`
			`("main", {"id": "main-content"}),`
			`]`

			`if self.custom_html_tag is not None:`
			`html_tags.append(self.custom_html_tag)`

			`element = None`

			`# reversed order. check the custom one first`
			`for tag, attrs in html_tags[::-1]:`
			`element = soup.find(tag, attrs)`
			`# if found, break`
			`if element is not None:`
			`break`

			`if element is not None and _get_link_ratio(element) <= self.exclude_links_ratio:`
			`text = _get_clean_text(element)`
			`else:`
			`text = ""`
			`# trim empty lines`
			`return "\n".join([t for t in text.split("\n") if t])`


			`def _get_clean_text(element: Tag) -> str:`
			`"""Returns cleaned text with newlines preserved and irrelevant elements removed."""`
			`elements_to_skip = [`
			`"script",`
			`"noscript",`
			`"canvas",`
			`"meta",`
			`"svg",`
			`"map",`
			`"area",`
			`"audio",`
			`"source",`
			`"track",`
			`"video",`
			`"embed",`
			`"object",`
			`"param",`
			`"picture",`
			`"iframe",`
			`"frame",`
			`"frameset",`
			`"noframes",`
			`"applet",`
			`"form",`
			`"button",`
			`"select",`
			`"base",`
			`"style",`
			`"img",`
			`]`

			`newline_elements = [`
			`"p",`
			`"div",`
			`"ul",`
			`"ol",`
			`"li",`
			`"h1",`
			`"h2",`
			`"h3",`
			`"h4",`
			`"h5",`
			`"h6",`
			`"pre",`
			`"table",`
			`"tr",`
			`]`

			`text = _process_element(element, elements_to_skip, newline_elements)`
			`return text.strip()`


			`def _get_link_ratio(section: Tag) -> float:`
			`links = section.find_all("a")`
			`total_text = "".join(str(s) for s in section.stripped_strings)`
			`if len(total_text) == 0:`
			`return 0`

			`link_text = "".join(`
			`str(string.string.strip())`
			`for link in links`
			`for string in link.strings`
			`if string`
			`)`
			`return len(link_text) / len(total_text)`


			`def _process_element(`
			`element: Union[Tag, NavigableString, Comment],`
			`elements_to_skip: List[str],`
			`newline_elements: List[str],`
			`) -> str:`
			`"""`
			`Traverse through HTML tree recursively to preserve newline and skip`
			`unwanted (code/binary) elements`
			`"""`
			`from bs4 import NavigableString`
			`from bs4.element import Comment, Tag`

			`tag_name = getattr(element, "name", None)`
			`if isinstance(element, Comment) or tag_name in elements_to_skip:`
			`return ""`
			`elif isinstance(element, NavigableString):`
			`return element`
			`elif tag_name == "br":`
			`return "\n"`
			`elif tag_name in newline_elements:`
			`return (`
			`"".join(`
			`_process_element(child, elements_to_skip, newline_elements)`
			`for child in element.children`
			`if isinstance(child, (Tag, NavigableString, Comment))`
			`)`
			`+ "\n"`
			`)`
			`else:`
			`return "".join(`
			`_process_element(child, elements_to_skip, newline_elements)`
			`for child in element.children`
			`if isinstance(child, (Tag, NavigableString, Comment))`
			`)`