langchain/libs/community/langchain_community/document_loaders/html_bs.py

import logging
from typing import Dict, List, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class BSHTMLLoader(BaseLoader):
    """Load `HTML` files and parse them with `beautiful soup`.

    The whole file is returned as a single `Document` whose content is the
    text extracted by ``soup.get_text`` and whose metadata holds the source
    path and the page title.
    """

    def __init__(
        self,
        file_path: str,
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialise with path, and optionally, file encoding to use, and any kwargs
        to pass to the BeautifulSoup object.

        Args:
            file_path: The path to the file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object. When
                omitted, ``{"features": "lxml"}`` is used.
            get_text_separator: The separator to use when calling get_text on the soup.

        Raises:
            ImportError: If the `bs4` package cannot be imported.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            # Chain the original error so the underlying import failure stays
            # visible in the traceback.
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from e

        self.file_path = file_path
        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    def load(self) -> List[Document]:
        """Load HTML document into document objects.

        Returns:
            A one-element list containing a `Document` whose ``page_content`` is
            the text of the parsed HTML and whose metadata has the keys
            ``"source"`` (the file path) and ``"title"`` (the page title, or an
            empty string when no title text is available).
        """
        from bs4 import BeautifulSoup

        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        # `Tag.string` is None when <title> is empty or has more than one child;
        # calling str() on it directly would store the literal text "None".
        title_string = soup.title.string if soup.title else None
        title = "" if title_string is None else str(title_string)

        metadata: Dict[str, Union[str, None]] = {
            # str() keeps the value a plain string even if a path-like object
            # was supplied as file_path.
            "source": str(self.file_path),
            "title": title,
        }
        return [Document(page_content=text, metadata=metadata)]
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`import logging`
			`from typing import Dict, List, Union`

			`from langchain_core.documents import Document`

			`from langchain_community.document_loaders.base import BaseLoader`

			`logger = logging.getLogger(__name__)`


			`class BSHTMLLoader(BaseLoader):`
			"""Load `HTML` files and parse them with `beautiful soup`."""

			`def __init__(`
			`self,`
			`file_path: str,`
			`open_encoding: Union[str, None] = None,`
			`bs_kwargs: Union[dict, None] = None,`
			`get_text_separator: str = "",`
			`) -> None:`
			`"""Initialise with path, and optionally, file encoding to use, and any kwargs`
			`to pass to the BeautifulSoup object.`

			`Args:`
			`file_path: The path to the file to load.`
			`open_encoding: The encoding to use when opening the file.`
			`bs_kwargs: Any kwargs to pass to the BeautifulSoup object.`
			`get_text_separator: The separator to use when calling get_text on the soup.`
			`"""`
			`try:`
			`import bs4 # noqa:F401`
			`except ImportError:`
			`raise ImportError(`
			`"beautifulsoup4 package not found, please install it with "`
			"`pip install beautifulsoup4`"
			`)`

			`self.file_path = file_path`
			`self.open_encoding = open_encoding`
			`if bs_kwargs is None:`
			`bs_kwargs = {"features": "lxml"}`
			`self.bs_kwargs = bs_kwargs`
			`self.get_text_separator = get_text_separator`

			`def load(self) -> List[Document]:`
			`"""Load HTML document into document objects."""`
			`from bs4 import BeautifulSoup`

			`with open(self.file_path, "r", encoding=self.open_encoding) as f:`
			`soup = BeautifulSoup(f, **self.bs_kwargs)`

			`text = soup.get_text(self.get_text_separator)`

			`if soup.title:`
			`title = str(soup.title.string)`
			`else:`
			`title = ""`

			`metadata: Dict[str, Union[str, None]] = {`
			`"source": self.file_path,`
			`"title": title,`
			`}`
			`return [Document(page_content=text, metadata=metadata)]`