# langchain/libs/community/langchain_community/document_loaders/chm.py

from typing import TYPE_CHECKING, Dict, List, Union

from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

if TYPE_CHECKING:
    from chm import chm


class UnstructuredCHMLoader(UnstructuredFileLoader):
    """Load `CHM` files using `Unstructured`.

    CHM means Microsoft Compiled HTML Help.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredCHMLoader

    loader = UnstructuredCHMLoader("example.chm")
    docs = loader.load()

    References
    ----------
    https://github.com/dottedmag/pychm
    http://www.jedrea.com/chmlib/
    """

    def _get_elements(self) -> List:
        """Partition every indexed page of the CHM file into elements.

        Each page returned by ``CHMParser.load_all`` is passed to
        ``partition_html`` together with ``self.unstructured_kwargs``.
        ``partition_html`` yields a list of elements per page; those lists
        are flattened so the result is a single list of elements rather
        than a list of per-page lists.

        Returns:
            The elements of all pages, in topics-tree order.
        """
        from unstructured.partition.html import partition_html

        with CHMParser(self.file_path) as f:  # type: ignore[arg-type]
            return [
                element
                for item in f.load_all()
                # A blank page has nothing to partition, so it is not handed
                # to partition_html at all.
                if item["content"].strip()
                for element in partition_html(
                    text=item["content"], **self.unstructured_kwargs
                )
            ]


class CHMParser:
    """Microsoft Compiled HTML Help (CHM) Parser.

    Wraps a ``chm.CHMFile`` and can be used as a context manager; leaving
    the ``with`` block closes the underlying archive.
    """

    path: str
    file: "chm.CHMFile"

    def __init__(self, path: str):
        """Open the CHM archive located at ``path``.

        Raises:
            OSError: If ``LoadCHM`` reports failure for ``path``.
        """
        from chm import chm

        self.path = path
        self.file = chm.CHMFile()
        # LoadCHM reports failure through a 0 return value instead of raising,
        # so surface it here rather than failing later on a half-open archive.
        if self.file.LoadCHM(path) == 0:
            raise OSError(f"Failed to load CHM file: {path}")

    def __enter__(self):  # type: ignore[no-untyped-def]
        return self

    def __exit__(self, exc_type, exc_value, traceback):  # type: ignore[no-untyped-def]
        if self.file:
            self.file.CloseCHM()

    @property
    def encoding(self) -> str:
        """Name of the text encoding used to decode the archive's content.

        Falls back to ``"utf-8"`` when the archive does not report an
        encoding.
        """
        raw = self.file.GetEncoding()
        if not raw:
            return "utf-8"
        if isinstance(raw, bytes):
            return raw.decode("utf-8")
        return raw

    def index(self) -> List[Dict[str, str]]:
        """Return the entries of the archive's topics tree.

        Returns:
            One ``{"name": ..., "local": ...}`` dict per ``<object>`` that has
            both a ``Name`` and a ``Local`` param; ``local`` is the path
            component of the param value and always starts with ``/``.
            An empty list is returned when the archive has no topics tree.
        """
        topics = self.file.GetTopicsTree()
        if not topics:
            return []

        from urllib.parse import urlparse

        from bs4 import BeautifulSoup

        entries: List[Dict[str, str]] = []
        # NOTE(review): no parser is named, so BeautifulSoup picks one from
        # those installed; kept as-is to avoid changing parse results.
        soup = BeautifulSoup(topics.decode(self.encoding))
        # <OBJECT ..>
        for obj in soup.find_all("object"):
            # <param name="Name" value="<...>">
            # <param name="Local" value="<...>">
            name = ""
            local = ""
            for param in obj.find_all("param"):
                # .get() tolerates <param> tags missing "name" or "value".
                key = param.get("name")
                if key == "Name":
                    name = param.get("value", "")
                elif key == "Local":
                    local = param.get("value", "")
            if not name or not local:
                continue

            # Keep only the path part (drops any query string or fragment).
            local = urlparse(local).path
            if not local.startswith("/"):
                local = "/" + local
            entries.append({"name": name, "local": local})

        return entries

    def load(self, path: Union[str, bytes]) -> str:
        """Return the decoded content of the object stored at ``path``.

        Args:
            path: Path of the object inside the archive; ``str`` values are
                encoded as UTF-8 before the lookup.

        Raises:
            FileNotFoundError: If the archive has no object at ``path``.
        """
        if isinstance(path, str):
            path = path.encode("utf-8")
        resolved = self.file.ResolveObject(path)
        status, unit = resolved[0], resolved[1]
        # chmlib's CHM_RESOLVE_SUCCESS is 0; anything else means no object.
        if status != 0 or unit is None:
            raise FileNotFoundError(
                f"Object {path!r} not found in CHM file: {self.path}"
            )
        return self.file.RetrieveObject(unit)[1].decode(self.encoding)

    def load_all(self) -> List[Dict[str, str]]:
        """Load the content of every entry listed by :meth:`index`.

        Returns:
            One dict per entry with the keys ``name``, ``local`` and
            ``content``.
        """
        return [
            {
                "name": item["name"],
                "local": item["local"],
                "content": self.load(item["local"]),
            }
            for item in self.index()
        ]
Feat: add CHM file loader (#15519) fix https://github.com/langchain-ai/langchain/issues/15469 2024-01-07 17:28:52 +00:00			`from typing import TYPE_CHECKING, Dict, List, Union`

			`from langchain_community.document_loaders.unstructured import UnstructuredFileLoader`

			`if TYPE_CHECKING:`
			`from chm import chm`


			`class UnstructuredCHMLoader(UnstructuredFileLoader):`
			"""Load `CHM` files using `Unstructured`.

community[patch]: docstrings (#16810) - added missed docstrings - formated docstrings to the consistent form 2024-02-09 20:48:57 +00:00			`CHM means Microsoft Compiled HTML Help.`
Feat: add CHM file loader (#15519) fix https://github.com/langchain-ai/langchain/issues/15469 2024-01-07 17:28:52 +00:00
			`Examples`
			`--------`
			`from langchain_community.document_loaders import UnstructuredCHMLoader`

			`loader = UnstructuredCHMLoader("example.chm")`
			`docs = loader.load()`

			`References`
			`----------`
			`https://github.com/dottedmag/pychm`
			`http://www.jedrea.com/chmlib/`
			`"""`

			`def _get_elements(self) -> List:`
			`from unstructured.partition.html import partition_html`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`with CHMParser(self.file_path) as f: # type: ignore[arg-type]`
Feat: add CHM file loader (#15519) fix https://github.com/langchain-ai/langchain/issues/15469 2024-01-07 17:28:52 +00:00			`return [`
			`partition_html(text=item["content"], **self.unstructured_kwargs)`
			`for item in f.load_all()`
			`]`


			`class CHMParser(object):`
community[patch]: docstrings (#16810) - added missed docstrings - formated docstrings to the consistent form 2024-02-09 20:48:57 +00:00			`"""Microsoft Compiled HTML Help (CHM) Parser."""`

Feat: add CHM file loader (#15519) fix https://github.com/langchain-ai/langchain/issues/15469 2024-01-07 17:28:52 +00:00			`path: str`
			`file: "chm.CHMFile"`

			`def __init__(self, path: str):`
			`from chm import chm`

			`self.path = path`
			`self.file = chm.CHMFile()`
			`self.file.LoadCHM(path)`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def __enter__(self): # type: ignore[no-untyped-def]`
Feat: add CHM file loader (#15519) fix https://github.com/langchain-ai/langchain/issues/15469 2024-01-07 17:28:52 +00:00			`return self`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def __exit__(self, exc_type, exc_value, traceback): # type: ignore[no-untyped-def]`
Feat: add CHM file loader (#15519) fix https://github.com/langchain-ai/langchain/issues/15469 2024-01-07 17:28:52 +00:00			`if self.file:`
			`self.file.CloseCHM()`

			`@property`
			`def encoding(self) -> str:`
			`return self.file.GetEncoding().decode("utf-8")`

			`def index(self) -> List[Dict[str, str]]:`
			`from urllib.parse import urlparse`

			`from bs4 import BeautifulSoup`

			`res = []`
			`index = self.file.GetTopicsTree().decode(self.encoding)`
			`soup = BeautifulSoup(index)`
			`# <OBJECT ..>`
			`for obj in soup.find_all("object"):`
			`# <param name="Name" value="<...>">`
			`# <param name="Local" value="<...>">`
			`name = ""`
			`local = ""`
			`for param in obj.find_all("param"):`
			`if param["name"] == "Name":`
			`name = param["value"]`
			`if param["name"] == "Local":`
			`local = param["value"]`
			`if not name or not local:`
			`continue`

			`local = urlparse(local).path`
			`if not local.startswith("/"):`
			`local = "/" + local`
			`res.append({"name": name, "local": local})`

			`return res`

			`def load(self, path: Union[str, bytes]) -> str:`
			`if isinstance(path, str):`
			`path = path.encode("utf-8")`
			`obj = self.file.ResolveObject(path)[1]`
			`return self.file.RetrieveObject(obj)[1].decode(self.encoding)`

			`def load_all(self) -> List[Dict[str, str]]:`
			`res = []`
			`index = self.index()`
			`for item in index:`
			`content = self.load(item["local"])`
			`res.append(`
			`{"name": item["name"], "local": item["local"], "content": content}`
			`)`
			`return res`