2024-01-07 17:28:52 +00:00
|
|
|
from typing import TYPE_CHECKING, Dict, List, Union
|
|
|
|
|
|
|
|
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
from chm import chm
|
|
|
|
|
|
|
|
|
|
|
|
class UnstructuredCHMLoader(UnstructuredFileLoader):
    """Load `CHM` files using `Unstructured`.

    CHM stands for Microsoft Compiled HTML Help. Every page listed in the
    archive's topics tree is read through ``CHMParser`` and its HTML is
    handed to ``unstructured``'s ``partition_html``.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredCHMLoader

    loader = UnstructuredCHMLoader("example.chm")
    docs = loader.load()

    References
    ----------
    https://github.com/dottedmag/pychm
    http://www.jedrea.com/chmlib/
    """

    def _get_elements(self) -> List:
        """Partition every page of the CHM archive at ``self.file_path``.

        Returns:
            A list with one entry per page returned by
            ``CHMParser.load_all()``; each entry is whatever
            ``partition_html`` returns for that page's HTML content.
        """
        # Function-scope import: ``unstructured`` is only needed when
        # elements are actually requested.
        from unstructured.partition.html import partition_html

        partitioned: List = []
        # The context manager closes the CHM handle even if partitioning fails.
        with CHMParser(self.file_path) as parser:  # type: ignore[arg-type]
            for page in parser.load_all():
                # NOTE(review): the per-page result is appended as-is, so if
                # partition_html returns a list this is a list of lists --
                # confirm that is what the base loader expects.
                page_elements = partition_html(
                    text=page["content"], **self.unstructured_kwargs
                )
                partitioned.append(page_elements)
        return partitioned
|
|
|
|
|
|
|
|
|
|
|
|
class CHMParser:
    """Microsoft Compiled HTML Help (CHM) Parser.

    Wraps a ``chm.CHMFile`` handle: the archive is opened in ``__init__`` and
    closed in ``__exit__``, so the parser is meant to be used in a ``with``
    statement.
    """

    # Path of the CHM archive, exactly as passed to the constructor.
    path: str
    # Underlying pychm handle used for every read.
    file: "chm.CHMFile"

    def __init__(self, path: str) -> None:
        """Open the CHM archive located at *path*.

        Args:
            path: Location of the ``.chm`` file to load.
        """
        # Function-scope import: pychm is only required once a parser is built.
        from chm import chm

        self.path = path
        self.file = chm.CHMFile()
        # NOTE(review): the result of LoadCHM is not checked here -- confirm
        # whether a failed open should raise instead of failing later.
        self.file.LoadCHM(path)

    def __enter__(self) -> "CHMParser":
        """Return the parser itself for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type: object, exc_value: object, traceback: object) -> None:
        """Close the underlying CHM handle, if there is one.

        Exceptions raised inside the ``with`` block are not suppressed.
        """
        if self.file:
            self.file.CloseCHM()

    @property
    def encoding(self) -> str:
        """Codec name used to decode the archive's text.

        Returns:
            The value of ``GetEncoding()`` decoded to ``str``, or ``"utf-8"``
            when the archive reports no encoding (``None`` or empty), which
            previously failed with ``AttributeError`` / ``LookupError``.
        """
        raw = self.file.GetEncoding()
        if not raw:
            return "utf-8"
        return raw.decode("utf-8")

    def index(self) -> List[Dict[str, str]]:
        """List the pages referenced by the archive's topics tree.

        Returns:
            One ``{"name": ..., "local": ...}`` dict per ``<object>`` entry
            that has both a ``Name`` and a ``Local`` param. ``local`` is
            reduced to its URL path and always starts with ``/``. An archive
            without a topics tree yields an empty list.
        """
        raw_tree = self.file.GetTopicsTree()
        if not raw_tree:
            # No table of contents: nothing to parse, so nothing to import.
            return []

        from urllib.parse import urlparse

        from bs4 import BeautifulSoup

        markup = raw_tree.decode(self.encoding)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed (and no GuessedAtParserWarning).
        soup = BeautifulSoup(markup, "html.parser")

        res = []
        # <OBJECT ..>
        for obj in soup.find_all("object"):
            # <param name="Name" value="<...>">
            # <param name="Local" value="<...>">
            name = ""
            local = ""
            for param in obj.find_all("param"):
                # .get() tolerates <param> tags missing either attribute.
                key = param.get("name")
                if key == "Name":
                    name = param.get("value", "")
                elif key == "Local":
                    local = param.get("value", "")
            if not name or not local:
                continue

            # Keep only the path part (drops any query string / fragment).
            local = urlparse(local).path
            if not local.startswith("/"):
                local = "/" + local
            res.append({"name": name, "local": local})

        return res

    def load(self, path: Union[str, bytes]) -> str:
        """Read one object from the archive and decode it to text.

        Args:
            path: Path of the object inside the archive; ``str`` values are
                encoded as UTF-8 before the lookup.

        Returns:
            The object's content decoded with ``self.encoding``.
        """
        if isinstance(path, str):
            path = path.encode("utf-8")
        # Both calls return tuples; only the second element is used.
        # NOTE(review): the first element of ResolveObject's result is
        # ignored -- presumably a status code; confirm how a missing object
        # should be reported.
        obj = self.file.ResolveObject(path)[1]
        return self.file.RetrieveObject(obj)[1].decode(self.encoding)

    def load_all(self) -> List[Dict[str, str]]:
        """Load the content of every page listed by ``index()``.

        Returns:
            One ``{"name": ..., "local": ..., "content": ...}`` dict per index
            entry, in index order.
        """
        return [
            {
                "name": item["name"],
                "local": item["local"],
                "content": self.load(item["local"]),
            }
            for item in self.index()
        ]
|