You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/langchain/document_loaders/readthedocs.py

101 lines
3.5 KiB
Python

"""Loader that loads ReadTheDocs documentation directory dump."""
from pathlib import Path
from typing import Any, List, Optional, Tuple, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class ReadTheDocsLoader(BaseLoader):
"""Loader that loads ReadTheDocs documentation directory dump."""
def __init__(
self,
path: Union[str, Path],
encoding: Optional[str] = None,
errors: Optional[str] = None,
custom_html_tag: Optional[Tuple[str, dict]] = None,
**kwargs: Optional[Any]
):
"""
Initialize ReadTheDocsLoader
The loader loops over all files under `path` and extract the actual content of
the files by retrieving main html tags. Default main html tags include
`<main id="main-content>`, <`div role="main>`, and `<article role="main">`. You
can also define your own html tags by passing custom_html_tag, e.g.
`("div", "class=main")`. The loader iterates html tags with the order of
custom html tags (if exists) and default html tags. If any of the tags is not
empty, the loop will break and retrieve the content out of that tag.
Args:
path: The location of pulled readthedocs folder.
encoding: The encoding with which to open the documents.
errors: Specifies how encoding and decoding errors are to be handled—this
cannot be used in binary mode.
custom_html_tag: Optional custom html tag to retrieve the content from
files.
"""
try:
from bs4 import BeautifulSoup
except ImportError:
raise ImportError(
"Could not import python packages. "
"Please install it with `pip install beautifulsoup4`. "
)
try:
_ = BeautifulSoup(
"<html><body>Parser builder library test.</body></html>", **kwargs
)
except Exception as e:
raise ValueError("Parsing kwargs do not appear valid") from e
self.file_path = Path(path)
self.encoding = encoding
self.errors = errors
self.custom_html_tag = custom_html_tag
self.bs_kwargs = kwargs
def load(self) -> List[Document]:
"""Load documents."""
docs = []
for p in self.file_path.rglob("*"):
if p.is_dir():
continue
with open(p, encoding=self.encoding, errors=self.errors) as f:
text = self._clean_data(f.read())
metadata = {"source": str(p)}
docs.append(Document(page_content=text, metadata=metadata))
return docs
def _clean_data(self, data: str) -> str:
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, **self.bs_kwargs)
# default tags
html_tags = [
("div", {"role": "main"}),
("main", {"id": "main-content"}),
]
if self.custom_html_tag is not None:
html_tags.append(self.custom_html_tag)
text = None
# reversed order. check the custom one first
for tag, attrs in html_tags[::-1]:
text = soup.find(tag, attrs)
# if found, break
if text is not None:
break
if text is not None:
text = text.get_text()
else:
text = ""
# trim empty lines
return "\n".join([t for t in text.split("\n") if t])