Allow readthedoc loader to pass custom html tag (#5175)

## Description

The HTML structure of ReadTheDocs pages can differ from site to site.
Currently, the HTML tag is hardcoded in the loader, so some sites cannot be
parsed. This PR includes the following changes:

1. Replace `find_all` with `find` because we just want one tag.
2. Provide `custom_html_tag` to the loader.
3. Add tests for readthedoc loader
4. Refactor code

## Issues

See more in https://github.com/hwchase17/langchain/pull/2609. The
problem was not completely fixed in that pr.
---------

Signed-off-by: byhsu <byhsu@linkedin.com>
Co-authored-by: byhsu <byhsu@linkedin.com>
Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
This commit is contained in:
ByronHsu 2023-05-24 10:40:27 -07:00 committed by GitHub
parent d8eed6018f
commit f0730c6489
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 111 additions and 21 deletions

View File

@ -1,6 +1,6 @@
"""Loader that loads ReadTheDocs documentation directory dump."""
from pathlib import Path
from typing import Any, List, Optional
from typing import Any, List, Optional, Tuple, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
@ -11,12 +11,31 @@ class ReadTheDocsLoader(BaseLoader):
def __init__(
self,
path: str,
path: Union[str, Path],
encoding: Optional[str] = None,
errors: Optional[str] = None,
custom_html_tag: Optional[Tuple[str, dict]] = None,
**kwargs: Optional[Any]
):
"""Initialize path."""
"""
Initialize ReadTheDocsLoader
The loader loops over all files under `path` and extracts the actual content of
the files by retrieving main html tags. Default main html tags are
`<main id="main-content">` and `<div role="main">`. Pages that use another tag,
such as `<article role="main">`, need a custom tag: pass custom_html_tag as a
`(tag_name, attributes_dict)` pair, e.g. `("div", {"class": "main"})`. The
loader tries the custom html tag first (if given) and then the default html
tags. As soon as one of the tags is found, the loop stops and the content is
retrieved from that tag.
Args:
path: The location of pulled readthedocs folder.
encoding: The encoding with which to open the documents.
errors: Specifies how encoding and decoding errors are to be handled; this
cannot be used in binary mode.
custom_html_tag: Optional custom html tag to retrieve the content from
files.
"""
try:
from bs4 import BeautifulSoup
except ImportError:
@ -32,34 +51,50 @@ class ReadTheDocsLoader(BaseLoader):
except Exception as e:
raise ValueError("Parsing kwargs do not appear valid") from e
self.file_path = path
self.file_path = Path(path)
self.encoding = encoding
self.errors = errors
self.custom_html_tag = custom_html_tag
self.bs_kwargs = kwargs
def load(self) -> List[Document]:
"""Load documents."""
from bs4 import BeautifulSoup
def _clean_data(data: str) -> str:
soup = BeautifulSoup(data, **self.bs_kwargs)
text = soup.find_all("main", {"id": "main-content"})
if len(text) == 0:
text = soup.find_all("div", {"role": "main"})
if len(text) != 0:
text = text[0].get_text()
else:
text = ""
return "\n".join([t for t in text.split("\n") if t])
docs = []
for p in Path(self.file_path).rglob("*"):
for p in self.file_path.rglob("*"):
if p.is_dir():
continue
with open(p, encoding=self.encoding, errors=self.errors) as f:
text = _clean_data(f.read())
text = self._clean_data(f.read())
metadata = {"source": str(p)}
docs.append(Document(page_content=text, metadata=metadata))
return docs
def _clean_data(self, data: str) -> str:
    """Return the text of the main content tag of an html page.

    The custom html tag (when one was given) is looked up first, followed by
    `<main id="main-content">` and then `<div role="main">`. The text of the
    first tag found is returned with empty lines removed; if none of the
    tags is found, an empty string is returned.

    Args:
        data: Raw html of a single page.

    Returns:
        The extracted text, one non-empty line per line.
    """
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(data, **self.bs_kwargs)

    # Candidates listed in lookup priority; a custom tag goes to the front.
    candidates = [
        ("main", {"id": "main-content"}),
        ("div", {"role": "main"}),
    ]
    if self.custom_html_tag is not None:
        candidates.insert(0, self.custom_html_tag)

    element = None
    for name, attributes in candidates:
        element = parsed.find(name, attributes)
        if element is not None:
            # First match wins.
            break

    content = "" if element is None else element.get_text()
    # Drop empty lines only; lines that contain whitespace are kept.
    return "\n".join(line for line in content.split("\n") if line)

View File

@ -0,0 +1,5 @@
<html>
<article role="main">
Hello World!
</article>
</html>

View File

@ -0,0 +1,5 @@
<html>
<div role="main">
Hello World!
</div>
</html>

View File

@ -0,0 +1,5 @@
<html>
<main id="main-content">
Hello World!
</main>
</html>

View File

@ -0,0 +1,40 @@
from pathlib import Path
import pytest
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
PARENT_DIR = Path(__file__).parent / "test_docs" / "readthedocs"
@pytest.mark.requires("bs4")
def test_main_id_main_content() -> None:
    """Load the `main_id_main_content` fixtures with default tags; expect content."""
    docs = ReadTheDocsLoader(PARENT_DIR / "main_id_main_content").load()
    assert docs[0].page_content != ""
@pytest.mark.requires("bs4")
def test_div_role_main() -> None:
    """Load the `div_role_main` fixtures with default tags; expect content."""
    docs = ReadTheDocsLoader(PARENT_DIR / "div_role_main").load()
    assert docs[0].page_content != ""
@pytest.mark.requires("bs4")
def test_custom() -> None:
    """Load the `custom` fixtures with a custom html tag; expect content."""
    article_tag = ("article", {"role": "main"})
    docs = ReadTheDocsLoader(
        PARENT_DIR / "custom", custom_html_tag=article_tag
    ).load()
    assert docs[0].page_content != ""
@pytest.mark.requires("bs4")
def test_empty() -> None:
    """Load the `custom` fixtures without a custom html tag; expect no content."""
    docs = ReadTheDocsLoader(PARENT_DIR / "custom").load()
    assert docs[0].page_content == ""