diff --git a/langchain/document_loaders/readthedocs.py b/langchain/document_loaders/readthedocs.py index bb10a0e9..979b4ae1 100644 --- a/langchain/document_loaders/readthedocs.py +++ b/langchain/document_loaders/readthedocs.py @@ -1,6 +1,6 @@ """Loader that loads ReadTheDocs documentation directory dump.""" from pathlib import Path -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple, Union from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -11,12 +11,31 @@ class ReadTheDocsLoader(BaseLoader): def __init__( self, - path: str, + path: Union[str, Path], encoding: Optional[str] = None, errors: Optional[str] = None, + custom_html_tag: Optional[Tuple[str, dict]] = None, **kwargs: Optional[Any] ): - """Initialize path.""" + """ + Initialize ReadTheDocsLoader + + The loader loops over all files under `path` and extract the actual content of + the files by retrieving main html tags. Default main html tags include + `
<main id="main-content">`, and `<div role="main">
`. You + can also define your own html tags by passing custom_html_tag, e.g. + `("div", {"class": "main"})`. The loader tries the custom html tag first (if + given), then the default html tags. As soon as one of the tags is found, the + search stops and the content is retrieved from that tag. + + Args: + path: The location of pulled readthedocs folder. + encoding: The encoding with which to open the documents. + errors: Specifies how encoding and decoding errors are to be handled—this + cannot be used in binary mode. + custom_html_tag: Optional custom html tag to retrieve the content from + files. + """ try: from bs4 import BeautifulSoup except ImportError: @@ -32,34 +51,50 @@ class ReadTheDocsLoader(BaseLoader): except Exception as e: raise ValueError("Parsing kwargs do not appear valid") from e - self.file_path = path + self.file_path = Path(path) self.encoding = encoding self.errors = errors + self.custom_html_tag = custom_html_tag self.bs_kwargs = kwargs def load(self) -> List[Document]: """Load documents.""" - from bs4 import BeautifulSoup - - def _clean_data(data: str) -> str: - soup = BeautifulSoup(data, **self.bs_kwargs) - text = soup.find_all("main", {"id": "main-content"}) - - if len(text) == 0: - text = soup.find_all("div", {"role": "main"}) - - if len(text) != 0: - text = text[0].get_text() - else: - text = "" - return "\n".join([t for t in text.split("\n") if t]) - docs = [] - for p in Path(self.file_path).rglob("*"): + for p in self.file_path.rglob("*"): if p.is_dir(): continue with open(p, encoding=self.encoding, errors=self.errors) as f: - text = _clean_data(f.read()) + text = self._clean_data(f.read()) metadata = {"source": str(p)} docs.append(Document(page_content=text, metadata=metadata)) return docs + + def _clean_data(self, data: str) -> str: + from bs4 import BeautifulSoup + + soup = BeautifulSoup(data, **self.bs_kwargs) + + # default tags + html_tags = [ + ("div", {"role": "main"}), + ("main", {"id": "main-content"}), + ] + + if 
self.custom_html_tag is not None: + html_tags.append(self.custom_html_tag) + + text = None + + # reversed order. check the custom one first + for tag, attrs in html_tags[::-1]: + text = soup.find(tag, attrs) + # if found, break + if text is not None: + break + + if text is not None: + text = text.get_text() + else: + text = "" + # trim empty lines + return "\n".join([t for t in text.split("\n") if t]) diff --git a/tests/unit_tests/document_loaders/test_docs/readthedocs/custom/test.html b/tests/unit_tests/document_loaders/test_docs/readthedocs/custom/test.html new file mode 100644 index 00000000..9b148a77 --- /dev/null +++ b/tests/unit_tests/document_loaders/test_docs/readthedocs/custom/test.html @@ -0,0 +1,5 @@ + +
<article role="main"> + Hello World! +</article>
+ \ No newline at end of file diff --git a/tests/unit_tests/document_loaders/test_docs/readthedocs/div_role_main/test.html b/tests/unit_tests/document_loaders/test_docs/readthedocs/div_role_main/test.html new file mode 100644 index 00000000..d528c7ae --- /dev/null +++ b/tests/unit_tests/document_loaders/test_docs/readthedocs/div_role_main/test.html @@ -0,0 +1,5 @@ + +
<div role="main"> + Hello World! +</div>
+ \ No newline at end of file diff --git a/tests/unit_tests/document_loaders/test_docs/readthedocs/main_id_main_content/test.html b/tests/unit_tests/document_loaders/test_docs/readthedocs/main_id_main_content/test.html new file mode 100644 index 00000000..1790735a --- /dev/null +++ b/tests/unit_tests/document_loaders/test_docs/readthedocs/main_id_main_content/test.html @@ -0,0 +1,5 @@ + +
<main id="main-content"> + Hello World! +</main>
+ \ No newline at end of file diff --git a/tests/unit_tests/document_loaders/test_readthedoc.py b/tests/unit_tests/document_loaders/test_readthedoc.py new file mode 100644 index 00000000..9bcaae2f --- /dev/null +++ b/tests/unit_tests/document_loaders/test_readthedoc.py @@ -0,0 +1,40 @@ +from pathlib import Path + +import pytest + +from langchain.document_loaders.readthedocs import ReadTheDocsLoader + +PARENT_DIR = Path(__file__).parent / "test_docs" / "readthedocs" + + +@pytest.mark.requires("bs4") +def test_main_id_main_content() -> None: + loader = ReadTheDocsLoader(PARENT_DIR / "main_id_main_content") + documents = loader.load() + assert len(documents[0].page_content) != 0 + + +@pytest.mark.requires("bs4") +def test_div_role_main() -> None: + loader = ReadTheDocsLoader(PARENT_DIR / "div_role_main") + documents = loader.load() + assert len(documents[0].page_content) != 0 + + +@pytest.mark.requires("bs4") +def test_custom() -> None: + loader = ReadTheDocsLoader( + PARENT_DIR / "custom", + custom_html_tag=("article", {"role": "main"}), + ) + documents = loader.load() + assert len(documents[0].page_content) != 0 + + +@pytest.mark.requires("bs4") +def test_empty() -> None: + loader = ReadTheDocsLoader( + PARENT_DIR / "custom", + ) + documents = loader.load() + assert len(documents[0].page_content) == 0