diff --git a/langchain/document_loaders/readthedocs.py b/langchain/document_loaders/readthedocs.py index 99547f3f..a8ca2433 100644 --- a/langchain/document_loaders/readthedocs.py +++ b/langchain/document_loaders/readthedocs.py @@ -1,6 +1,6 @@ """Loader that loads ReadTheDocs documentation directory dump.""" from pathlib import Path -from typing import List +from typing import Any, List, Optional from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -9,16 +9,41 @@ from langchain.document_loaders.base import BaseLoader class ReadTheDocsLoader(BaseLoader): """Loader that loads ReadTheDocs documentation directory dump.""" - def __init__(self, path: str): + def __init__( + self, + path: str, + encoding: Optional[str] = None, + errors: Optional[str] = None, + **kwargs: Optional[Any] + ): """Initialize path.""" + try: + from bs4 import BeautifulSoup + + except ImportError: + raise ValueError( + "Could not import python packages. " + "Please install it with `pip install beautifulsoup4`. " + ) + + try: + _ = BeautifulSoup( + "Parser builder library test.", **kwargs + ) + except Exception as e: + raise ValueError("Parsing kwargs do not appear valid") from e + self.file_path = path + self.encoding = encoding + self.errors = errors + self.bs_kwargs = kwargs def load(self) -> List[Document]: """Load documents.""" from bs4 import BeautifulSoup def _clean_data(data: str) -> str: - soup = BeautifulSoup(data) + soup = BeautifulSoup(data, **self.bs_kwargs) text = soup.find_all("main", {"id": "main-content"}) if len(text) != 0: text = text[0].get_text() @@ -30,7 +55,7 @@ class ReadTheDocsLoader(BaseLoader): for p in Path(self.file_path).rglob("*"): if p.is_dir(): continue - with open(p) as f: + with open(p, encoding=self.encoding, errors=self.errors) as f: text = _clean_data(f.read()) metadata = {"source": str(p)} docs.append(Document(page_content=text, metadata=metadata))