Harrison/rtd loader (#1513)

Co-authored-by: Youssef A. Abukwaik <yousseb@users.noreply.github.com>
This commit is contained in:
Harrison Chase 2023-03-07 21:09:54 -08:00 committed by GitHub
parent 8f21605d71
commit a4a2d79087
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -1,6 +1,6 @@
"""Loader that loads ReadTheDocs documentation directory dump.""" """Loader that loads ReadTheDocs documentation directory dump."""
from pathlib import Path from pathlib import Path
from typing import List from typing import Any, List, Optional
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
@@ -9,16 +9,41 @@ from langchain.document_loaders.base import BaseLoader
class ReadTheDocsLoader(BaseLoader): class ReadTheDocsLoader(BaseLoader):
"""Loader that loads ReadTheDocs documentation directory dump.""" """Loader that loads ReadTheDocs documentation directory dump."""
def __init__(
    self,
    path: str,
    encoding: Optional[str] = None,
    errors: Optional[str] = None,
    **kwargs: Optional[Any]
):
    """Initialize the loader and validate the BeautifulSoup options.

    Args:
        path: Directory that ``load`` walks recursively for files.
        encoding: Passed as ``encoding`` to ``open`` when each file is
            read; ``None`` defers to ``open``'s own default.
        errors: Passed as ``errors`` to ``open`` when each file is read;
            ``None`` defers to ``open``'s own default.
        **kwargs: Extra keyword arguments forwarded to every
            ``BeautifulSoup(...)`` call made by this loader.

    Raises:
        ValueError: If ``bs4`` cannot be imported, or if constructing a
            ``BeautifulSoup`` object with ``kwargs`` fails.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError as e:
        # Chain the original error explicitly, the same way the kwargs
        # check below does, so the underlying import failure stays
        # attached to the ValueError that callers see.
        raise ValueError(
            "Could not import python packages. "
            "Please install it with `pip install beautifulsoup4`. "
        ) from e

    # Fail fast: build a throwaway parser on a tiny document so invalid
    # parser options surface at construction time rather than in load().
    try:
        _ = BeautifulSoup(
            "<html><body>Parser builder library test.</body></html>", **kwargs
        )
    except Exception as e:
        raise ValueError("Parsing kwargs do not appear valid") from e

    self.file_path = path
    self.encoding = encoding
    self.errors = errors
    self.bs_kwargs = kwargs
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
def _clean_data(data: str) -> str: def _clean_data(data: str) -> str:
soup = BeautifulSoup(data) soup = BeautifulSoup(data, **self.bs_kwargs)
text = soup.find_all("main", {"id": "main-content"}) text = soup.find_all("main", {"id": "main-content"})
if len(text) != 0: if len(text) != 0:
text = text[0].get_text() text = text[0].get_text()
@@ -30,7 +55,7 @@ class ReadTheDocsLoader(BaseLoader):
for p in Path(self.file_path).rglob("*"): for p in Path(self.file_path).rglob("*"):
if p.is_dir(): if p.is_dir():
continue continue
with open(p) as f: with open(p, encoding=self.encoding, errors=self.errors) as f:
text = _clean_data(f.read()) text = _clean_data(f.read())
metadata = {"source": str(p)} metadata = {"source": str(p)}
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))