From 51193309ea713e515667747e8acee64deabc2b77 Mon Sep 17 00:00:00 2001 From: plpycoin <103234125+plpycoin@users.noreply.github.com> Date: Thu, 12 Oct 2023 23:32:06 +0800 Subject: [PATCH] Update readthedocs.py (#11110) Only parse .htm/.html files by default; other files (.svg, .png, favicon.ico) crash the processing phase. --------- Co-authored-by: Eugene Yurtsev --- .../langchain/document_loaders/readthedocs.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/readthedocs.py b/libs/langchain/langchain/document_loaders/readthedocs.py index a123f6a72e..6aa3ddfd91 100644 --- a/libs/langchain/langchain/document_loaders/readthedocs.py +++ b/libs/langchain/langchain/document_loaders/readthedocs.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Sequence, Tuple, Union from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -14,6 +14,7 @@ class ReadTheDocsLoader(BaseLoader): encoding: Optional[str] = None, errors: Optional[str] = None, custom_html_tag: Optional[Tuple[str, dict]] = None, + patterns: Sequence[str] = ("*.htm", "*.html"), **kwargs: Optional[Any] ): """ @@ -34,6 +35,8 @@ class ReadTheDocsLoader(BaseLoader): cannot be used in binary mode. custom_html_tag: Optional custom html tag to retrieve the content from files. + patterns: The file patterns to load, passed to `pathlib.Path.rglob`. + kwargs: named arguments passed to `bs4.BeautifulSoup`. 
""" try: from bs4 import BeautifulSoup @@ -54,18 +57,20 @@ class ReadTheDocsLoader(BaseLoader): self.encoding = encoding self.errors = errors self.custom_html_tag = custom_html_tag + self.patterns = patterns self.bs_kwargs = kwargs def load(self) -> List[Document]: """Load documents.""" docs = [] - for p in self.file_path.rglob("*"): - if p.is_dir(): - continue - with open(p, encoding=self.encoding, errors=self.errors) as f: - text = self._clean_data(f.read()) - metadata = {"source": str(p)} - docs.append(Document(page_content=text, metadata=metadata)) + for file_pattern in self.patterns: + for p in self.file_path.rglob(file_pattern): + if p.is_dir(): + continue + with open(p, encoding=self.encoding, errors=self.errors) as f: + text = self._clean_data(f.read()) + metadata = {"source": str(p)} + docs.append(Document(page_content=text, metadata=metadata)) return docs def _clean_data(self, data: str) -> str: