When encountering an error during fetch, return "" in web_base.py (#8753)

When downloading a sitemap that contains a malformed URL (e.g.
"ttp://example.com/index.html", with the "h" omitted at the beginning of
the URL), this change ensures that the sitemap download does not crash
but only emits a warning. (This could arguably be made opt-in via e.g. a
`skip_faulty_urls: bool = True` parameter, but this was the most
straightforward fix.)
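
A minimal usage sketch of the resulting behavior (the sitemap URL here is hypothetical; `SitemapLoader` gains the same flag in this commit):

```python
from langchain.document_loaders.sitemap import SitemapLoader

# With continue_on_failure=True, a malformed URL inside the sitemap
# (e.g. "ttp://example.com/index.html") is logged as a warning and
# fetched as "" instead of aborting the whole load.
loader = SitemapLoader(
    web_path="https://example.com/sitemap.xml",  # hypothetical URL
    continue_on_failure=True,
)
documents = loader.load()
```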

@rlancemartin, @eyurtsev
---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/8900/head
Oege Dijk authored 1 year ago, committed by GitHub
parent bbd22b9b76
commit cff52638b2

@@ -31,7 +31,7 @@ class BlackboardLoader(WebBaseLoader):
)
documents = loader.load()
"""
""" # noqa: E501
base_url: str
"""Base url of the blackboard course."""
@@ -47,6 +47,7 @@ class BlackboardLoader(WebBaseLoader):
load_all_recursively: bool = True,
basic_auth: Optional[Tuple[str, str]] = None,
cookies: Optional[dict] = None,
continue_on_failure: bool = False,
):
"""Initialize with blackboard course url.
@@ -58,6 +59,10 @@ class BlackboardLoader(WebBaseLoader):
load_all_recursively: If True, load all documents recursively.
basic_auth: Basic auth credentials.
cookies: Cookies.
continue_on_failure: Whether to continue loading the course contents if an
error occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but may
also result in missing data. Default: False
Raises:
ValueError: If blackboard course url is invalid.
@@ -80,6 +85,7 @@ class BlackboardLoader(WebBaseLoader):
cookies.update({"BbRouter": bbrouter})
self.session.cookies.update(cookies)
self.load_all_recursively = load_all_recursively
self.continue_on_failure = continue_on_failure
self.check_bs4()
def check_bs4(self) -> None:

@@ -19,6 +19,7 @@ class GitbookLoader(WebBaseLoader):
load_all_paths: bool = False,
base_url: Optional[str] = None,
content_selector: str = "main",
continue_on_failure: bool = False,
):
"""Initialize with web page and whether to load all paths.
@@ -31,6 +32,10 @@ class GitbookLoader(WebBaseLoader):
appended to this base url. Defaults to `web_page`.
content_selector: The CSS selector for the content to load.
Defaults to "main".
continue_on_failure: Whether to continue loading the site if an error
occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but may
also result in missing data. Default: False
"""
self.base_url = base_url or web_page
if self.base_url.endswith("/"):
@@ -43,6 +48,7 @@ class GitbookLoader(WebBaseLoader):
super().__init__(web_paths)
self.load_all_paths = load_all_paths
self.content_selector = content_selector
self.continue_on_failure = continue_on_failure
def load(self) -> List[Document]:
"""Fetch text from one single GitBook page."""

@@ -33,6 +33,7 @@ class SitemapLoader(WebBaseLoader):
blocknum: int = 0,
meta_function: Optional[Callable] = None,
is_local: bool = False,
continue_on_failure: bool = False,
):
"""Initialize with webpage path and optional filter URLs.
@@ -48,6 +49,10 @@ class SitemapLoader(WebBaseLoader):
remember when setting this method to also copy metadata["loc"]
to metadata["source"] if you are using this field
is_local: whether the sitemap is a local file. Default: False
continue_on_failure: Whether to continue loading the sitemap if an error
occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but may
also result in missing data. Default: False
"""
if blocksize is not None and blocksize < 1:
@@ -71,6 +76,7 @@ class SitemapLoader(WebBaseLoader):
self.blocksize = blocksize
self.blocknum = blocknum
self.is_local = is_local
self.continue_on_failure = continue_on_failure
def parse_sitemap(self, soup: Any) -> List[dict]:
"""Parse sitemap xml and load into a list of dicts.

@@ -62,6 +62,7 @@ class WebBaseLoader(BaseLoader):
header_template: Optional[dict] = None,
verify_ssl: Optional[bool] = True,
proxies: Optional[dict] = None,
continue_on_failure: bool = False,
):
"""Initialize with webpage path."""
@@ -96,6 +97,7 @@ class WebBaseLoader(BaseLoader):
self.session = requests.Session()
self.session.headers = dict(headers)
self.session.verify = verify_ssl
self.continue_on_failure = continue_on_failure
if proxies:
self.session.proxies.update(proxies)
@@ -133,7 +135,20 @@ class WebBaseLoader(BaseLoader):
self, url: str, semaphore: asyncio.Semaphore
) -> str:
async with semaphore:
return await self._fetch(url)
try:
return await self._fetch(url)
except Exception as e:
if self.continue_on_failure:
logger.warning(
f"Error fetching {url}, skipping due to"
f" continue_on_failure=True"
)
return ""
logger.exception(
f"Error fetching {url} and aborting, use continue_on_failure=True "
"to continue loading urls after encountering an error."
)
raise e
async def fetch_all(self, urls: List[str]) -> Any:
"""Fetch all urls concurrently with rate limiting."""
