WebBaseLoader: optionally raise exception in the case of http error (#6823)

- **Description**: this PR adds the possibility to raise an exception in
the case the http request did not return a 2xx status code. This is
particularly useful in the situation when the url points to a
non-existent web page, the server returns a http status of 404 NOT
FOUND, but WebBaseLoader anyway parses and returns the http body of the
error message.
  - **Dependencies**: none,
  - **Tag maintainer**: @rlancemartin, @eyurtsev,
  - **Twitter handle**: jtolgyesi
This commit is contained in:
Janos Tolgyesi 2023-06-28 01:43:59 +02:00 committed by GitHub
parent ef72a7cf26
commit f1070de038
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -50,6 +50,9 @@ class WebBaseLoader(BaseLoader):
requests_kwargs: Dict[str, Any] = {}
"""kwargs for requests"""
raise_for_status: bool = False
"""Raise an exception if http status code denotes an error."""
bs_get_text_kwargs: Dict[str, Any] = {}
"""kwargs for beatifulsoup4 get_text"""
@ -189,6 +192,8 @@ class WebBaseLoader(BaseLoader):
self._check_parser(parser)
html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs)
if self.raise_for_status:
html_doc.raise_for_status()
html_doc.encoding = html_doc.apparent_encoding
return BeautifulSoup(html_doc.text, parser)