From f1070de038aa4161d083eca999430d0eb482578e Mon Sep 17 00:00:00 2001 From: Janos Tolgyesi Date: Wed, 28 Jun 2023 01:43:59 +0200 Subject: [PATCH] WebBaseLoader: optionally raise exception in the case of http error (#6823) - **Description**: this PR adds the option to raise an exception when the HTTP request does not return a 2xx status code. This is particularly useful when the URL points to a non-existent web page: the server returns an HTTP status of 404 NOT FOUND, but WebBaseLoader still parses and returns the HTTP body of the error response. - **Dependencies**: none, - **Tag maintainer**: @rlancemartin, @eyurtsev, - **Twitter handle**: jtolgyesi --- langchain/document_loaders/web_base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py index 8ea5d74911..6769640605 100644 --- a/langchain/document_loaders/web_base.py +++ b/langchain/document_loaders/web_base.py @@ -50,6 +50,9 @@ class WebBaseLoader(BaseLoader): requests_kwargs: Dict[str, Any] = {} """kwargs for requests""" + raise_for_status: bool = False + """Raise an exception if http status code denotes an error.""" + bs_get_text_kwargs: Dict[str, Any] = {} """kwargs for beatifulsoup4 get_text""" @@ -189,6 +192,8 @@ class WebBaseLoader(BaseLoader): self._check_parser(parser) html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs) + if self.raise_for_status: + html_doc.raise_for_status() html_doc.encoding = html_doc.apparent_encoding return BeautifulSoup(html_doc.text, parser)