From 2eec687474252cd8083da09ccc5ac5bf89a59611 Mon Sep 17 00:00:00 2001 From: Dhruvil Shah <48760936+jackfrost1411@users.noreply.github.com> Date: Sat, 17 Jun 2023 11:10:48 -0700 Subject: [PATCH] update web_base.py to have verify option (#6107) We propose an enhancement to the web-based loader initialize method by introducing a "verify" option. This enhancement addresses the issue of SSL verification errors encountered on certain web pages. By providing users with the option to set the verify parameter to False, we offer greater flexibility and control. ### Fixes #6079 #### Who can review? @eyurtsev @hwchase17 --------- Co-authored-by: Harrison Chase --- langchain/document_loaders/web_base.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py index cee218da..bd699eb8 100644 --- a/langchain/document_loaders/web_base.py +++ b/langchain/document_loaders/web_base.py @@ -51,7 +51,10 @@ class WebBaseLoader(BaseLoader): """kwargs for requests""" def __init__( - self, web_path: Union[str, List[str]], header_template: Optional[dict] = None + self, + web_path: Union[str, List[str]], + header_template: Optional[dict] = None, + verify: Optional[bool] = True, ): """Initialize with webpage path.""" @@ -71,6 +74,9 @@ class WebBaseLoader(BaseLoader): "bs4 package not found, please install it with " "`pip install bs4`" ) + # Choose to verify + self.verify = verify + headers = header_template or default_header_template if not headers.get("User-Agent"): try: @@ -98,7 +104,7 @@ class WebBaseLoader(BaseLoader): for i in range(retries): try: async with session.get( - url, headers=self.session.headers + url, headers=self.session.headers, verify=self.verify ) as response: return await response.text() except aiohttp.ClientConnectionError as e: @@ -173,7 +179,7 @@ class WebBaseLoader(BaseLoader): self._check_parser(parser) - html_doc = self.session.get(url, **self.requests_kwargs) + html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs) html_doc.encoding = html_doc.apparent_encoding return BeautifulSoup(html_doc.text, parser)