From 8afc8e6f5d52a17455bbdfb3d9829064e098f5b0 Mon Sep 17 00:00:00 2001
From: Mykola Zomchak <127699216+zomchak-code@users.noreply.github.com>
Date: Wed, 5 Jul 2023 19:53:57 -0400
Subject: [PATCH] Fix web_base.py (#6519)

Fix for a bug in SitemapLoader.

`aiohttp` `get` does not accept a `verify` argument and currently throws an
error, so SitemapLoader is not working. This PR fixes it by removing the
`verify` param from the `get` function call.

Fixes #6107

#### Who can review?

Tag maintainers/contributors who might be interested:
@eyurtsev

---------

Co-authored-by: techcenary <127699216+techcenary@users.noreply.github.com>
---
 langchain/document_loaders/web_base.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py
index 1528cd6156..1d4f0aee23 100644
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@@ -60,7 +60,7 @@ class WebBaseLoader(BaseLoader):
         self,
         web_path: Union[str, List[str]],
         header_template: Optional[dict] = None,
-        verify: Optional[bool] = True,
+        verify_ssl: Optional[bool] = True,
         proxies: Optional[dict] = None,
     ):
         """Initialize with webpage path."""
@@ -73,7 +73,6 @@ class WebBaseLoader(BaseLoader):
         elif isinstance(web_path, List):
             self.web_paths = web_path
 
-        self.session = requests.Session()
         try:
             import bs4  # noqa:F401
         except ImportError:
@@ -81,9 +80,6 @@ class WebBaseLoader(BaseLoader):
                 "bs4 package not found, please install it with "
                 "`pip install bs4`"
             )
-        # Choose to verify
-        self.verify = verify
-
         headers = header_template or default_header_template
         if not headers.get("User-Agent"):
             try:
@@ -96,7 +92,10 @@ class WebBaseLoader(BaseLoader):
                     "To get a realistic header for requests, "
                     "`pip install fake_useragent`."
                 )
+
+        self.session = requests.Session()
         self.session.headers = dict(headers)
+        self.session.verify = verify_ssl
         if proxies:
             self.session.proxies.update(proxies)
 
@@ -110,17 +109,13 @@ class WebBaseLoader(BaseLoader):
     async def _fetch(
         self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
     ) -> str:
-        # For SiteMap SSL verification
-        if not self.requests_kwargs.get("verify", True):
-            connector = aiohttp.TCPConnector(ssl=False)
-        else:
-            connector = None
-
-        async with aiohttp.ClientSession(connector=connector) as session:
+        async with aiohttp.ClientSession() as session:
             for i in range(retries):
                 try:
                     async with session.get(
-                        url, headers=self.session.headers, verify=self.verify
+                        url,
+                        headers=self.session.headers,
+                        ssl=None if self.session.verify else False,
                     ) as response:
                         return await response.text()
                 except aiohttp.ClientConnectionError as e:
@@ -195,7 +190,7 @@ class WebBaseLoader(BaseLoader):
 
         self._check_parser(parser)
 
-        html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs)
+        html_doc = self.session.get(url, **self.requests_kwargs)
         if self.raise_for_status:
             html_doc.raise_for_status()
         html_doc.encoding = html_doc.apparent_encoding
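
For reviewers, a minimal usage sketch (not part of the patch) of how the flag behaves after this change: the URLs below are placeholders, and the snippet assumes langchain with this patch installed. The synchronous path keeps SSL verification on the `requests` session, while the async path used by SitemapLoader now maps the flag to aiohttp's `ssl=` keyword instead of the unsupported `verify=`.

```python
# Illustrative sketch only -- not part of the patch. URLs are placeholders.
from langchain.document_loaders import WebBaseLoader
from langchain.document_loaders.sitemap import SitemapLoader

# The constructor keyword is now `verify_ssl`; it is stored on the underlying
# requests session, so the synchronous scrape path honors it automatically.
loader = WebBaseLoader("https://example.com", verify_ssl=False)
docs = loader.load()

# SitemapLoader goes through the async `_fetch` path, which previously passed
# the unsupported `verify=` kwarg to aiohttp's `session.get` and raised an
# unexpected-keyword-argument error. After this patch it passes `ssl=None`
# (default verification) or `ssl=False` (verification disabled) instead, so
# loading a sitemap works again.
sitemap_docs = SitemapLoader("https://example.com/sitemap.xml").load()
```

Storing the flag on `self.session.verify` rather than a separate attribute means the sync and async code paths read the same source of truth.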