Fix web_base.py (#6519)

Fix for bug in SitemapLoader

`aiohttp`'s `get` does not accept a `verify` argument and currently throws an
error, so SitemapLoader is not working.

This PR fixes it by removing the `verify` parameter from the `get` function call.

Fixes #6107

#### Who can review?

Tag maintainers/contributors who might be interested:

@eyurtsev

---------

Co-authored-by: techcenary <127699216+techcenary@users.noreply.github.com>
This commit is contained in:
Mykola Zomchak 2023-07-05 19:53:57 -04:00 committed by GitHub
parent f891f7d69f
commit 8afc8e6f5d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -60,7 +60,7 @@ class WebBaseLoader(BaseLoader):
self, self,
web_path: Union[str, List[str]], web_path: Union[str, List[str]],
header_template: Optional[dict] = None, header_template: Optional[dict] = None,
verify: Optional[bool] = True, verify_ssl: Optional[bool] = True,
proxies: Optional[dict] = None, proxies: Optional[dict] = None,
): ):
"""Initialize with webpage path.""" """Initialize with webpage path."""
@ -73,7 +73,6 @@ class WebBaseLoader(BaseLoader):
elif isinstance(web_path, List): elif isinstance(web_path, List):
self.web_paths = web_path self.web_paths = web_path
self.session = requests.Session()
try: try:
import bs4 # noqa:F401 import bs4 # noqa:F401
except ImportError: except ImportError:
@ -81,9 +80,6 @@ class WebBaseLoader(BaseLoader):
"bs4 package not found, please install it with " "`pip install bs4`" "bs4 package not found, please install it with " "`pip install bs4`"
) )
# Choose to verify
self.verify = verify
headers = header_template or default_header_template headers = header_template or default_header_template
if not headers.get("User-Agent"): if not headers.get("User-Agent"):
try: try:
@ -96,7 +92,10 @@ class WebBaseLoader(BaseLoader):
"To get a realistic header for requests, " "To get a realistic header for requests, "
"`pip install fake_useragent`." "`pip install fake_useragent`."
) )
self.session = requests.Session()
self.session.headers = dict(headers) self.session.headers = dict(headers)
self.session.verify = verify_ssl
if proxies: if proxies:
self.session.proxies.update(proxies) self.session.proxies.update(proxies)
@ -110,17 +109,13 @@ class WebBaseLoader(BaseLoader):
async def _fetch( async def _fetch(
self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5 self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
) -> str: ) -> str:
# For SiteMap SSL verification async with aiohttp.ClientSession() as session:
if not self.requests_kwargs.get("verify", True):
connector = aiohttp.TCPConnector(ssl=False)
else:
connector = None
async with aiohttp.ClientSession(connector=connector) as session:
for i in range(retries): for i in range(retries):
try: try:
async with session.get( async with session.get(
url, headers=self.session.headers, verify=self.verify url,
headers=self.session.headers,
ssl=None if self.session.verify else False,
) as response: ) as response:
return await response.text() return await response.text()
except aiohttp.ClientConnectionError as e: except aiohttp.ClientConnectionError as e:
@ -195,7 +190,7 @@ class WebBaseLoader(BaseLoader):
self._check_parser(parser) self._check_parser(parser)
html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs) html_doc = self.session.get(url, **self.requests_kwargs)
if self.raise_for_status: if self.raise_for_status:
html_doc.raise_for_status() html_doc.raise_for_status()
html_doc.encoding = html_doc.apparent_encoding html_doc.encoding = html_doc.apparent_encoding