Fix web_base.py (#6519)

Fix for bug in SitemapLoader

`aiohttp` `get` does not accept `verify` argument, and currently throws
error, so SitemapLoader is not working

This PR fixes it by removing `verify` param for `get` function call

Fixes #6107

#### Who can review?

Tag maintainers/contributors who might be interested:

@eyurtsev

---------

Co-authored-by: techcenary <127699216+techcenary@users.noreply.github.com>
This commit is contained in:
Mykola Zomchak 2023-07-05 19:53:57 -04:00 committed by GitHub
parent f891f7d69f
commit 8afc8e6f5d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -60,7 +60,7 @@ class WebBaseLoader(BaseLoader):
self,
web_path: Union[str, List[str]],
header_template: Optional[dict] = None,
verify: Optional[bool] = True,
verify_ssl: Optional[bool] = True,
proxies: Optional[dict] = None,
):
"""Initialize with webpage path."""
@ -73,7 +73,6 @@ class WebBaseLoader(BaseLoader):
elif isinstance(web_path, List):
self.web_paths = web_path
self.session = requests.Session()
try:
import bs4 # noqa:F401
except ImportError:
@ -81,9 +80,6 @@ class WebBaseLoader(BaseLoader):
"bs4 package not found, please install it with " "`pip install bs4`"
)
# Choose to verify
self.verify = verify
headers = header_template or default_header_template
if not headers.get("User-Agent"):
try:
@ -96,7 +92,10 @@ class WebBaseLoader(BaseLoader):
"To get a realistic header for requests, "
"`pip install fake_useragent`."
)
self.session = requests.Session()
self.session.headers = dict(headers)
self.session.verify = verify_ssl
if proxies:
self.session.proxies.update(proxies)
@ -110,17 +109,13 @@ class WebBaseLoader(BaseLoader):
async def _fetch(
self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
) -> str:
# For SiteMap SSL verification
if not self.requests_kwargs.get("verify", True):
connector = aiohttp.TCPConnector(ssl=False)
else:
connector = None
async with aiohttp.ClientSession(connector=connector) as session:
async with aiohttp.ClientSession() as session:
for i in range(retries):
try:
async with session.get(
url, headers=self.session.headers, verify=self.verify
url,
headers=self.session.headers,
ssl=None if self.session.verify else False,
) as response:
return await response.text()
except aiohttp.ClientConnectionError as e:
@ -195,7 +190,7 @@ class WebBaseLoader(BaseLoader):
self._check_parser(parser)
html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs)
html_doc = self.session.get(url, **self.requests_kwargs)
if self.raise_for_status:
html_doc.raise_for_status()
html_doc.encoding = html_doc.apparent_encoding