mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Fix web_base.py (#6519)
Fix for bug in SitemapLoader `aiohttp` `get` does not accept `verify` argument, and currently throws error, so SitemapLoader is not working This PR fixes it by removing `verify` param for `get` function call Fixes #6107 #### Who can review? Tag maintainers/contributors who might be interested: @eyurtsev --------- Co-authored-by: techcenary <127699216+techcenary@users.noreply.github.com>
This commit is contained in:
parent
f891f7d69f
commit
8afc8e6f5d
@ -60,7 +60,7 @@ class WebBaseLoader(BaseLoader):
|
||||
self,
|
||||
web_path: Union[str, List[str]],
|
||||
header_template: Optional[dict] = None,
|
||||
verify: Optional[bool] = True,
|
||||
verify_ssl: Optional[bool] = True,
|
||||
proxies: Optional[dict] = None,
|
||||
):
|
||||
"""Initialize with webpage path."""
|
||||
@ -73,7 +73,6 @@ class WebBaseLoader(BaseLoader):
|
||||
elif isinstance(web_path, List):
|
||||
self.web_paths = web_path
|
||||
|
||||
self.session = requests.Session()
|
||||
try:
|
||||
import bs4 # noqa:F401
|
||||
except ImportError:
|
||||
@ -81,9 +80,6 @@ class WebBaseLoader(BaseLoader):
|
||||
"bs4 package not found, please install it with " "`pip install bs4`"
|
||||
)
|
||||
|
||||
# Choose to verify
|
||||
self.verify = verify
|
||||
|
||||
headers = header_template or default_header_template
|
||||
if not headers.get("User-Agent"):
|
||||
try:
|
||||
@ -96,7 +92,10 @@ class WebBaseLoader(BaseLoader):
|
||||
"To get a realistic header for requests, "
|
||||
"`pip install fake_useragent`."
|
||||
)
|
||||
|
||||
self.session = requests.Session()
|
||||
self.session.headers = dict(headers)
|
||||
self.session.verify = verify_ssl
|
||||
|
||||
if proxies:
|
||||
self.session.proxies.update(proxies)
|
||||
@ -110,17 +109,13 @@ class WebBaseLoader(BaseLoader):
|
||||
async def _fetch(
|
||||
self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
|
||||
) -> str:
|
||||
# For SiteMap SSL verification
|
||||
if not self.requests_kwargs.get("verify", True):
|
||||
connector = aiohttp.TCPConnector(ssl=False)
|
||||
else:
|
||||
connector = None
|
||||
|
||||
async with aiohttp.ClientSession(connector=connector) as session:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
for i in range(retries):
|
||||
try:
|
||||
async with session.get(
|
||||
url, headers=self.session.headers, verify=self.verify
|
||||
url,
|
||||
headers=self.session.headers,
|
||||
ssl=None if self.session.verify else False,
|
||||
) as response:
|
||||
return await response.text()
|
||||
except aiohttp.ClientConnectionError as e:
|
||||
@ -195,7 +190,7 @@ class WebBaseLoader(BaseLoader):
|
||||
|
||||
self._check_parser(parser)
|
||||
|
||||
html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs)
|
||||
html_doc = self.session.get(url, **self.requests_kwargs)
|
||||
if self.raise_for_status:
|
||||
html_doc.raise_for_status()
|
||||
html_doc.encoding = html_doc.apparent_encoding
|
||||
|
Loading…
Reference in New Issue
Block a user