mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Fix web_base.py (#6519)
Fix for a bug in SitemapLoader: the `aiohttp` `get` call does not accept a `verify` argument and currently throws an error, so SitemapLoader is not working. This PR fixes it by removing the `verify` param from the `get` function call (SSL verification is passed through aiohttp's `ssl` argument instead). Fixes #6107. #### Who can review? Tag maintainers/contributors who might be interested: @eyurtsev --------- Co-authored-by: techcenary <127699216+techcenary@users.noreply.github.com>
This commit is contained in:
parent
f891f7d69f
commit
8afc8e6f5d
@ -60,7 +60,7 @@ class WebBaseLoader(BaseLoader):
|
|||||||
self,
|
self,
|
||||||
web_path: Union[str, List[str]],
|
web_path: Union[str, List[str]],
|
||||||
header_template: Optional[dict] = None,
|
header_template: Optional[dict] = None,
|
||||||
verify: Optional[bool] = True,
|
verify_ssl: Optional[bool] = True,
|
||||||
proxies: Optional[dict] = None,
|
proxies: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with webpage path."""
|
"""Initialize with webpage path."""
|
||||||
@ -73,7 +73,6 @@ class WebBaseLoader(BaseLoader):
|
|||||||
elif isinstance(web_path, List):
|
elif isinstance(web_path, List):
|
||||||
self.web_paths = web_path
|
self.web_paths = web_path
|
||||||
|
|
||||||
self.session = requests.Session()
|
|
||||||
try:
|
try:
|
||||||
import bs4 # noqa:F401
|
import bs4 # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -81,9 +80,6 @@ class WebBaseLoader(BaseLoader):
|
|||||||
"bs4 package not found, please install it with " "`pip install bs4`"
|
"bs4 package not found, please install it with " "`pip install bs4`"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Choose to verify
|
|
||||||
self.verify = verify
|
|
||||||
|
|
||||||
headers = header_template or default_header_template
|
headers = header_template or default_header_template
|
||||||
if not headers.get("User-Agent"):
|
if not headers.get("User-Agent"):
|
||||||
try:
|
try:
|
||||||
@ -96,7 +92,10 @@ class WebBaseLoader(BaseLoader):
|
|||||||
"To get a realistic header for requests, "
|
"To get a realistic header for requests, "
|
||||||
"`pip install fake_useragent`."
|
"`pip install fake_useragent`."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.session = requests.Session()
|
||||||
self.session.headers = dict(headers)
|
self.session.headers = dict(headers)
|
||||||
|
self.session.verify = verify_ssl
|
||||||
|
|
||||||
if proxies:
|
if proxies:
|
||||||
self.session.proxies.update(proxies)
|
self.session.proxies.update(proxies)
|
||||||
@ -110,17 +109,13 @@ class WebBaseLoader(BaseLoader):
|
|||||||
async def _fetch(
|
async def _fetch(
|
||||||
self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
|
self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
|
||||||
) -> str:
|
) -> str:
|
||||||
# For SiteMap SSL verification
|
async with aiohttp.ClientSession() as session:
|
||||||
if not self.requests_kwargs.get("verify", True):
|
|
||||||
connector = aiohttp.TCPConnector(ssl=False)
|
|
||||||
else:
|
|
||||||
connector = None
|
|
||||||
|
|
||||||
async with aiohttp.ClientSession(connector=connector) as session:
|
|
||||||
for i in range(retries):
|
for i in range(retries):
|
||||||
try:
|
try:
|
||||||
async with session.get(
|
async with session.get(
|
||||||
url, headers=self.session.headers, verify=self.verify
|
url,
|
||||||
|
headers=self.session.headers,
|
||||||
|
ssl=None if self.session.verify else False,
|
||||||
) as response:
|
) as response:
|
||||||
return await response.text()
|
return await response.text()
|
||||||
except aiohttp.ClientConnectionError as e:
|
except aiohttp.ClientConnectionError as e:
|
||||||
@ -195,7 +190,7 @@ class WebBaseLoader(BaseLoader):
|
|||||||
|
|
||||||
self._check_parser(parser)
|
self._check_parser(parser)
|
||||||
|
|
||||||
html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs)
|
html_doc = self.session.get(url, **self.requests_kwargs)
|
||||||
if self.raise_for_status:
|
if self.raise_for_status:
|
||||||
html_doc.raise_for_status()
|
html_doc.raise_for_status()
|
||||||
html_doc.encoding = html_doc.apparent_encoding
|
html_doc.encoding = html_doc.apparent_encoding
|
||||||
|
Loading…
Reference in New Issue
Block a user