update web_base.py to have verify option (#6107)

This change adds a `verify` option to the `WebBaseLoader` initializer
(`__init__`). It addresses SSL verification errors encountered on
certain web pages: by setting `verify` to False, users can disable SSL
certificate verification for those requests, which gives them greater
flexibility and control.
<!--
Thank you for contributing to LangChain! Your PR will appear in our
release under the title you set. Please make sure it highlights your
valuable contribution.

Replace this with a description of the change, the issue it fixes (if
applicable), and relevant context. List any dependencies required for
this change.

After you're done, someone will review your PR. They may suggest
improvements. If no one reviews your PR within a few days, feel free to
@-mention the same people again, as notifications can get lost.

Finally, we'd love to show appreciation for your contribution - if you'd
like us to shout you out on Twitter, please also include your handle!
-->

### Fixes #6079 

#### Who can review?
@eyurtsev @hwchase17

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
Dhruvil Shah 2023-06-17 11:10:48 -07:00 committed by GitHub
parent e194dc5306
commit 2eec687474
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -51,7 +51,10 @@ class WebBaseLoader(BaseLoader):
"""kwargs for requests""" """kwargs for requests"""
def __init__( def __init__(
self, web_path: Union[str, List[str]], header_template: Optional[dict] = None self,
web_path: Union[str, List[str]],
header_template: Optional[dict] = None,
verify: Optional[bool] = True,
): ):
"""Initialize with webpage path.""" """Initialize with webpage path."""
@@ -71,6 +74,9 @@ class WebBaseLoader(BaseLoader):
"bs4 package not found, please install it with " "`pip install bs4`" "bs4 package not found, please install it with " "`pip install bs4`"
) )
# Choose to verify
self.verify = verify
headers = header_template or default_header_template headers = header_template or default_header_template
if not headers.get("User-Agent"): if not headers.get("User-Agent"):
try: try:
@@ -98,7 +104,7 @@ class WebBaseLoader(BaseLoader):
for i in range(retries): for i in range(retries):
try: try:
async with session.get( async with session.get(
url, headers=self.session.headers url, headers=self.session.headers, verify=self.verify
) as response: ) as response:
return await response.text() return await response.text()
except aiohttp.ClientConnectionError as e: except aiohttp.ClientConnectionError as e:
@@ -173,7 +179,7 @@ class WebBaseLoader(BaseLoader):
self._check_parser(parser) self._check_parser(parser)
html_doc = self.session.get(url, **self.requests_kwargs) html_doc = self.session.get(url, verify=self.verify, **self.requests_kwargs)
html_doc.encoding = html_doc.apparent_encoding html_doc.encoding = html_doc.apparent_encoding
return BeautifulSoup(html_doc.text, parser) return BeautifulSoup(html_doc.text, parser)