From bd9e0f393499553b1ce694f5a45cbb9e5698ba27 Mon Sep 17 00:00:00 2001 From: Timothy Ji Date: Thu, 1 Jun 2023 06:27:38 +0800 Subject: [PATCH] Add param requests_kwargs for WebBaseLoader (#5485) # Add param `requests_kwargs` for WebBaseLoader Fixes #5483 ## Who can review? @eyurtsev --- .../document_loaders/examples/sitemap.ipynb | 21 ++++++++++++++++++- langchain/document_loaders/web_base.py | 7 +++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/docs/modules/indexes/document_loaders/examples/sitemap.ipynb b/docs/modules/indexes/document_loaders/examples/sitemap.ipynb index 97a3b7af..f27dad30 100644 --- a/docs/modules/indexes/document_loaders/examples/sitemap.ipynb +++ b/docs/modules/indexes/document_loaders/examples/sitemap.ipynb @@ -8,7 +8,7 @@ "\n", "Extends from the `WebBaseLoader`, `SitemapLoader` loads a sitemap from a given URL, and then scrape and load all pages in the sitemap, returning each page as a Document.\n", "\n", - "The scraping is done concurrently. There are reasonable limits to concurrent requests, defaulting to 2 per second. If you aren't concerned about being a good citizen, or you control the scrapped server, or don't care about load, you can change the `requests_per_second` parameter to increase the max concurrent requests. Note, while this will speed up the scraping process, but it may cause the server to block you. Be careful!" + "The scraping is done concurrently. There are reasonable limits to concurrent requests, defaulting to 2 per second. If you aren't concerned about being a good citizen, or you control the server you are scraping, or don't care about load, you can increase this limit. Note that while this will speed up the scraping process, it may cause the server to block you. Be careful!" ] }, { @@ -63,6 +63,25 @@ "docs = sitemap_loader.load()" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can change the `requests_per_second` parameter to increase the max concurrent requests. 
and use `requests_kwargs` to pass kwargs when sending requests." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sitemap_loader.requests_per_second = 2\n", "# Optional: avoid `[SSL: CERTIFICATE_VERIFY_FAILED]` issue\n", "sitemap_loader.requests_kwargs = {\"verify\": False}" ] }, { "cell_type": "code", "execution_count": 4, diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py index 4c7c6cc0..cee218da 100644 --- a/langchain/document_loaders/web_base.py +++ b/langchain/document_loaders/web_base.py @@ -2,7 +2,7 @@ import asyncio import logging import warnings -from typing import Any, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import aiohttp import requests @@ -47,6 +47,9 @@ class WebBaseLoader(BaseLoader): default_parser: str = "html.parser" """Default parser to use for BeautifulSoup.""" + requests_kwargs: Dict[str, Any] = {} + """Keyword arguments to pass to requests when fetching pages.""" + def __init__( self, web_path: Union[str, List[str]], header_template: Optional[dict] = None ): @@ -170,7 +173,7 @@ class WebBaseLoader(BaseLoader): self._check_parser(parser) - html_doc = self.session.get(url) + html_doc = self.session.get(url, **self.requests_kwargs) html_doc.encoding = html_doc.apparent_encoding return BeautifulSoup(html_doc.text, parser)