Add param requests_kwargs for WebBaseLoader (#5485)

# Add param `requests_kwargs` for WebBaseLoader

Fixes #5483

## Who can review?

@eyurtsev
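
In short, `requests_kwargs` forwards extra keyword arguments to `requests.Session.get` when pages are fetched. A minimal usage sketch (the URL and kwarg values below are illustrative, not part of this PR):

```python
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://example.com")  # placeholder URL
# Any keyword arguments accepted by requests.Session.get can go here;
# `verify` and `timeout` are standard requests options.
loader.requests_kwargs = {"verify": False, "timeout": 10}
docs = loader.load()
```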

@@ -8,7 +8,7 @@
     "\n",
     "Extending `WebBaseLoader`, `SitemapLoader` loads a sitemap from a given URL, and then scrapes and loads all pages in the sitemap, returning each page as a Document.\n",
     "\n",
-    "The scraping is done concurrently. There are reasonable limits to concurrent requests, defaulting to 2 per second. If you aren't concerned about being a good citizen, or you control the scrapped server, or don't care about load, you can change the `requests_per_second` parameter to increase the max concurrent requests. Note, while this will speed up the scraping process, but it may cause the server to block you. Be careful!"
+    "The scraping is done concurrently. There are reasonable limits to concurrent requests, defaulting to 2 per second. If you aren't concerned about being a good citizen, you control the server you are scraping, or you don't care about load, you can raise this limit as shown below. Note that while this will speed up the scraping process, it may cause the server to block you. Be careful!"
    ]
   },
   {
@@ -63,6 +63,25 @@
     "docs = sitemap_loader.load()"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can change the `requests_per_second` parameter to increase the max concurrent requests, and use `requests_kwargs` to pass kwargs when sending requests."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sitemap_loader.requests_per_second = 2\n",
+    "# Optional: avoid `[SSL: CERTIFICATE_VERIFY_FAILED]` issue\n",
+    "sitemap_loader.requests_kwargs = {\"verify\": False}"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 4,

@@ -2,7 +2,7 @@
 import asyncio
 import logging
 import warnings
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 import aiohttp
 import requests
@@ -47,6 +47,9 @@ class WebBaseLoader(BaseLoader):
     default_parser: str = "html.parser"
     """Default parser to use for BeautifulSoup."""

+    requests_kwargs: Dict[str, Any] = {}
+    """Keyword arguments to pass to requests when fetching pages (e.g. verify, proxies)."""
+
     def __init__(
         self, web_path: Union[str, List[str]], header_template: Optional[dict] = None
     ):
@@ -170,7 +173,7 @@
         self._check_parser(parser)

-        html_doc = self.session.get(url)
+        html_doc = self.session.get(url, **self.requests_kwargs)
         html_doc.encoding = html_doc.apparent_encoding
         return BeautifulSoup(html_doc.text, parser)
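
For context, the patched call is the standard `requests` idiom of unpacking a kwargs dict into `Session.get`. A self-contained sketch of the same mechanics, independent of LangChain (values illustrative):

```python
import requests

session = requests.Session()
requests_kwargs = {"verify": False, "timeout": 10}

# Mirrors the patched line: self.session.get(url, **self.requests_kwargs)
resp = session.get("https://example.com", **requests_kwargs)
resp.encoding = resp.apparent_encoding  # same encoding fix WebBaseLoader applies
print(resp.status_code)
```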
