mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
Web Loader: Add proxy support (#6792)
Proxies are helpful, especially when you start querying websites with stricter anti-bot protections. [Proxy services](https://developers.oxylabs.io/advanced-proxy-solutions/web-unblocker/making-requests) (of which there are many) and `requests` make it easy to rotate IPs and avoid being banned: you just pass a simple dict of proxies through to `requests`. CC @rlancemartin, @eyurtsev
This commit is contained in:
parent
f92ccf70fd
commit
3ca1a387c2
@ -224,13 +224,33 @@
|
|||||||
"docs"
|
"docs"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Using proxies\n",
|
||||||
|
"\n",
|
||||||
|
"Sometimes you might need to use proxies to get around IP blocks. You can pass in a dictionary of proxies to the loader (and `requests` underneath) to use them."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "1dd8ab23",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": [
|
||||||
|
"loader = WebBaseLoader(\n",
|
||||||
|
" \"https://www.walmart.com/search?q=parrots\", proxies={\n",
|
||||||
|
" \"http\": \"http://{username}:{password}@proxy.service.com:6666/\",\n",
|
||||||
|
" \"https\": \"https://{username}:{password}@proxy.service.com:6666/\"\n",
|
||||||
|
" }\n",
|
||||||
|
")\n",
|
||||||
|
"docs = loader.load()\n"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -61,6 +61,7 @@ class WebBaseLoader(BaseLoader):
|
|||||||
web_path: Union[str, List[str]],
|
web_path: Union[str, List[str]],
|
||||||
header_template: Optional[dict] = None,
|
header_template: Optional[dict] = None,
|
||||||
verify: Optional[bool] = True,
|
verify: Optional[bool] = True,
|
||||||
|
proxies: Optional[dict] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with webpage path."""
|
"""Initialize with webpage path."""
|
||||||
|
|
||||||
@ -97,6 +98,9 @@ class WebBaseLoader(BaseLoader):
|
|||||||
)
|
)
|
||||||
self.session.headers = dict(headers)
|
self.session.headers = dict(headers)
|
||||||
|
|
||||||
|
if proxies:
|
||||||
|
self.session.proxies.update(proxies)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def web_path(self) -> str:
|
def web_path(self) -> str:
|
||||||
if len(self.web_paths) > 1:
|
if len(self.web_paths) > 1:
|
||||||
|
Loading…
Reference in New Issue
Block a user