From 3ca1a387c258c1e895900f56637a3d990ba403ec Mon Sep 17 00:00:00 2001 From: Tim Asp <707699+timothyasp@users.noreply.github.com> Date: Tue, 27 Jun 2023 22:27:49 -0700 Subject: [PATCH] Web Loader: Add proxy support (#6792) Proxies are helpful, especially when you start querying against more anti-bot websites. [Proxy services](https://developers.oxylabs.io/advanced-proxy-solutions/web-unblocker/making-requests) (of which there are many) and `requests` make it easy to rotate IPs to prevent banning by just passing along a simple dict to `requests`. CC @rlancemartin, @eyurtsev --- .../integrations/web_base.ipynb | 26 ++++++++++++++++--- langchain/document_loaders/web_base.py | 4 +++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/web_base.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/web_base.ipynb index 56b1edb376..5a91c3cd16 100644 --- a/docs/extras/modules/data_connection/document_loaders/integrations/web_base.ipynb +++ b/docs/extras/modules/data_connection/document_loaders/integrations/web_base.ipynb @@ -224,13 +224,33 @@ "docs" ] }, + { + "cell_type": "markdown", + "source": [ + "## Using proxies\n", + "\n", + "Sometimes you might need to use proxies to get around IP blocks. You can pass in a dictionary of proxies to the loader (and `requests` underneath) to use them." 
+ ], + "metadata": { + "collapsed": false + } + }, { "cell_type": "code", "execution_count": null, - "id": "1dd8ab23", - "metadata": {}, "outputs": [], - "source": [] + "source": [ + "loader = WebBaseLoader(\n", + " \"https://www.walmart.com/search?q=parrots\", proxies={\n", + " \"http\": \"http://{username}:{password}@proxy.service.com:6666/\",\n", + " \"https\": \"https://{username}:{password}@proxy.service.com:6666/\"\n", + " }\n", + ")\n", + "docs = loader.load()\n" + ], + "metadata": { + "collapsed": false + } } ], "metadata": { diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py index 6769640605..1528cd6156 100644 --- a/langchain/document_loaders/web_base.py +++ b/langchain/document_loaders/web_base.py @@ -61,6 +61,7 @@ class WebBaseLoader(BaseLoader): web_path: Union[str, List[str]], header_template: Optional[dict] = None, verify: Optional[bool] = True, + proxies: Optional[dict] = None, ): """Initialize with webpage path.""" @@ -97,6 +98,9 @@ class WebBaseLoader(BaseLoader): ) self.session.headers = dict(headers) + if proxies: + self.session.proxies.update(proxies) + @property def web_path(self) -> str: if len(self.web_paths) > 1: