From fbc97a77ed92c5e444fee0674bb337b3ca193c54 Mon Sep 17 00:00:00 2001 From: Jasper <37783831+jagilley@users.noreply.github.com> Date: Thu, 13 Jul 2023 13:18:28 -0700 Subject: [PATCH] add browserless loader (#7562) # Browserless Added support for Browserless' `/content` endpoint as a document loader. ### About Browserless Browserless is a cloud service that provides access to headless Chrome browsers via a REST API. It allows developers to automate Chromium in a serverless fashion without having to configure and maintain their own Chrome infrastructure. --------- Co-authored-by: Harrison Chase Co-authored-by: Lance Martin --- .../integrations/browserless.ipynb | 81 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/browserless.py | 41 ++++++++++ 3 files changed, 124 insertions(+) create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/browserless.ipynb create mode 100644 langchain/document_loaders/browserless.py diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/browserless.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/browserless.ipynb new file mode 100644 index 0000000000..dc90b998a4 --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/browserless.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Browserless" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import BrowserlessLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BROWSERLESS_API_TOKEN = \"YOUR_API_TOKEN\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "Document classification - Wikipedia\n", + "