add browserless loader (#7562)

# Browserless

Added support for Browserless' `/content` endpoint as a document loader.

### About Browserless

Browserless is a cloud service that provides access to headless Chrome
browsers via a REST API. It allows developers to automate Chromium in a
serverless fashion without having to configure and maintain their own
Chrome infrastructure.

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Lance Martin <lance@langchain.dev>
pull/7676/head
Jasper 1 year ago committed by GitHub
parent 120c52589b
commit fbc97a77ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,81 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Browserless"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import BrowserlessLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"BROWSERLESS_API_TOKEN = \"YOUR_API_TOKEN\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<!DOCTYPE html><html class=\"client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled\" lang=\"en\" dir=\"ltr\"><head>\n",
"<meta charset=\"UTF-8\">\n",
"<title>Document classification - Wikipedia</title>\n",
"<script>document.documentElement.className=\"client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled\";(function(){var cookie=document.cookie.match(/(?:^|; )enwikimwclien\n"
]
}
],
"source": [
"loader = BrowserlessLoader(\n",
" api_token=BROWSERLESS_API_TOKEN,\n",
" urls=[\n",
" \"https://en.wikipedia.org/wiki/Document_classification\",\n",
" ],\n",
")\n",
"\n",
"documents = loader.load()\n",
"\n",
"print(documents[0].page_content[:1000])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -24,6 +24,7 @@ from langchain.document_loaders.blob_loaders import (
)
from langchain.document_loaders.blockchain import BlockchainDocumentLoader
from langchain.document_loaders.brave_search import BraveSearchLoader
from langchain.document_loaders.browserless import BrowserlessLoader
from langchain.document_loaders.chatgpt import ChatGPTLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.confluence import ConfluenceLoader
@ -174,6 +175,7 @@ __all__ = [
"BlobLoader",
"BlockchainDocumentLoader",
"BraveSearchLoader",
"BrowserlessLoader",
"CSVLoader",
"ChatGPTLoader",
"CoNLLULoader",

@ -0,0 +1,41 @@
from typing import Iterator, List, Union
import requests
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class BrowserlessLoader(BaseLoader):
    """Loads the content of webpages using Browserless' /content endpoint.

    Every URL is fetched with its own POST request to
    ``https://chrome.browserless.io/content``. The text of each response
    becomes the ``page_content`` of one ``Document`` whose ``source``
    metadata entry is the requested URL.
    """

    def __init__(self, api_token: str, urls: Union[str, List[str]]):
        """Initialize with API token and the URLs to scrape.

        Args:
            api_token: Browserless API token; sent as the ``token`` query
                parameter of every request.
            urls: Either a single URL string or a list of URL strings.
        """
        self.api_token = api_token
        """Browserless API token."""
        # A bare ``str`` is itself iterable, so without this normalisation
        # ``lazy_load`` would walk a single URL character by character and
        # issue one request per character. Wrap it in a one-element list.
        self.urls: List[str] = [urls] if isinstance(urls, str) else urls
        """List of URLs to scrape."""

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load Documents from URLs.

        Yields:
            One ``Document`` per URL, in the order the URLs were given. The
            request for a URL is only made when its document is consumed.
        """
        for url in self.urls:
            response = requests.post(
                "https://chrome.browserless.io/content",
                params={
                    "token": self.api_token,
                },
                json={
                    "url": url,
                },
            )
            # The response body is used as-is; the HTTP status is not checked
            # here, so an error response also ends up as document content.
            yield Document(
                page_content=response.text,
                metadata={
                    "source": url,
                },
            )

    def load(self) -> List[Document]:
        """Load Documents from URLs.

        Returns:
            All documents produced by ``lazy_load``, collected into a list.
        """
        return list(self.lazy_load())
Loading…
Cancel
Save