Mirror of https://github.com/hwchase17/langchain, synced 2024-11-08 07:10:35 +00:00
add browserless loader (#7562)
# Browserless

Added support for Browserless' `/content` endpoint as a document loader.

### About Browserless

Browserless is a cloud service that provides access to headless Chrome browsers via a REST API. It allows developers to automate Chromium in a serverless fashion without having to configure and maintain their own Chrome infrastructure.

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Lance Martin <lance@langchain.dev>
This commit is contained in:
parent 120c52589b
commit fbc97a77ed
@@ -0,0 +1,81 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Browserless"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import BrowserlessLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BROWSERLESS_API_TOKEN = \"YOUR_API_TOKEN\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<!DOCTYPE html><html class=\"client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled\" lang=\"en\" dir=\"ltr\"><head>\n",
      "<meta charset=\"UTF-8\">\n",
      "<title>Document classification - Wikipedia</title>\n",
      "<script>document.documentElement.className=\"client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled\";(function(){var cookie=document.cookie.match(/(?:^|; )enwikimwclien\n"
     ]
    }
   ],
   "source": [
    "loader = BrowserlessLoader(\n",
    "    api_token=BROWSERLESS_API_TOKEN,\n",
    "    urls=[\n",
    "        \"https://en.wikipedia.org/wiki/Document_classification\",\n",
    "    ],\n",
    ")\n",
    "\n",
    "documents = loader.load()\n",
    "\n",
    "print(documents[0].page_content[:1000])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
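Read as plain Python rather than notebook JSON, the example above boils down to the following sketch. The token value is the same placeholder used in the notebook; a real Browserless API token is required.

```python
# Plain-Python rendering of the notebook cells above.
from langchain.document_loaders import BrowserlessLoader

BROWSERLESS_API_TOKEN = "YOUR_API_TOKEN"  # placeholder, not a real token

loader = BrowserlessLoader(
    api_token=BROWSERLESS_API_TOKEN,
    urls=[
        "https://en.wikipedia.org/wiki/Document_classification",
    ],
)

documents = loader.load()

# Prints the first 1000 characters of the rendered page HTML.
print(documents[0].page_content[:1000])
```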
@@ -24,6 +24,7 @@ from langchain.document_loaders.blob_loaders import (
 )
 from langchain.document_loaders.blockchain import BlockchainDocumentLoader
 from langchain.document_loaders.brave_search import BraveSearchLoader
+from langchain.document_loaders.browserless import BrowserlessLoader
 from langchain.document_loaders.chatgpt import ChatGPTLoader
 from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
 from langchain.document_loaders.confluence import ConfluenceLoader
@@ -174,6 +175,7 @@ __all__ = [
     "BlobLoader",
     "BlockchainDocumentLoader",
     "BraveSearchLoader",
+    "BrowserlessLoader",
     "CSVLoader",
     "ChatGPTLoader",
     "CoNLLULoader",
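These two one-line additions register the loader at the package level, which is what makes the top-level import used in the notebook resolve:

```python
# Resolves because of the package __init__ additions above.
from langchain.document_loaders import BrowserlessLoader
```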
langchain/document_loaders/browserless.py (new file, 41 lines)
@@ -0,0 +1,41 @@
from typing import Iterator, List, Union

import requests

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class BrowserlessLoader(BaseLoader):
    """Loads the content of webpages using Browserless' /content endpoint"""

    def __init__(self, api_token: str, urls: Union[str, List[str]]):
        """Initialize with API token and the URLs to scrape"""
        self.api_token = api_token
        """Browserless API token."""
        self.urls = urls
        """List of URLs to scrape."""

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load Documents from URLs."""

        for url in self.urls:
            response = requests.post(
                "https://chrome.browserless.io/content",
                params={
                    "token": self.api_token,
                },
                json={
                    "url": url,
                },
            )
            yield Document(
                page_content=response.text,
                metadata={
                    "source": url,
                },
            )

    def load(self) -> List[Document]:
        """Load Documents from URLs."""
        return list(self.lazy_load())
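Since `lazy_load` yields one `Document` per URL as each response comes back, callers can also stream results instead of materializing the whole list with `load()`. A minimal sketch, assuming the same placeholder token; the second URL is purely illustrative and not part of this PR:

```python
from langchain.document_loaders.browserless import BrowserlessLoader

loader = BrowserlessLoader(
    api_token="YOUR_API_TOKEN",  # placeholder token
    urls=[
        "https://en.wikipedia.org/wiki/Document_classification",
        "https://en.wikipedia.org/wiki/Information_retrieval",  # illustrative extra URL
    ],
)

# lazy_load() issues one POST to Browserless' /content endpoint per URL
# and yields each Document as its response arrives.
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))
```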