mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
48 lines
1.3 KiB
Python
48 lines
1.3 KiB
Python
|
from typing import Iterator, List, Optional, Tuple, Union
|
||
|
|
||
|
from langchain_core.documents import Document
|
||
|
|
||
|
from langchain_community.document_loaders.base import BaseLoader
|
||
|
|
||
|
|
||
|
class BrowserbaseLoader(BaseLoader):
|
||
|
"""Load pre-rendered web pages using a headless browser hosted on Browserbase.
|
||
|
|
||
|
Depends on `browserbase` package.
|
||
|
Get your API key from https://browserbase.com
|
||
|
"""
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
urls: Union[List[str], Tuple[str, ...]],
|
||
|
*,
|
||
|
api_key: Optional[str] = None,
|
||
|
text_content: bool = False,
|
||
|
):
|
||
|
self.urls = urls
|
||
|
self.text_content = text_content
|
||
|
|
||
|
try:
|
||
|
from browserbase import Browserbase
|
||
|
except ImportError:
|
||
|
raise ImportError(
|
||
|
"You must run "
|
||
|
"`pip install --upgrade "
|
||
|
"browserbase` "
|
||
|
"to use the Browserbase loader."
|
||
|
)
|
||
|
|
||
|
self.browserbase = Browserbase(api_key=api_key)
|
||
|
|
||
|
def lazy_load(self) -> Iterator[Document]:
|
||
|
"""Load pages from URLs"""
|
||
|
pages = self.browserbase.load_urls(self.urls, self.text_content)
|
||
|
|
||
|
for i, page in enumerate(pages):
|
||
|
yield Document(
|
||
|
page_content=page,
|
||
|
metadata={
|
||
|
"url": self.urls[i],
|
||
|
},
|
||
|
)
|