community[minor]: Add Scrapfly Loader community integration (#22036)

Added the [Scrapfly](https://scrapfly.io/) Web Loader integration. Scrapfly
is a web scraping API that extracts web page data as LLM-accessible
markdown or text.

- __Description__: Added a Scrapfly web loader for retrieving web page
data as markdown or text (a minimal usage sketch follows below).
- __Dependencies__: scrapfly-sdk
- __Twitter handle__: @thealchemi1st
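For quick reference, a minimal usage sketch of the new loader, mirroring the notebook added in this PR (requires a valid Scrapfly API key):

```python
from langchain_community.document_loaders import ScrapflyLoader

loader = ScrapflyLoader(
    ["https://web-scraping.dev/products"],
    api_key="Your ScrapFly API key",  # or set the SCRAPFLY_API_KEY env var
    continue_on_failure=True,         # log failures instead of raising
)
documents = loader.load()  # returns Documents with markdown page_content
```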

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>

@ -0,0 +1,107 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ScrapFly\n",
"[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It allows for extracting web page data into accessible LLM markdown or text."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Installation\n",
"Install ScrapFly Python SDK and he required Langchain packages using pip:\n",
"```shell\n",
"pip install scrapfly-sdk langchain langchain-community\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Usage"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import ScrapflyLoader\n",
"\n",
"scrapfly_loader = ScrapflyLoader(\n",
" [\"https://web-scraping.dev/products\"],\n",
" api_key=\"Your ScrapFly API key\", # Get your API key from https://www.scrapfly.io/\n",
" ignore_scrape_failures=True, # Ignore unprocessable web pages and log their exceptions\n",
")\n",
"\n",
"# Load documents from URLs as markdown\n",
"documents = scrapfly_loader.load()\n",
"print(documents)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The ScrapflyLoader also allows passigng ScrapeConfig object for customizing the scrape request. See the documentation for the full feature details and their API params: https://scrapfly.io/docs/scrape-api/getting-started"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import ScrapflyLoader\n",
"\n",
"scrapfly_scrape_config = {\n",
" \"asp\": True, # Bypass scraping blocking and antibot solutions, like Cloudflare\n",
" \"render_js\": True, # Enable JavaScript rendering with a cloud headless browser\n",
" \"proxy_pool\": \"public_residential_pool\", # Select a proxy pool (datacenter or residnetial)\n",
" \"country\": \"us\", # Select a proxy location\n",
" \"auto_scroll\": True, # Auto scroll the page\n",
" \"js\": \"\", # Execute custom JavaScript code by the headless browser\n",
"}\n",
"\n",
"scrapfly_loader = ScrapflyLoader(\n",
" [\"https://web-scraping.dev/products\"],\n",
" api_key=\"Your ScrapFly API key\", # Get your API key from https://www.scrapfly.io/\n",
" ignore_scrape_failures=True, # Ignore unprocessable web pages and log their exceptions\n",
" scrape_config=scrapfly_scrape_config, # Pass the scrape_config object\n",
" scrape_format=\"markdown\", # The scrape result format, either `markdown`(default) or `text`\n",
")\n",
"\n",
"# Load documents from URLs as markdown\n",
"documents = scrapfly_loader.load()\n",
"print(documents)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -403,6 +403,9 @@ if TYPE_CHECKING:
    from langchain_community.document_loaders.s3_file import (
        S3FileLoader,
    )
    from langchain_community.document_loaders.scrapfly import (
        ScrapflyLoader,
    )
    from langchain_community.document_loaders.sharepoint import (
        SharePointLoader,
    )
@ -654,6 +657,7 @@ _module_lookup = {
"RocksetLoader": "langchain_community.document_loaders.rocksetdb",
"S3DirectoryLoader": "langchain_community.document_loaders.s3_directory",
"S3FileLoader": "langchain_community.document_loaders.s3_file",
"ScrapflyLoader": "langchain_community.document_loaders.scrapfly",
"SQLDatabaseLoader": "langchain_community.document_loaders.sql_database",
"SRTLoader": "langchain_community.document_loaders.srt",
"SeleniumURLLoader": "langchain_community.document_loaders.url_selenium",
@ -854,6 +858,7 @@ __all__ = [
"RocksetLoader",
"S3DirectoryLoader",
"S3FileLoader",
"ScrapflyLoader",
"SQLDatabaseLoader",
"SRTLoader",
"SeleniumURLLoader",

@ -0,0 +1,69 @@
"""Scrapfly Web Reader."""
import logging
from typing import Iterator, List, Literal, Optional
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_env
logger = logging.getLogger(__file__)
class ScrapflyLoader(BaseLoader):
"""Turn a url to llm accessible markdown with `Scrapfly.io`.
For further details, visit: https://scrapfly.io/docs/sdk/python
"""
def __init__(
self,
urls: List[str],
*,
api_key: Optional[str] = None,
scrape_format: Literal["markdown", "text"] = "markdown",
scrape_config: Optional[dict] = None,
continue_on_failure: bool = True,
) -> None:
"""Initialize client.
Args:
urls: List of urls to scrape.
api_key: The Scrapfly API key. If not specified must have env var
SCRAPFLY_API_KEY set.
scrape_format: Scrape result format, one or "markdown" or "text".
scrape_config: Dictionary of ScrapFly scrape config object.
continue_on_failure: Whether to continue if scraping a url fails.
"""
try:
from scrapfly import ScrapflyClient
except ImportError:
raise ImportError(
"`scrapfly` package not found, please run `pip install scrapfly-sdk`"
)
if not urls:
raise ValueError("URLs must be provided.")
api_key = api_key or get_from_env("api_key", "SCRAPFLY_API_KEY")
self.scrapfly = ScrapflyClient(key=api_key)
self.urls = urls
self.scrape_format = scrape_format
self.scrape_config = scrape_config
self.continue_on_failure = continue_on_failure
def lazy_load(self) -> Iterator[Document]:
from scrapfly import ScrapeConfig
scrape_config = self.scrape_config if self.scrape_config is not None else {}
for url in self.urls:
try:
response = self.scrapfly.scrape(
ScrapeConfig(url, format=self.scrape_format, **scrape_config)
)
yield Document(
page_content=response.scrape_result["content"],
metadata={"url": url},
)
except Exception as e:
if self.continue_on_failure:
logger.error(f"Error fetching data from {url}, exception: {e}")
else:
raise e
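Given the implementation above, a short sketch of how the loader resolves the API key from the environment and streams documents via `lazy_load()` (reusing the notebook's example URL; the key value is a placeholder):

```python
import os

from langchain_community.document_loaders import ScrapflyLoader

# If api_key is not passed, get_from_env falls back to SCRAPFLY_API_KEY.
os.environ["SCRAPFLY_API_KEY"] = "your-scrapfly-api-key"  # placeholder

loader = ScrapflyLoader(
    ["https://web-scraping.dev/products"],
    scrape_format="text",        # switch from the default "markdown"
    continue_on_failure=True,    # log and skip failed URLs instead of raising
)

# lazy_load() yields Documents one by one instead of materializing a list.
for doc in loader.lazy_load():
    print(doc.metadata["url"], len(doc.page_content))
```

Because `lazy_load()` is a generator and exceptions are caught per URL, a single unprocessable page does not abort the remaining URLs when `continue_on_failure=True`.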

@ -138,6 +138,7 @@ EXPECTED_ALL = [
"RocksetLoader",
"S3DirectoryLoader",
"S3FileLoader",
"ScrapflyLoader",
"SQLDatabaseLoader",
"SRTLoader",
"SeleniumURLLoader",
