mirror of https://github.com/hwchase17/langchain
community[minor]: Add Scrapfly Loader community integration (#22036)
Added [Scrapfly](https://scrapfly.io/) Web Loader integration. Scrapfly is a web scraping API that allows extracting web page data as accessible markdown or text datasets.

- __Description__: Added Scrapfly web loader for retrieving web page data as markdown or text.
- __Dependencies__: scrapfly-sdk
- __Twitter__: @thealchemi1st

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
parent 9a66c43146
commit 3c1d77dd64
@ -0,0 +1,107 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ScrapFly\n",
    "[ScrapFly](https://scrapfly.io/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It allows for extracting web page data into LLM-accessible markdown or text."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Installation\n",
    "Install the ScrapFly Python SDK and the required LangChain packages using pip:\n",
    "```shell\n",
    "pip install scrapfly-sdk langchain langchain-community\n",
    "```"
   ]
  },
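  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The loader takes the API key either as the `api_key` argument or, when it is omitted, from the `SCRAPFLY_API_KEY` environment variable, which keeps the key out of the notebook. A minimal sketch of the environment-variable route (the key value below is a placeholder):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Placeholder value; get a real key from https://www.scrapfly.io/\n",
    "os.environ[\"SCRAPFLY_API_KEY\"] = \"Your ScrapFly API key\""
   ]
  },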
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Usage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import ScrapflyLoader\n",
    "\n",
    "scrapfly_loader = ScrapflyLoader(\n",
    "    [\"https://web-scraping.dev/products\"],\n",
    "    api_key=\"Your ScrapFly API key\",  # Get your API key from https://www.scrapfly.io/\n",
    "    continue_on_failure=True,  # Ignore unprocessable web pages and log their exceptions\n",
    ")\n",
    "\n",
    "# Load documents from URLs as markdown\n",
    "documents = scrapfly_loader.load()\n",
    "print(documents)"
   ]
  },
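  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each result is a LangChain `Document` whose `page_content` holds the scraped markdown and whose metadata records the source URL. Because the loader implements `lazy_load`, pages can also be streamed one at a time rather than collected into a list up front, as in this minimal sketch reusing the loader above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stream documents one by one instead of loading them all at once\n",
    "for document in scrapfly_loader.lazy_load():\n",
    "    print(document.metadata[\"url\"], len(document.page_content))"
   ]
  },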
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The ScrapflyLoader also allows passing a ScrapeConfig object for customizing the scrape request. See the documentation for the full feature details and their API params: https://scrapfly.io/docs/scrape-api/getting-started"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import ScrapflyLoader\n",
    "\n",
    "scrapfly_scrape_config = {\n",
    "    \"asp\": True,  # Bypass scraping blocking and antibot solutions, like Cloudflare\n",
    "    \"render_js\": True,  # Enable JavaScript rendering with a cloud headless browser\n",
    "    \"proxy_pool\": \"public_residential_pool\",  # Select a proxy pool (datacenter or residential)\n",
    "    \"country\": \"us\",  # Select a proxy location\n",
    "    \"auto_scroll\": True,  # Auto scroll the page\n",
    "    \"js\": \"\",  # Execute custom JavaScript code via the headless browser\n",
    "}\n",
    "\n",
    "scrapfly_loader = ScrapflyLoader(\n",
    "    [\"https://web-scraping.dev/products\"],\n",
    "    api_key=\"Your ScrapFly API key\",  # Get your API key from https://www.scrapfly.io/\n",
    "    continue_on_failure=True,  # Ignore unprocessable web pages and log their exceptions\n",
    "    scrape_config=scrapfly_scrape_config,  # Pass the scrape_config object\n",
    "    scrape_format=\"markdown\",  # The scrape result format, either `markdown` (default) or `text`\n",
    ")\n",
    "\n",
    "# Load documents from URLs as markdown\n",
    "documents = scrapfly_loader.load()\n",
    "print(documents)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
@ -0,0 +1,69 @@
"""Scrapfly Web Reader."""
import logging
from typing import Iterator, List, Literal, Optional

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_env

logger = logging.getLogger(__file__)


class ScrapflyLoader(BaseLoader):
    """Turn a URL into LLM-accessible markdown with `Scrapfly.io`.

    For further details, visit: https://scrapfly.io/docs/sdk/python
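
    Example:
        A minimal usage sketch, with a placeholder API key:

        .. code-block:: python

            from langchain_community.document_loaders import ScrapflyLoader

            loader = ScrapflyLoader(
                ["https://web-scraping.dev/products"],
                api_key="Your ScrapFly API key",
            )
            documents = loader.load()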
    """

    def __init__(
        self,
        urls: List[str],
        *,
        api_key: Optional[str] = None,
        scrape_format: Literal["markdown", "text"] = "markdown",
        scrape_config: Optional[dict] = None,
        continue_on_failure: bool = True,
    ) -> None:
        """Initialize client.

        Args:
            urls: List of URLs to scrape.
            api_key: The Scrapfly API key. If not specified, the env var
                SCRAPFLY_API_KEY must be set.
            scrape_format: Scrape result format, either "markdown" or "text".
            scrape_config: Dictionary of ScrapFly scrape config options.
            continue_on_failure: Whether to continue if scraping a URL fails.
        """
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        if not urls:
            raise ValueError("URLs must be provided.")
        api_key = api_key or get_from_env("api_key", "SCRAPFLY_API_KEY")
        self.scrapfly = ScrapflyClient(key=api_key)
        self.urls = urls
        self.scrape_format = scrape_format
        self.scrape_config = scrape_config
        self.continue_on_failure = continue_on_failure

    def lazy_load(self) -> Iterator[Document]:
        """Lazily scrape each URL and yield the result as a Document."""
        from scrapfly import ScrapeConfig

        # Merge any user-supplied scrape options into each request
        scrape_config = self.scrape_config if self.scrape_config is not None else {}
        for url in self.urls:
            try:
                response = self.scrapfly.scrape(
                    ScrapeConfig(url, format=self.scrape_format, **scrape_config)
                )
                yield Document(
                    page_content=response.scrape_result["content"],
                    metadata={"url": url},
                )
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching data from {url}, exception: {e}")
                else:
                    raise e