From 4b1b7959a21cd7915797f9bd788a373472daab69 Mon Sep 17 00:00:00 2001
From: Oleg Kulyk
Date: Thu, 25 Jul 2024 04:11:43 +0300
Subject: [PATCH] community[minor]: Add ScrapingAnt Loader Community
 Integration (#24514)

Added [ScrapingAnt](https://scrapingant.com/) Web Loader integration.
ScrapingAnt is a web scraping API that allows extracting web page data into
accessible and well-formatted markdown.

Description: Added ScrapingAnt web loader for retrieving web page data as markdown
Dependencies: scrapingant-client
Twitter: @WeRunTheWorld3

---------

Co-authored-by: Oleg Kulyk
---
 .../document_loaders/scrapingant.ipynb        | 188 ++++++++++++++++++
 .../document_loaders/__init__.py              |   5 +
 .../document_loaders/scrapingant.py           |  66 ++++++
 .../document_loaders/test_imports.py          |   1 +
 4 files changed, 260 insertions(+)
 create mode 100644 docs/docs/integrations/document_loaders/scrapingant.ipynb
 create mode 100644 libs/community/langchain_community/document_loaders/scrapingant.py

diff --git a/docs/docs/integrations/document_loaders/scrapingant.ipynb b/docs/docs/integrations/document_loaders/scrapingant.ipynb
new file mode 100644
index 0000000000..46de054f4c
--- /dev/null
+++ b/docs/docs/integrations/document_loaders/scrapingant.ipynb
@@ -0,0 +1,188 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "sidebar_label: ScrapingAnt\n",
+    "---\n",
+    "\n",
+    "# ScrapingAnt\n",
+    "## Overview\n",
+    "[ScrapingAnt](https://scrapingant.com/) is a web scraping API with headless browser capabilities, proxies, and anti-bot bypass. It extracts web page data into accessible, LLM-ready markdown.\n",
+    "\n",
+    "This integration uses only the Markdown extraction feature, but don't hesitate to [reach out to us](mailto:support@scrapingant.com) if you need other ScrapingAnt features that are not yet implemented in this integration.\n",
+    "\n",
+    "### Integration details\n",
+    "\n",
+    "| Class | Package | Local | Serializable | JS support |\n",
+    "|:---------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------|:-----:|:------------:|:----------:|\n",
+    "| [ScrapingAntLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.scrapingant.ScrapingAntLoader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ❌ | ❌ | ❌ |\n",
+    "\n",
+    "### Loader features\n",
+    "| Source | Document Lazy Loading | Async Support |\n",
+    "|:-----------------:|:---------------------:|:-------------:|\n",
+    "| ScrapingAntLoader | ✅ | ❌ |\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Install the ScrapingAnt Python SDK and the required LangChain packages using pip:\n",
+    "```shell\n",
+    "pip install scrapingant-client langchain langchain-community\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "## Instantiation"
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-07-22T18:18:50.903258Z",
+     "start_time": "2024-07-22T18:18:35.265390Z"
+    }
+   },
+   "source": [
+    "from langchain_community.document_loaders import ScrapingAntLoader\n",
+    "\n",
+    "scrapingant_loader = ScrapingAntLoader(\n",
+    "    [\"https://scrapingant.com/\", \"https://example.com/\"], # List of URLs to scrape\n",
+    "    api_key=\"\", # 
Get your API key from https://scrapingant.com/\n", + " continue_on_failure=True, # Ignore unprocessable web pages and log their exceptions\n", + ")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(metadata={'url': 'https://scrapingant.com/'}, page_content=\"![](images/loader.svg)\\n\\n[![](images/ScrapingAnt-1.svg)](/) Features Pricing\\n\\nServices\\n\\n[Web Scraping API](/) [LLM-ready data extraction](/llm-ready-data-extraction)\\n[AI data scraping](/ai-data-scraper) [Residential Proxy](/residential-proxies)\\n\\n[Blog](https://scrapingant.com/blog/)\\n\\nDocumentatation\\n\\n[Web Scraping API](https://docs.scrapingant.com) [Residential\\nProxies](https://proxydocs.scrapingant.com)\\n\\nContact Us\\n\\n[Sign In](https://app.scrapingant.com/login)\\n\\n![](images/icon-menu.svg)\\n\\n![](images/Capterra-Rating.png)\\n\\n# Enterprise-Grade Scraping API. \\nAnt Sized Pricing.\\n\\n## Get the mission-critical speed, reliability, and features you need at a\\nfraction of the cost! \\n\\nGot Questions? \\n(get expert advice)\\n\\n[ Try Our Free Plan (10,000 API Credits) ](https://app.scrapingant.com/signup)\\n\\n![](images/lines-10-white.svg)![](images/lines-12-white.svg)\\n\\n### Proudly scaling with us\\n\\n![](images/_2cd6c6d09d261d19_281d72aa098ecca8.png)![](images/_bb8ca9c8d001abd4_dc29a36ce27bdee8_1_bb8ca9c8d001abd4_dc29a36ce27bdee8.png)![](images/_d84700234b61df23_9abf58d176a2d7fc.png)![](images/_ca6d37170ae5cd25_fca779750afd17ef.png)![](images/Screenshot-2024-05-22-at-23.28.16.png)\\n\\n### Industry Leading Pricing\\n\\nFrom our generous 10,000 API credit free plan to our industry leading paid\\nplans, we strive to provide unbeatable bang for your buck. That's just what\\nants do! \\n\\u200d\\n\\n![](images/industry-leading-prcing--compressed.webp)\\n\\nCost per 1,000 API Credits - Level 1 Plan\\n\\n### Unparalleled Value\\n\\nLow cost per API credit is great, but what’s even more important is how much\\ndata you can actually collect for each credit spent. Like any good Ant we\\nnever waste a crumb!\\n\\n![](images/unparalleled-value-compressed.webp)\\n\\nGoogle SERP API - Cost per 1,000 Requests – Level 1 Plan\\n\\n![](images/Doodle-4-White.svg)![](images/Doodle-Left-1-White.svg)\\n\\n## Ultimate Black Box Scraping Solution\\n\\n### Unlimited Concurrency \\n\\u200d\\n\\nWith unlimited parallel requests easily gather LARGE volumes of data from\\nmultiple locations in record time. Available on ALL plan levels. \\n\\u200d\\n\\n### Lightning Fast Scraping WITHOUT Getting Blocked\\n\\nOur proprietary algo seamlessly switches to the exact right proxy for almost\\nany situation, saving you and your dev team countless hours of frustration. 
\\n\\u200d\\n\\n#### What's inside?\\n\\n * Chrome Page Rendering\\n\\n * Low Latency Rotating Proxies \\n\\n * Javascript Execution\\n\\n * Custom Cookies\\n\\n * Fastest AWS & Hetzner Servers\\n\\n * Unlimited Parallel Requests\\n\\n * Headless Browsers \\n\\n * Residential Proxies\\n\\n * Supports All Programming Languages & Proxy\\n\\n * CAPTCHA Avoidance\\n\\n[ Try Our Free Plan (10,000 API Credits) ](https://app.scrapingant.com/signup)\\n\\n![](images/Doodle-3-White.svg)\\n\\n###### Metrics\\n\\n## The most reliable web scraping API\\n\\nOur clients have saved up to 40% of data collection budgets by integrating\\nScrapingAnt API instead of self-made solutions development.\\n\\n99.99%\\n\\nUptime over the last year.\\n\\n85.5%\\n\\nAnti-scraping avoidance rate with our custom cloud browser solution\\n\\n![](images/icon-gallery-dark.svg)\\n\\n### Unlimited parallel requests\\n\\n![](images/icon-id-dark.svg)\\n\\n### 3+ million proxy servers across the world\\n\\n![](images/icon-switcher-white.svg)\\n\\n### Open your web page as in a real browser\\n\\n![](images/Doodle-9-Dark.svg)\\n\\nSimple API integration\\n\\n1\\n\\n### Choose your plan\\n\\nWe offer subscription plans, or you can always request custom pricing. \\n **Free for personal use!**\\n\\n2\\n\\n### Test the API\\n\\nScrape your target website with our **UI request executor** or generate\\nscraping code for your preferred language.\\n\\n3\\n\\n### Scrape the Web\\n\\nBuild your data extraction pipeline using our **API** and forget about **rate\\nlimits** and **blocks**.\\n\\n![](images/Doodle-4-White.svg)![](images/Doodle-Left-1-White.svg)\\n\\n###### Pricing\\n\\n## Industry leading pricing that scales with your business.\\n\\n### Enthusiast\\n\\n#### 100.000 API credits\\n\\n$19\\n\\n/mo\\n\\nIdeal for freelancers or students.\\n\\n[ Get Started ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nEmail support\\n\\n![](images/check-small.svg)\\n\\nDocumentation-only integration\\n\\n### Startup\\n\\n#### 500.000 API credits\\n\\n$49\\n\\n/mo\\n\\nFor small to medium sized teams looking to grow. 
\\n \\nPopular choice!\\n\\n[ Get Started ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nPriority email support\\n\\n![](images/check-small.svg)\\n\\nExpert assistance\\n\\n![](images/check-small.svg)\\n\\nIntegration with custom code snippets\\n\\n### Business\\n\\n#### 3.000.000 API credits\\n\\n$249\\n\\n/mo\\n\\nFor larger teams and companies.\\n\\n[ Get Started ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nPriority email support\\n\\n![](images/check-small.svg)\\n\\nLive integration calls\\n\\n![](images/check-small.svg)\\n\\nExpert guidance and integration planning\\n\\n![](images/check-small.svg)\\n\\nCustom proxy pools\\n\\n![](images/check-small.svg)\\n\\nCustom avoidances\\n\\n![](images/check-small.svg)\\n\\nDedicated manager\\n\\n### Business Pro\\n\\n#### 8.000.000 API credits\\n\\n$599\\n\\n/mo\\n\\nExtended volume Business plan.\\n\\n[ Get Started ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nPriority email support\\n\\n![](images/check-small.svg)\\n\\nLive integration calls\\n\\n![](images/check-small.svg)\\n\\nExpert guidance and integration planning\\n\\n![](images/check-small.svg)\\n\\nCustom proxy pools\\n\\n![](images/check-small.svg)\\n\\nCustom avoidances\\n\\n![](images/check-small.svg)\\n\\nDedicated manager\\n\\n### Custom Plan\\n\\n#### 10M+ API credits\\n\\n$699+\\n\\n/mo\\n\\nExplore custom deals and services we could provide for Enterprise level\\ncustomers.\\n\\n[ Contact us ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nFully customisable solution\\n\\n![](images/check-small.svg)\\n\\nResidential Proxy special prices\\n\\n![](images/check-small.svg)\\n\\nSLA\\n\\n[![](images/Capterra-\\nRating.png)](https://www.capterra.com/p/214735/ScrapingAnt/reviews/)\\n\\n★ ★ ★ ★ ★\\n\\n![](images/5521ce5758e089d7d7f5d226a2e995c3.jpg)\\n\\n#### “Onboarding and API integration was smooth and clear. Everything works\\ngreat. The support was excellent. **Overall a great scraper**.”\\n\\nIllia K., Android Software Developer\\n\\n★ ★ ★ ★ ★\\n\\n![](images/e57164aafb18d9a888776c96cf159368.jpg)\\n\\n#### “Great communication with co-founders helped me to get the job done.\\nGreat proxy diversity and good price.”\\n\\nAndrii M., Senior Software Engineer\\n\\n★ ★ ★ ★ ★\\n\\n![](images/Dmytro-T..jpg)\\n\\n#### “This product helps me to scale and extend my business. The API is easy\\nto integrate and support is really good.”\\n\\nDmytro T., Senior Software Engineer\\n\\n![](images/Doodle-7-Dark.svg)![](images/Doodle-8-Dark.svg)\\n\\n#### Frequently asked questions.\\n\\nIf you have any further questions, [Get in\\ntouch](https://scrapingant.com/#contact) with our friendly team\\n\\n##### What is ScrapingAnt?\\n\\n![](images/icon-arrow-right.svg)\\n\\nScrapingAnt is a service that helps you to solve scraping tasks of any\\ncomplexity. With using of millions proxies around the World and a whole\\nheadless browser cluster we can provide you the best web harvesting and\\nscraping experience. \\n \\nScrapingAnt also provides a custom software development service. Data\\nharvesting, data storage or data querying - we can provide you the best and\\naffordable custom solution that fits all your needs.\\n\\n##### **What is an API Credit?**\\n\\n![](images/icon-arrow-right.svg)\\n\\nEach subscription plan contains a particular amount of API credits per month.\\nDepending on the parameters you configures your API calls it will cost you\\nfrom one to several credits. 
By default, each request costs 10 API credits\\nbecause JavaScript rendering and Standard proxies are enabled. [Learn more\\nabout requests costs](https://docs.scrapingant.com/api-credits-usage).\\n\\n##### I'm not a developer, can you create custom scraping solutions for me?\\n\\n![](images/icon-arrow-right.svg)\\n\\nYes of course! We regularly create custom scraping scripts and projects for\\nour clients. We are also partnering with several custom software development\\ncompanies, so we won't never be out of resources to help with a scraping\\nproject of any size. Just [Contact Us](https://scrapingant.com/#contact) and\\ndescribe your needs.\\n\\n##### Do I need a credit cart to start the free trial?\\n\\n![](images/icon-arrow-right.svg)\\n\\nScrapingAnt provides a completely free subscription plan which contains 10.000\\nAPI credits that can be consumed during month. Until you will need more - it\\nis completely free and doesn't require a credit card.\\n\\n### “Our clients are pleasantly surprised by the response speed of our team.”\\n\\n![](images/oleg-cartoon-image.jpg)\\n\\nOleg Kulyk, \\nScrapingAnt Founder\\n\\n* Our team will contact you ASAP.\\n\\nThank you! Your submission has been received!\\n\\nOops! Something went wrong while submitting the form.\\n\\n![](images/illustration-speed-lines-white.svg)\\n\\n## Grow your business with us\\n\\n[ Try Our Free Plan! ](https://app.scrapingant.com/signup)\\n\\n[\\n\\n## Features\\n\\n](https://scrapingant.com/#features) [\\n\\n## Pricing\\n\\n](https://scrapingant.com/#pricing) [\\n\\n## Blog\\n\\n](https://scrapingant.com/blog/) [\\n\\n## Documentation\\n\\n](https://docs.scrapingant.com/) [\\n\\n## Web Scraping API\\n\\n](https://scrapingant.com) [\\n\\n## LLM-ready web data\\n\\n](llm-ready-data-extraction.html) [\\n\\n## Residential Proxy\\n\\n](residential-proxies.html) [\\n\\n## Custom Scraper Development\\n\\n](https://scrapingant.com/custom-scraping-solution) [\\n\\n## Affiliate program\\n\\n](https://scrapingant.com/legal/affiliate/) [\\n\\n## Free proxies\\n\\n](https://scrapingant.com/free-proxies/)\\n\\n###### Web Scraping 101 \\n\\n[What is Web Scraping?](https://docs.scrapingant.com/web-scraping-101/what-is-\\nweb-scraping) [**Is Web Scraping Legal?**](https://scrapingant.com/blog/is-\\nweb-scraping-legal) [**10 Main Proxy\\nTypes**](https://scrapingant.com/blog/main-proxy-types) [Datacenter vs\\nResidential Proxies](https://scrapingant.com/blog/residential-vs-datacenter-\\nproxy-webscraping) [Best Proxy Scraping\\nTools](https://scrapingant.com/blog/top-open-source-proxy-scrapers)\\n[**Overcoming scraping challenges with Web Scraping\\nAPI**](https://scrapingant.com/blog/data-scraping-challenges) [IP rate-\\nlimiting avoidance](https://scrapingant.com/blog/avoid-ip-rate-limiting)\\n[Rotating proxies with Puppeteer](https://scrapingant.com/blog/how-to-use-\\nrotating-proxies-with-puppeteer) [Scraping Dynamic Website with\\nPython](https://scrapingant.com/blog/scrape-dynamic-website-with-python) [Web\\nScraping with Python](https://scrapingant.com/blog/top-5-popular-python-\\nlibraries-for-web-scraping-in-2020) [Web Scraping with\\nJava](https://scrapingant.com/blog/web-scraping-java) [Web Scraping with\\nNodeJS](https://scrapingant.com/blog/web-scraping-javascript) [Web Scraping\\nwith Deno](https://scrapingant.com/blog/deno-web-scraping) [**Web Scraping\\nwith R**](https://scrapingant.com/blog/r-web-scraping) [**Web Scraping with\\nPHP**](https://scrapingant.com/blog/web-scraping-php) [**Web Scraping 
with\\nGo**](https://scrapingant.com/blog/web-scraping-go)\\n\\n###### Use Cases \\n\\n[**Real estate decisions with Booking.com\\nscraping**](https://scrapingant.com/blog/booking-data-scraping) [**Sneaker\\nPrice Data Collection with Web Scraping\\nAPI**](https://scrapingant.com/blog/sneakers-scraping-api) [**Best Web\\nScraping APIs For Freelancers**](https://scrapingant.com/blog/best-web-\\nscraping-api-freelance) [**Smart NFT Decisions with Data\\nCollection**](https://scrapingant.com/blog/nft-data-collection) [**How Data\\nCollection Can Improve HR Processes**](https://scrapingant.com/blog/data-\\ncollection-for-hr-processes) [**Rule eCommerce with Data\\nCollection**](https://scrapingant.com/blog/data-collection-for-ecommerce)\\n[**How companies use Web Scraping to gain a Competitive\\nEdge**](https://scrapingant.com/blog/how-companies-use-web-scraping)\\n[**Benefits of Web Scraping for\\nHospitality**](https://scrapingant.com/blog/web-scraping-for-hospitality)\\n[**Uses of Web Scraping for Price\\nMonitoring**](https://scrapingant.com/blog/web-scraping-for-price-monitoring)\\n[**Benefits of Web Scraping for Real\\nEstate**](https://scrapingant.com/blog/web-scraping-for-real-estate) [**Web\\nScraping for Data Scientists**](https://scrapingant.com/blog/web-scraping-for-\\ndata-scientists) [**How to Collect Data from\\nTikTok**](https://scrapingant.com/blog/web-scraping-for-price-monitoring)\\n\\n###### Legal \\n\\n[Terms of Use](https://scrapingant.com/legal/terms-of-use) [Privacy\\nPolicy](https://scrapingant.com/legal/privacy-policy) [Cookies\\nPolicy](https://scrapingant.com/legal/cookies-policy)\\n\\n###### External Links \\n\\n[Github](https://github.com/ScrapingAnt)\\n[Linkedin](https://linkedin.com/company/scrapingant)\\n[Facebook](https://www.facebook.com/scrapingant)\\n[Twitter](https://twitter.com/ScrapingAnt)\\n\\n[![](images/ScrapingAnt-2.svg)](https://scrapingant.com)\\n\\n© Copyright ScrapingAnt \\nPowered by [DATAANT](https://scrapingant.com)\\n\\n![](images/lines-13-white.svg)\\n\\nBy browsing this site, you agree to our [Cookies\\nPolicy](https://scrapingant.com/legal/cookies-policy)\\n\\n![](images/icon-x_1.svg)\\n\\n\"), Document(metadata={'url': 'https://example.com/'}, page_content='# Example Domain\\n\\nThis domain is for use in illustrative examples in documents. You may use this\\ndomain in literature without prior coordination or asking for permission.\\n\\n[More information...](https://www.iana.org/domains/example)\\n\\n')]\n"
+     ]
+    }
+   ],
+   "execution_count": 6
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "The `ScrapingAntLoader` also allows providing a scrape config dict for customizing the scrape request. Since it is based on the [ScrapingAnt Python SDK](https://github.com/ScrapingAnt/scrapingant-client-python), you can pass any of the [common arguments](https://github.com/ScrapingAnt/scrapingant-client-python/tree/master?tab=readme-ov-file#common-arguments) to the `scrape_config` parameter."
+ }, + { + "cell_type": "code", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-21T22:02:30.701905Z", + "start_time": "2024-07-21T22:02:29.036115Z" + } + }, + "source": [ + "from langchain_community.document_loaders import ScrapingAntLoader\n", + "\n", + "scrapingant_config = {\n", + " \"browser\": True, # Enable browser rendering with a cloud browser\n", + " \"proxy_type\": \"datacenter\", # Select a proxy type (datacenter or residential)\n", + " \"proxy_country\": \"us\", # Select a proxy location\n", + "}\n", + "\n", + "scrapingant_additional_config_loader = ScrapingAntLoader(\n", + " [\"https://scrapingant.com/\"],\n", + " api_key=\"\", # Get your API key from https://scrapingant.com/\n", + " continue_on_failure=True, # Ignore unprocessable web pages and log their exceptions\n", + " scrape_config=scrapingant_config, # Pass the scrape_config object\n", + ")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(metadata={'url': 'https://scrapingant.com/'}, page_content=\"![](images/loader.svg)\\n\\n[![](images/ScrapingAnt-1.svg)](/) Features Pricing\\n\\nServices\\n\\n[Web Scraping API](/) [LLM-ready data extraction](/llm-ready-data-extraction)\\n[AI data scraping](/ai-data-scraper) [Residential Proxy](/residential-proxies)\\n\\n[Blog](https://scrapingant.com/blog/)\\n\\nDocumentatation\\n\\n[Web Scraping API](https://docs.scrapingant.com) [Residential\\nProxies](https://proxydocs.scrapingant.com)\\n\\nContact Us\\n\\n[Sign In](https://app.scrapingant.com/login)\\n\\n![](images/icon-menu.svg)\\n\\n![](images/Capterra-Rating.png)\\n\\n# Enterprise-Grade Scraping API. \\nAnt Sized Pricing.\\n\\n## Get the mission-critical speed, reliability, and features you need at a\\nfraction of the cost! \\n\\nGot Questions? \\n(get expert advice)\\n\\n[ Try Our Free Plan (10,000 API Credits) ](https://app.scrapingant.com/signup)\\n\\n![](images/lines-10-white.svg)![](images/lines-12-white.svg)\\n\\n### Proudly scaling with us\\n\\n![](images/_2cd6c6d09d261d19_281d72aa098ecca8.png)![](images/_bb8ca9c8d001abd4_dc29a36ce27bdee8_1_bb8ca9c8d001abd4_dc29a36ce27bdee8.png)![](images/_d84700234b61df23_9abf58d176a2d7fc.png)![](images/_ca6d37170ae5cd25_fca779750afd17ef.png)![](images/Screenshot-2024-05-22-at-23.28.16.png)\\n\\n### Industry Leading Pricing\\n\\nFrom our generous 10,000 API credit free plan to our industry leading paid\\nplans, we strive to provide unbeatable bang for your buck. That's just what\\nants do! \\n\\u200d\\n\\n![](images/industry-leading-prcing--compressed.webp)\\n\\nCost per 1,000 API Credits - Level 1 Plan\\n\\n### Unparalleled Value\\n\\nLow cost per API credit is great, but what’s even more important is how much\\ndata you can actually collect for each credit spent. Like any good Ant we\\nnever waste a crumb!\\n\\n![](images/unparalleled-value-compressed.webp)\\n\\nGoogle SERP API - Cost per 1,000 Requests – Level 1 Plan\\n\\n![](images/Doodle-4-White.svg)![](images/Doodle-Left-1-White.svg)\\n\\n## Ultimate Black Box Scraping Solution\\n\\n### Unlimited Concurrency \\n\\u200d\\n\\nWith unlimited parallel requests easily gather LARGE volumes of data from\\nmultiple locations in record time. Available on ALL plan levels. \\n\\u200d\\n\\n### Lightning Fast Scraping WITHOUT Getting Blocked\\n\\nOur proprietary algo seamlessly switches to the exact right proxy for almost\\nany situation, saving you and your dev team countless hours of frustration. 
\\n\\u200d\\n\\n#### What's inside?\\n\\n * Chrome Page Rendering\\n\\n * Low Latency Rotating Proxies \\n\\n * Javascript Execution\\n\\n * Custom Cookies\\n\\n * Fastest AWS & Hetzner Servers\\n\\n * Unlimited Parallel Requests\\n\\n * Headless Browsers \\n\\n * Residential Proxies\\n\\n * Supports All Programming Languages & Proxy\\n\\n * CAPTCHA Avoidance\\n\\n[ Try Our Free Plan (10,000 API Credits) ](https://app.scrapingant.com/signup)\\n\\n![](images/Doodle-3-White.svg)\\n\\n###### Metrics\\n\\n## The most reliable web scraping API\\n\\nOur clients have saved up to 40% of data collection budgets by integrating\\nScrapingAnt API instead of self-made solutions development.\\n\\n99.99%\\n\\nUptime over the last year.\\n\\n85.5%\\n\\nAnti-scraping avoidance rate with our custom cloud browser solution\\n\\n![](images/icon-gallery-dark.svg)\\n\\n### Unlimited parallel requests\\n\\n![](images/icon-id-dark.svg)\\n\\n### 3+ million proxy servers across the world\\n\\n![](images/icon-switcher-white.svg)\\n\\n### Open your web page as in a real browser\\n\\n![](images/Doodle-9-Dark.svg)\\n\\nSimple API integration\\n\\n1\\n\\n### Choose your plan\\n\\nWe offer subscription plans, or you can always request custom pricing. \\n **Free for personal use!**\\n\\n2\\n\\n### Test the API\\n\\nScrape your target website with our **UI request executor** or generate\\nscraping code for your preferred language.\\n\\n3\\n\\n### Scrape the Web\\n\\nBuild your data extraction pipeline using our **API** and forget about **rate\\nlimits** and **blocks**.\\n\\n![](images/Doodle-4-White.svg)![](images/Doodle-Left-1-White.svg)\\n\\n###### Pricing\\n\\n## Industry leading pricing that scales with your business.\\n\\n### Enthusiast\\n\\n#### 100.000 API credits\\n\\n$19\\n\\n/mo\\n\\nIdeal for freelancers or students.\\n\\n[ Get Started ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nEmail support\\n\\n![](images/check-small.svg)\\n\\nDocumentation-only integration\\n\\n### Startup\\n\\n#### 500.000 API credits\\n\\n$49\\n\\n/mo\\n\\nFor small to medium sized teams looking to grow. 
\\n \\nPopular choice!\\n\\n[ Get Started ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nPriority email support\\n\\n![](images/check-small.svg)\\n\\nExpert assistance\\n\\n![](images/check-small.svg)\\n\\nIntegration with custom code snippets\\n\\n### Business\\n\\n#### 3.000.000 API credits\\n\\n$249\\n\\n/mo\\n\\nFor larger teams and companies.\\n\\n[ Get Started ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nPriority email support\\n\\n![](images/check-small.svg)\\n\\nLive integration calls\\n\\n![](images/check-small.svg)\\n\\nExpert guidance and integration planning\\n\\n![](images/check-small.svg)\\n\\nCustom proxy pools\\n\\n![](images/check-small.svg)\\n\\nCustom avoidances\\n\\n![](images/check-small.svg)\\n\\nDedicated manager\\n\\n### Business Pro\\n\\n#### 8.000.000 API credits\\n\\n$599\\n\\n/mo\\n\\nExtended volume Business plan.\\n\\n[ Get Started ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nPriority email support\\n\\n![](images/check-small.svg)\\n\\nLive integration calls\\n\\n![](images/check-small.svg)\\n\\nExpert guidance and integration planning\\n\\n![](images/check-small.svg)\\n\\nCustom proxy pools\\n\\n![](images/check-small.svg)\\n\\nCustom avoidances\\n\\n![](images/check-small.svg)\\n\\nDedicated manager\\n\\n### Custom Plan\\n\\n#### 10M+ API credits\\n\\n$699+\\n\\n/mo\\n\\nExplore custom deals and services we could provide for Enterprise level\\ncustomers.\\n\\n[ Contact us ](https://app.scrapingant.com/signup)\\n\\n![](images/check-small.svg)\\n\\nFully customisable solution\\n\\n![](images/check-small.svg)\\n\\nResidential Proxy special prices\\n\\n![](images/check-small.svg)\\n\\nSLA\\n\\n[![](images/Capterra-\\nRating.png)](https://www.capterra.com/p/214735/ScrapingAnt/reviews/)\\n\\n★ ★ ★ ★ ★\\n\\n![](images/5521ce5758e089d7d7f5d226a2e995c3.jpg)\\n\\n#### “Onboarding and API integration was smooth and clear. Everything works\\ngreat. The support was excellent. **Overall a great scraper**.”\\n\\nIllia K., Android Software Developer\\n\\n★ ★ ★ ★ ★\\n\\n![](images/e57164aafb18d9a888776c96cf159368.jpg)\\n\\n#### “Great communication with co-founders helped me to get the job done.\\nGreat proxy diversity and good price.”\\n\\nAndrii M., Senior Software Engineer\\n\\n★ ★ ★ ★ ★\\n\\n![](images/Dmytro-T..jpg)\\n\\n#### “This product helps me to scale and extend my business. The API is easy\\nto integrate and support is really good.”\\n\\nDmytro T., Senior Software Engineer\\n\\n![](images/Doodle-7-Dark.svg)![](images/Doodle-8-Dark.svg)\\n\\n#### Frequently asked questions.\\n\\nIf you have any further questions, [Get in\\ntouch](https://scrapingant.com/#contact) with our friendly team\\n\\n##### What is ScrapingAnt?\\n\\n![](images/icon-arrow-right.svg)\\n\\nScrapingAnt is a service that helps you to solve scraping tasks of any\\ncomplexity. With using of millions proxies around the World and a whole\\nheadless browser cluster we can provide you the best web harvesting and\\nscraping experience. \\n \\nScrapingAnt also provides a custom software development service. Data\\nharvesting, data storage or data querying - we can provide you the best and\\naffordable custom solution that fits all your needs.\\n\\n##### **What is an API Credit?**\\n\\n![](images/icon-arrow-right.svg)\\n\\nEach subscription plan contains a particular amount of API credits per month.\\nDepending on the parameters you configures your API calls it will cost you\\nfrom one to several credits. 
By default, each request costs 10 API credits\\nbecause JavaScript rendering and Standard proxies are enabled. [Learn more\\nabout requests costs](https://docs.scrapingant.com/api-credits-usage).\\n\\n##### I'm not a developer, can you create custom scraping solutions for me?\\n\\n![](images/icon-arrow-right.svg)\\n\\nYes of course! We regularly create custom scraping scripts and projects for\\nour clients. We are also partnering with several custom software development\\ncompanies, so we won't never be out of resources to help with a scraping\\nproject of any size. Just [Contact Us](https://scrapingant.com/#contact) and\\ndescribe your needs.\\n\\n##### Do I need a credit cart to start the free trial?\\n\\n![](images/icon-arrow-right.svg)\\n\\nScrapingAnt provides a completely free subscription plan which contains 10.000\\nAPI credits that can be consumed during month. Until you will need more - it\\nis completely free and doesn't require a credit card.\\n\\n### “Our clients are pleasantly surprised by the response speed of our team.”\\n\\n![](images/oleg-cartoon-image.jpg)\\n\\nOleg Kulyk, \\nScrapingAnt Founder\\n\\n* Our team will contact you ASAP.\\n\\nThank you! Your submission has been received!\\n\\nOops! Something went wrong while submitting the form.\\n\\n![](images/illustration-speed-lines-white.svg)\\n\\n## Grow your business with us\\n\\n[ Try Our Free Plan! ](https://app.scrapingant.com/signup)\\n\\n[\\n\\n## Features\\n\\n](https://scrapingant.com/#features) [\\n\\n## Pricing\\n\\n](https://scrapingant.com/#pricing) [\\n\\n## Blog\\n\\n](https://scrapingant.com/blog/) [\\n\\n## Documentation\\n\\n](https://docs.scrapingant.com/) [\\n\\n## Web Scraping API\\n\\n](https://scrapingant.com) [\\n\\n## LLM-ready web data\\n\\n](llm-ready-data-extraction.html) [\\n\\n## Residential Proxy\\n\\n](residential-proxies.html) [\\n\\n## Custom Scraper Development\\n\\n](https://scrapingant.com/custom-scraping-solution) [\\n\\n## Affiliate program\\n\\n](https://scrapingant.com/legal/affiliate/) [\\n\\n## Free proxies\\n\\n](https://scrapingant.com/free-proxies/)\\n\\n###### Web Scraping 101 \\n\\n[What is Web Scraping?](https://docs.scrapingant.com/web-scraping-101/what-is-\\nweb-scraping) [**Is Web Scraping Legal?**](https://scrapingant.com/blog/is-\\nweb-scraping-legal) [**10 Main Proxy\\nTypes**](https://scrapingant.com/blog/main-proxy-types) [Datacenter vs\\nResidential Proxies](https://scrapingant.com/blog/residential-vs-datacenter-\\nproxy-webscraping) [Best Proxy Scraping\\nTools](https://scrapingant.com/blog/top-open-source-proxy-scrapers)\\n[**Overcoming scraping challenges with Web Scraping\\nAPI**](https://scrapingant.com/blog/data-scraping-challenges) [IP rate-\\nlimiting avoidance](https://scrapingant.com/blog/avoid-ip-rate-limiting)\\n[Rotating proxies with Puppeteer](https://scrapingant.com/blog/how-to-use-\\nrotating-proxies-with-puppeteer) [Scraping Dynamic Website with\\nPython](https://scrapingant.com/blog/scrape-dynamic-website-with-python) [Web\\nScraping with Python](https://scrapingant.com/blog/top-5-popular-python-\\nlibraries-for-web-scraping-in-2020) [Web Scraping with\\nJava](https://scrapingant.com/blog/web-scraping-java) [Web Scraping with\\nNodeJS](https://scrapingant.com/blog/web-scraping-javascript) [Web Scraping\\nwith Deno](https://scrapingant.com/blog/deno-web-scraping) [**Web Scraping\\nwith R**](https://scrapingant.com/blog/r-web-scraping) [**Web Scraping with\\nPHP**](https://scrapingant.com/blog/web-scraping-php) [**Web Scraping 
with\\nGo**](https://scrapingant.com/blog/web-scraping-go)\\n\\n###### Use Cases \\n\\n[**Real estate decisions with Booking.com\\nscraping**](https://scrapingant.com/blog/booking-data-scraping) [**Sneaker\\nPrice Data Collection with Web Scraping\\nAPI**](https://scrapingant.com/blog/sneakers-scraping-api) [**Best Web\\nScraping APIs For Freelancers**](https://scrapingant.com/blog/best-web-\\nscraping-api-freelance) [**Smart NFT Decisions with Data\\nCollection**](https://scrapingant.com/blog/nft-data-collection) [**How Data\\nCollection Can Improve HR Processes**](https://scrapingant.com/blog/data-\\ncollection-for-hr-processes) [**Rule eCommerce with Data\\nCollection**](https://scrapingant.com/blog/data-collection-for-ecommerce)\\n[**How companies use Web Scraping to gain a Competitive\\nEdge**](https://scrapingant.com/blog/how-companies-use-web-scraping)\\n[**Benefits of Web Scraping for\\nHospitality**](https://scrapingant.com/blog/web-scraping-for-hospitality)\\n[**Uses of Web Scraping for Price\\nMonitoring**](https://scrapingant.com/blog/web-scraping-for-price-monitoring)\\n[**Benefits of Web Scraping for Real\\nEstate**](https://scrapingant.com/blog/web-scraping-for-real-estate) [**Web\\nScraping for Data Scientists**](https://scrapingant.com/blog/web-scraping-for-\\ndata-scientists) [**How to Collect Data from\\nTikTok**](https://scrapingant.com/blog/web-scraping-for-price-monitoring)\\n\\n###### Legal \\n\\n[Terms of Use](https://scrapingant.com/legal/terms-of-use) [Privacy\\nPolicy](https://scrapingant.com/legal/privacy-policy) [Cookies\\nPolicy](https://scrapingant.com/legal/cookies-policy)\\n\\n###### External Links \\n\\n[Github](https://github.com/ScrapingAnt)\\n[Linkedin](https://linkedin.com/company/scrapingant)\\n[Facebook](https://www.facebook.com/scrapingant)\\n[Twitter](https://twitter.com/ScrapingAnt)\\n\\n[![](images/ScrapingAnt-2.svg)](https://scrapingant.com)\\n\\n© Copyright ScrapingAnt \\nPowered by [DATAANT](https://scrapingant.com)\\n\\n![](images/lines-13-white.svg)\\n\\nBy browsing this site, you agree to our [Cookies\\nPolicy](https://scrapingant.com/legal/cookies-policy)\\n\\n![](images/icon-x_1.svg)\\n\\n\")]\n" + ] + } + ], + "execution_count": 5 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Load\n", + "\n", + "Use the `load` method to scrape the web pages and get the extracted markdown content.\n" + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "# Load documents from URLs as markdown\n", + "documents = scrapingant_loader.load()\n", + "\n", + "print(documents)" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Lazy Load\n", + "\n", + "Use the 'lazy_load' method to scrape the web pages and get the extracted markdown content lazily." + ] + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "# Lazy load documents from URLs as markdown\n", + "lazy_documents = scrapingant_loader.lazy_load()\n", + "\n", + "for document in lazy_documents:\n", + " print(document)" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## API reference\n", + "\n", + "This loader is based on the [ScrapingAnt Python SDK](https://docs.scrapingant.com/python-client). 
For more configuration options, see the [common arguments](https://github.com/ScrapingAnt/scrapingant-client-python/tree/master?tab=readme-ov-file#common-arguments)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py
index e03f769312..eb059d6fbe 100644
--- a/libs/community/langchain_community/document_loaders/__init__.py
+++ b/libs/community/langchain_community/document_loaders/__init__.py
@@ -411,6 +411,9 @@ if TYPE_CHECKING:
     from langchain_community.document_loaders.scrapfly import (
         ScrapflyLoader,
     )
+    from langchain_community.document_loaders.scrapingant import (
+        ScrapingAntLoader,
+    )
     from langchain_community.document_loaders.sharepoint import (
         SharePointLoader,
     )
@@ -666,6 +669,7 @@ _module_lookup = {
     "S3DirectoryLoader": "langchain_community.document_loaders.s3_directory",
     "S3FileLoader": "langchain_community.document_loaders.s3_file",
     "ScrapflyLoader": "langchain_community.document_loaders.scrapfly",
+    "ScrapingAntLoader": "langchain_community.document_loaders.scrapingant",
     "SQLDatabaseLoader": "langchain_community.document_loaders.sql_database",
     "SRTLoader": "langchain_community.document_loaders.srt",
     "SeleniumURLLoader": "langchain_community.document_loaders.url_selenium",
@@ -870,6 +874,7 @@ __all__ = [
     "S3DirectoryLoader",
     "S3FileLoader",
     "ScrapflyLoader",
+    "ScrapingAntLoader",
     "SQLDatabaseLoader",
     "SRTLoader",
     "SeleniumURLLoader",
diff --git a/libs/community/langchain_community/document_loaders/scrapingant.py b/libs/community/langchain_community/document_loaders/scrapingant.py
new file mode 100644
index 0000000000..43b3bfd417
--- /dev/null
+++ b/libs/community/langchain_community/document_loaders/scrapingant.py
@@ -0,0 +1,66 @@
+"""ScrapingAnt Web Extractor."""
+
+import logging
+from typing import Iterator, List, Optional
+
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document
+from langchain_core.utils import get_from_env
+
+logger = logging.getLogger(__name__)
+
+
+class ScrapingAntLoader(BaseLoader):
+    """Turn a URL into LLM-accessible markdown with `ScrapingAnt`.
+
+    For further details, visit: https://docs.scrapingant.com/python-client
+    """
+
+    def __init__(
+        self,
+        urls: List[str],
+        *,
+        api_key: Optional[str] = None,
+        scrape_config: Optional[dict] = None,
+        continue_on_failure: bool = True,
+    ) -> None:
+        """Initialize the client.
+
+        Args:
+            urls: List of URLs to scrape.
+            api_key: The ScrapingAnt API key. If not specified, the
+                SCRAPINGANT_API_KEY environment variable must be set.
+            scrape_config: The scraping config passed to
+                ScrapingAntClient.markdown_request.
+            continue_on_failure: Whether to continue if scraping a URL fails.
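+
+        Example:
+            A minimal usage sketch; the URL and the placeholder API key
+            below are illustrative, not defaults:
+
+            .. code-block:: python
+
+                from langchain_community.document_loaders import ScrapingAntLoader
+
+                loader = ScrapingAntLoader(
+                    ["https://example.com/"],
+                    api_key="<your ScrapingAnt API key>",
+                )
+                docs = loader.load()  # Documents with markdown page_content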
+        """
+        try:
+            from scrapingant_client import ScrapingAntClient
+        except ImportError:
+            raise ImportError(
+                "`scrapingant-client` package not found,"
+                " run `pip install scrapingant-client`"
+            )
+        if not urls:
+            raise ValueError("URLs must be provided.")
+        api_key = api_key or get_from_env("api_key", "SCRAPINGANT_API_KEY")
+        self.client = ScrapingAntClient(token=api_key)
+        self.urls = urls
+        self.scrape_config = scrape_config
+        self.continue_on_failure = continue_on_failure
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Fetch data from ScrapingAnt."""
+        # Fall back to an empty config when none was provided.
+        scrape_config = self.scrape_config if self.scrape_config is not None else {}
+        for url in self.urls:
+            try:
+                result = self.client.markdown_request(url=url, **scrape_config)
+                yield Document(
+                    page_content=result.markdown,
+                    metadata={"url": result.url},
+                )
+            except Exception as e:
+                if self.continue_on_failure:
+                    logger.error(f"Error fetching data from {url}, exception: {e}")
+                else:
+                    raise
diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py
index 5cd9ce3d40..fbf624f537 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py
@@ -142,6 +142,7 @@ EXPECTED_ALL = [
     "S3DirectoryLoader",
     "S3FileLoader",
     "ScrapflyLoader",
+    "ScrapingAntLoader",
     "SQLDatabaseLoader",
     "SRTLoader",
     "SeleniumURLLoader",