diff --git a/README.md b/README.md
index c56867c..d015401 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,7 @@ Contents
     5. [Using with Firefox Containers](#using-with-firefox-containers)
     6. [Reverse Proxying](#reverse-proxying)
         1. [Nginx](#nginx)
+    7. [CAPTCHA Troubleshooting](#captcha-troubleshooting)
 7. [Contributing](#contributing)
 8. [FAQ](#faq)
 9. [Public Instances](#public-instances)
@@ -603,6 +604,22 @@ server {
 
 You can then add SSL support using LetsEncrypt by following a guide such as [this one](https://www.nginx.com/blog/using-free-ssltls-certificates-from-lets-encrypt-with-nginx/).
 
+### CAPTCHA Troubleshooting
+
+**Note:** The maintainer(s) of Whoogle do not endorse or recommend any
+particular approach for solving CAPTCHAs. The solution outlined below was
+implemented by the community, and is available to anyone who is interested.
+
+#### Death By Captcha ([https://deathbycaptcha.com/](https://deathbycaptcha.com/))
+
+To use Death By Captcha (DBC) with Whoogle, you'll need to perform the
+following steps:
+
+1. [Create an account with DBC](https://deathbycaptcha.com/register)
+2. Install the `deathbycaptcha` pip package (`pip install deathbycaptcha`)
+3. Set the `DBC_USER` environment variable to your DBC username
+4. Set the `DBC_PASS` environment variable to your DBC password
+
 ## Contributing
 
 Under the hood, Whoogle is a basic Flask app with the following structure:
diff --git a/app/request.py b/app/request.py
index fb3fd1e..b0c77ff 100644
--- a/app/request.py
+++ b/app/request.py
@@ -1,5 +1,6 @@
 from app.models.config import Config
 from app.utils.misc import read_config_bool
+from app.utils import captcha
 from datetime import datetime
 from defusedxml import ElementTree as ET
 import random
@@ -341,6 +342,24 @@ class Request:
             proxies=self.proxies,
             headers=headers,
             cookies=cookies)
+        if response.status_code == 429:
+            # google's CAPTCHA
+            # we have to handle it here because we filter out scripts from the page source
+            # later
+            print("WARN: CAPTCHA detected")
+            request_url = (base_url or self.search_url) + query
+            solved = captcha.solve(response, self.proxies, url=request_url)
+            if solved:
+                print("INFO: CAPTCHA solved. Retrying...")
+                response = requests.get(
+                    request_url,
+                    proxies=self.proxies,
+                    headers=headers,
+                    cookies=cookies
+                )
+                if response.status_code == 429:
+                    print("ERROR: It seems our IP is still blacklisted")
+
 
         # Retry query with new identity if using Tor (max 10 attempts)
         if 'form id="captcha-form"' in response.text and self.tor:
diff --git a/app/utils/captcha.py b/app/utils/captcha.py
new file mode 100644
index 0000000..e7893e4
--- /dev/null
+++ b/app/utils/captcha.py
@@ -0,0 +1,119 @@
+"""
+Integration with third party CAPTCHA solving services
+"""
+# only deathbycaptcha atm but whatever
+import os
+import json
+import requests
+
+from bs4 import BeautifulSoup as bs
+
+try:
+    import deathbycaptcha
+except ImportError:
+    deathbycaptcha = None
+
+# Endpoint the solved reCAPTCHA token is posted to
+SORRY_URL = "https://www.google.com/sorry/index"
+
+
+def parse_params(response):
+    """
+    Parses a page with bs4 to fetch the data needed to solve the captcha.
+
+    :param response: object exposing the CAPTCHA page markup as ``text``
+    :return: dict with the keys ``googlekey``, ``data-s`` and ``q``
+    :raises AttributeError: if the reCAPTCHA element or the hidden input
+        is missing from the page
+    :raises KeyError: if either element lacks an expected attribute
+    """
+    soup = bs(response.text, "html.parser")
+
+    recaptcha = soup.find(id="recaptcha")
+    # first element with type="hidden"; presumably the form's "q" field
+    hidden_q = soup.find(type="hidden")
+    if not recaptcha or not hidden_q:
+        # i could save the page for debugging since this is usually
+        # hard to reproduce
+        raise AttributeError(
+            "Couldn't find the element with the CAPTCHA params. "
+            "Are you sure this page contains Google's reCAPTCHA v2 with callback?"
+        )
+
+    return {
+        "googlekey": recaptcha.attrs["data-sitekey"],
+        "data-s": recaptcha.attrs["data-s"],
+        "q": hidden_q.attrs["value"],
+    }
+
+
+def solve(response, proxies, url):
+    """
+    Get a response with a reCAPTCHA v2 and solve it using a third-party service.
+
+    :param response: the CAPTCHA page; its ``text`` is parsed for the params
+    :param proxies: requests-style proxies mapping (may be empty or None)
+    :param url: URL that was being requested when the CAPTCHA was served
+    :return: True if a solution was obtained and posted to Google, False on
+        any failure (missing client/credentials, parsing, solver, network)
+    """
+    if deathbycaptcha is None:
+        print("WARN: The deathbycaptcha client is not installed")
+        return False
+
+    username = os.environ.get("DBC_USER")
+    password = os.environ.get("DBC_PASS")
+    if not username or not password:
+        # without real credentials the solver call cannot succeed
+        print("WARN: DBC_USER and DBC_PASS must be set to solve CAPTCHAs")
+        return False
+
+    try:
+        params = parse_params(response)
+    except (AttributeError, KeyError) as exc:
+        print("ERROR: Unable to parse the CAPTCHA page. Original exception:", exc)
+        return False
+
+    # "q" belongs to Google's form, not to the solver request
+    q = params.pop("q")
+    params["pageurl"] = url
+    proxy = (proxies or {}).get("https")
+    if proxy:
+        # only describe a proxy to the solver when one is configured
+        params["proxy"] = proxy
+        params["proxytype"] = "HTTP"
+
+    client = deathbycaptcha.HttpClient(username, password)
+    try:
+        token = client.decode(type=4, token_params=json.dumps(params))
+    except Exception as exc:
+        print(
+            "ERROR: Deathbycaptcha was unable to solve the captcha. Original exception:", exc
+        )
+        return False
+
+    # is_correct may arrive as a real boolean or as the string "false"
+    is_correct = token.get("is_correct", False) if token else False
+    if not is_correct or str(is_correct).lower() == "false":
+        print("ERROR: Deathbycaptcha was unable to solve the captcha")
+        return False
+
+    text = token.get("text")
+    if not text:
+        return False
+
+    form_params = {
+        "q": q,
+        "continue": url,
+        "g-recaptcha-response": text,
+    }
+    # NOTE(review): any cookies Google sets on this response are discarded;
+    # confirm the retry in app/request.py benefits without them
+    try:
+        requests.post(
+            SORRY_URL, data=form_params, proxies=proxies, timeout=30
+        )
+    except requests.exceptions.RequestException as exc:
+        print("ERROR: Unable to submit the CAPTCHA solution. Original exception:", exc)
+        return False
+    return True
diff --git a/test/test_captcha.py b/test/test_captcha.py
new file mode 100644
index 0000000..99aa624
--- /dev/null
+++ b/test/test_captcha.py
@@ -0,0 +1,47 @@
+"""
+Test the integration with third-party CAPTCHA solving services
+"""
+
+from pathlib import Path
+from argparse import Namespace
+
+from app.utils import captcha
+
+TEST_FILES = Path(__file__).parent / "test_files"
+
+
+def test_parse():
+    """
+    Test the parsing functionality
+    """
+
+    with open(TEST_FILES / "recaptcha_v2_callback.html") as file:
+        text = file.read()
+    # primitive mock
+    response = Namespace()
+    response.url = "https://www.google.com/search?gbv=1&num=10&q=Liddell&safe=off"
+    response.text = text
+
+    res = captcha.parse_params(response)
+
+    data_s = (
+        "I_wQ5kiIMUbCdcGyC1x6zzK70nD"
+        "G9kViGr7TS6zaiWsIdZXcmQGoaxN"
+        "hiGulX8tD_xNYFXLRkLFSkxDnrkIr"
+        "5o5xSw2Sj1Z-bs5dqP2TyQFGBaTZFY"
+        "sRBy3CoDJruyranhLqWoWb3mdxvgUb"
+        "kpS7ZkRSFYFP_dg9WV4rIQxa6OUmrAt"
+        "S6JKw_UbHN8tJ4mCpz6BKYsGB_fjyD9"
+        "fuRrzmn2RK8FzsOAiLEWBc0z5Qxdltd"
+        "owqO1ugNxQdSaqM39pF73cCAqWqEama"
+        "RRa9iOOVflHptIHjo88"
+    )
+
+    expected = {
+        "googlekey": "6LfwuyUTAAAAAOAmoS0fdqijC2PbbdH4kjq62Y1b",
+        "data-s": data_s,
+        'q': 'EgTIadcWGIXS4pwGIjDL-1ocR_DlZgts3Rfama1w7aWKF_5y2vFWA8eORDe5SvseqGuuMVzIObjhBnZPpgAyAXI'
+    }
+
+    message = "Results differ\n" f"Expected: {expected}\n" f"Got: {res}"
+    assert res == expected, message
diff --git a/test/test_files/recaptcha_v2_callback.html b/test/test_files/recaptcha_v2_callback.html
new file mode 100644
index 0000000..f21508f
--- /dev/null
+++ 
b/test/test_files/recaptcha_v2_callback.html @@ -0,0 +1,28 @@ + + +https://www.google.com/search?gbv=1&num=10&q=Liddell&safe=off + +
+

+
+ + + +
+ + +
+
+ +
+About this page

+ +Our systems have detected unusual traffic from your computer network. This page checks to see if it's really you sending the requests, and not a robot. Why did this happen?

+ +