From c5ec5ca2ee4b41baff9c2a7bc0ac6be11b86bd4f Mon Sep 17 00:00:00 2001 From: Moist-Cat Date: Tue, 13 Dec 2022 09:12:09 -0500 Subject: [PATCH 1/8] Ignore VIM backup files --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 02b4e03..181094f 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,7 @@ dist/ # env whoogle.env + +# vim +*~ +*.swp From 5acac97ae87615706039bd6a74efe7ae65d4ef6a Mon Sep 17 00:00:00 2001 From: Moist-Cat Date: Tue, 13 Dec 2022 13:08:49 -0500 Subject: [PATCH 2/8] Parse response and delegate the work to the client --- app/request.py | 6 ++++++ app/routes.py | 11 ----------- app/utils/captcha.py | 24 ++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 11 deletions(-) create mode 100644 app/utils/captcha.py diff --git a/app/request.py b/app/request.py index 571ba51..e1a9c7d 100644 --- a/app/request.py +++ b/app/request.py @@ -1,5 +1,6 @@ from app.models.config import Config from app.utils.misc import read_config_bool +from app.utils import captcha from datetime import datetime from defusedxml import ElementTree as ET import random @@ -330,6 +331,11 @@ class Request: proxies=self.proxies, headers=headers, cookies=cookies) + if response.status_code == "429": + # google's CAPTCHA + # we have to handle it here because we filter out scripts from the page source + # later + captcha.solve(response.text) # Retry query with new identity if using Tor (max 10 attempts) if 'form id="captcha-form"' in response.text and self.tor: diff --git a/app/routes.py b/app/routes.py index d48a387..43826ae 100644 --- a/app/routes.py +++ b/app/routes.py @@ -318,17 +318,6 @@ def search(): translation = app.config['TRANSLATIONS'][localization_lang] translate_to = localization_lang.replace('lang_', '') - # Return 503 if temporarily blocked by captcha - if has_captcha(str(response)): - return render_template( - 'error.html', - blocked=True, - error_message=translation['ratelimit'], - translation=translation, - farside='https://farside.link', - config=g.user_config, - query=urlparse.unquote(query), - params=g.user_config.to_params(keys=['preferences'])), 503 response = bold_search_terms(response, query) # Feature to display IP address diff --git a/app/utils/captcha.py b/app/utils/captcha.py new file mode 100644 index 0000000..a85fc72 --- /dev/null +++ b/app/utils/captcha.py @@ -0,0 +1,24 @@ +import os + +from bs4 import BeautifulSoup as bs +try: + import deathbycaptcha +except ImportError: + deathbycaptcha = None + +def parse_params(response): + params = { + "googlekey":"", + "data-s": "", + "pageurl": "" + } + soup = bs(response) + +def solve(response): + if deathbycaptcha is None: + raise ImportError("The deathbycaptcha client is not installed") + + client = deathbycaptcha.HttpClient( + os.env.get("DBC_USER", "username"), + os.env.get("DBC_PASS") + ) From b26aeec17323aeca9681506db0b90675708d45e8 Mon Sep 17 00:00:00 2001 From: Moist-Cat Date: Wed, 14 Dec 2022 11:12:20 -0500 Subject: [PATCH 3/8] Tests pass --- app/utils/captcha.py | 49 +++++++++++++++++++--- test/test_captcha.py | 47 +++++++++++++++++++++ test/test_files/recaptcha_v2_callback.html | 34 +++++++++++++++ 3 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 test/test_captcha.py create mode 100644 test/test_files/recaptcha_v2_callback.html diff --git a/app/utils/captcha.py b/app/utils/captcha.py index a85fc72..dbd146c 100644 --- a/app/utils/captcha.py +++ b/app/utils/captcha.py @@ -1,24 +1,63 @@ +""" +Itegration with third party CAPTCHA solving services +""" +# only deathbycaptcha atm but whatever import os from bs4 import BeautifulSoup as bs + try: import deathbycaptcha except ImportError: deathbycaptcha = None + +class UnableToSolve(Exception): + """ + The third-party service was unable to solve the CAPTCHA + """ + + def parse_params(response): + """ + Parses a page with bs4 to fetch the data needed to solve the captcha. + """ params = { - "googlekey":"", + "googlekey": "", "data-s": "", - "pageurl": "" + "pageurl": "", } - soup = bs(response) + soup = bs(response.text) + + recaptcha = soup.find(id="recaptcha") + if not recaptcha: + # i could save the page for debugging since this is usually + # hard to reproduce + raise AttributeError( + "Couldn't find the element with the CAPTCHA params" + "Are you sure this page contains Google's reCAPTCHA v2 with callback?" + ) + params["googlekey"] = recaptcha.attrs["data-sitekey"] + params["data-s"] = recaptcha.attrs["data-s"] + + params["pageurl"] = response.url + + return params + def solve(response): + """ + Get a response with a reCAPTCHA v2 and solve it using a third-party service + """ if deathbycaptcha is None: raise ImportError("The deathbycaptcha client is not installed") client = deathbycaptcha.HttpClient( - os.env.get("DBC_USER", "username"), - os.env.get("DBC_PASS") + os.environ.get("DBC_USER", "username"), os.environ.get("DBC_PASS", "password") ) + + params = parse_params(response) + + token = client.decode(type=4, token_params=params) + if not token or token == "?": + raise UnableToSolve("Deathbycaptcha was unable to solve the captcha") diff --git a/test/test_captcha.py b/test/test_captcha.py new file mode 100644 index 0000000..dd2a111 --- /dev/null +++ b/test/test_captcha.py @@ -0,0 +1,47 @@ +""" +Test the integration with third-party CAPTCHA solving services +""" + +from pathlib import Path +from argparse import Namespace + +from app.utils import captcha + +TEST_FILES = Path(__file__).parent / "test_files" + + +def test_parse(): + """ + Test the parsing functionality + """ + + with open(TEST_FILES / "recaptcha_v2_callback.html") as file: + text = file.read() + # primitive mock + response = Namespace() + response.url = "https://www.google.com/search?gbv=1&num=10&q=Liddell&safe=off" + response.text = text + + res = captcha.parse_params(response) + + data_s = ( + "I_wQ5kiIMUbCdcGyC1x6zzK70nD" + "G9kViGr7TS6zaiWsIdZXcmQGoaxN" + "hiGulX8tD_xNYFXLRkLFSkxDnrkIr" + "5o5xSw2Sj1Z-bs5dqP2TyQFGBaTZFY" + "sRBy3CoDJruyranhLqWoWb3mdxvgUb" + "kpS7ZkRSFYFP_dg9WV4rIQxa6OUmrAt" + "S6JKw_UbHN8tJ4mCpz6BKYsGB_fjyD9" + "fuRrzmn2RK8FzsOAiLEWBc0z5Qxdltd" + "owqO1ugNxQdSaqM39pF73cCAqWqEama" + "RRa9iOOVflHptIHjo88" + ) + + expected = { + "googlekey": "6LfwuyUTAAAAAOAmoS0fdqijC2PbbdH4kjq62Y1b", + "data-s": data_s, + "pageurl": "https://www.google.com/search?gbv=1&num=10&q=Liddell&safe=off", + } + + message = "Results differ\n" f"Expected: {expected}\n" f"Got: {res}" + assert res == expected, message diff --git a/test/test_files/recaptcha_v2_callback.html b/test/test_files/recaptcha_v2_callback.html new file mode 100644 index 0000000..d8d3fb7 --- /dev/null +++ b/test/test_files/recaptcha_v2_callback.html @@ -0,0 +1,34 @@ + + +https://www.google.com/search?gbv=1&num=10&q=Liddell&safe=off + +
+

+
+ + + +
+ + +
+
+ +
+About this page

+ +Our systems have detected unusual traffic from your computer network. This page checks to see if it's really you sending the requests, and not a robot. Why did this happen?

+ + + +IP address: 200.105.215.22
Time: 2022-12-13T16:32:06Z
URL: https://www.google.com/search?gbv=1&num=10&q=Liddell&safe=off
+
+
+ + From 9e8c48a8823646c9b9f30dbfd2efb5bfa19fb8ac Mon Sep 17 00:00:00 2001 From: Moist-Cat Date: Wed, 14 Dec 2022 16:34:17 -0500 Subject: [PATCH 4/8] Finish integration --- app/request.py | 16 +++++++- app/routes.py | 11 ++++++ app/utils/captcha.py | 44 +++++++++++++++++----- test/test_files/recaptcha_v2_callback.html | 6 --- 4 files changed, 60 insertions(+), 17 deletions(-) diff --git a/app/request.py b/app/request.py index e1a9c7d..a0adfdc 100644 --- a/app/request.py +++ b/app/request.py @@ -331,11 +331,23 @@ class Request: proxies=self.proxies, headers=headers, cookies=cookies) - if response.status_code == "429": + if response.status_code == 429: # google's CAPTCHA # we have to handle it here because we filter out scripts from the page source # later - captcha.solve(response.text) + print("WARN: CAPTCHA detected") + solved = captcha.solve(response, self.proxies, url=self.search_url + query) + if solved: + print("INFO: CAPTCHA solved. Retrying...") + response = requests.get( + (base_url or self.search_url) + query, + proxies=self.proxies, + headers=headers, + cookies=cookies + ) + if response.status_code == 429: + print("ERROR: It seems our IP is still blacklisted") + # Retry query with new identity if using Tor (max 10 attempts) if 'form id="captcha-form"' in response.text and self.tor: diff --git a/app/routes.py b/app/routes.py index 43826ae..d48a387 100644 --- a/app/routes.py +++ b/app/routes.py @@ -318,6 +318,17 @@ def search(): translation = app.config['TRANSLATIONS'][localization_lang] translate_to = localization_lang.replace('lang_', '') + # Return 503 if temporarily blocked by captcha + if has_captcha(str(response)): + return render_template( + 'error.html', + blocked=True, + error_message=translation['ratelimit'], + translation=translation, + farside='https://farside.link', + config=g.user_config, + query=urlparse.unquote(query), + params=g.user_config.to_params(keys=['preferences'])), 503 response = bold_search_terms(response, query) # Feature to display IP address diff --git a/app/utils/captcha.py b/app/utils/captcha.py index dbd146c..d4f03ba 100644 --- a/app/utils/captcha.py +++ b/app/utils/captcha.py @@ -3,6 +3,8 @@ Itegration with third party CAPTCHA solving services """ # only deathbycaptcha atm but whatever import os +import json +import requests from bs4 import BeautifulSoup as bs @@ -11,7 +13,6 @@ try: except ImportError: deathbycaptcha = None - class UnableToSolve(Exception): """ The third-party service was unable to solve the CAPTCHA @@ -25,7 +26,6 @@ def parse_params(response): params = { "googlekey": "", "data-s": "", - "pageurl": "", } soup = bs(response.text) @@ -37,27 +37,53 @@ def parse_params(response): "Couldn't find the element with the CAPTCHA params" "Are you sure this page contains Google's reCAPTCHA v2 with callback?" ) + hidden_q = soup.find(type="hidden") + params["q"] = hidden_q.attrs["value"] params["googlekey"] = recaptcha.attrs["data-sitekey"] params["data-s"] = recaptcha.attrs["data-s"] - params["pageurl"] = response.url - return params -def solve(response): +def solve(response, proxies, url): """ Get a response with a reCAPTCHA v2 and solve it using a third-party service """ if deathbycaptcha is None: - raise ImportError("The deathbycaptcha client is not installed") + print("WARN: The deathbycaptcha client is not installed") + return False client = deathbycaptcha.HttpClient( os.environ.get("DBC_USER", "username"), os.environ.get("DBC_PASS", "password") ) params = parse_params(response) + params["pageurl"] = url + params["proxy"] = proxies.get("https", None) + params["proxytype"] = "HTTP" + + q = params.pop("q") + + token = "" + try: + token = client.decode(type=4, token_params=json.dumps(params)) + except Exception as exc: + print( + "ERROR: Deathbycaptcha was unable to solve the captcha. Original exception:", exc + ) + return False - token = client.decode(type=4, token_params=params) - if not token or token == "?": - raise UnableToSolve("Deathbycaptcha was unable to solve the captcha") + if not token or token.get("is_correct", "false") == "false": + print("ERROR: Deathbycaptcha was unable to solve the captcha") + return False + text = token.get("text", None) + if text: + form_params = { + "q": q, + "continue": url, + "g-recaptcha-response": text, + } + response = requests.post("https://www.google.com/sorry/index", data=form_params, proxies=proxies) + print(response, form_params, response.text) + return True + return False diff --git a/test/test_files/recaptcha_v2_callback.html b/test/test_files/recaptcha_v2_callback.html index d8d3fb7..f21508f 100644 --- a/test/test_files/recaptcha_v2_callback.html +++ b/test/test_files/recaptcha_v2_callback.html @@ -26,9 +26,3 @@ Our systems have detected unusual traffic from your computer network. This page - -IP address: 200.105.215.22
Time: 2022-12-13T16:32:06Z
URL: https://www.google.com/search?gbv=1&num=10&q=Liddell&safe=off
- - - - From 851f266bc2215719cd7feb2926d1eb70a7986f95 Mon Sep 17 00:00:00 2001 From: Moist-Cat Date: Tue, 27 Dec 2022 13:30:56 -0500 Subject: [PATCH 5/8] Clean up --- app/utils/captcha.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/app/utils/captcha.py b/app/utils/captcha.py index d4f03ba..3e78ab8 100644 --- a/app/utils/captcha.py +++ b/app/utils/captcha.py @@ -47,7 +47,7 @@ def parse_params(response): def solve(response, proxies, url): """ - Get a response with a reCAPTCHA v2 and solve it using a third-party service + Get a response with a reCAPTCHA v2 and solve it using a third-party service. """ if deathbycaptcha is None: print("WARN: The deathbycaptcha client is not installed") @@ -84,6 +84,5 @@ def solve(response, proxies, url): "g-recaptcha-response": text, } response = requests.post("https://www.google.com/sorry/index", data=form_params, proxies=proxies) - print(response, form_params, response.text) return True return False From 440cb161386b3c46abbdcd524febdf1e363ba21e Mon Sep 17 00:00:00 2001 From: Moist-Cat Date: Tue, 27 Dec 2022 13:50:27 -0500 Subject: [PATCH 6/8] Fix tests --- test/test_captcha.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_captcha.py b/test/test_captcha.py index dd2a111..99aa624 100644 --- a/test/test_captcha.py +++ b/test/test_captcha.py @@ -40,7 +40,7 @@ def test_parse(): expected = { "googlekey": "6LfwuyUTAAAAAOAmoS0fdqijC2PbbdH4kjq62Y1b", "data-s": data_s, - "pageurl": "https://www.google.com/search?gbv=1&num=10&q=Liddell&safe=off", + 'q': 'EgTIadcWGIXS4pwGIjDL-1ocR_DlZgts3Rfama1w7aWKF_5y2vFWA8eORDe5SvseqGuuMVzIObjhBnZPpgAyAXI' } message = "Results differ\n" f"Expected: {expected}\n" f"Got: {res}" From 667dd9cdfeefe68da47c3d526886b25b061a0b35 Mon Sep 17 00:00:00 2001 From: Ben Busby Date: Thu, 29 Dec 2022 15:39:15 -0700 Subject: [PATCH 7/8] Update readme w/ DBC instructions --- README.md | 17 +++++++++++++++++ app/utils/captcha.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5beaca1..ab05461 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ Contents 4. [Using with Firefox Containers](#using-with-firefox-containers) 5. [Reverse Proxying](#reverse-proxying) 1. [Nginx](#nginx) + 6. [CAPTCHA Troubleshooting](#captcha-troubleshooting) 7. [Contributing](#contributing) 8. [FAQ](#faq) 9. [Public Instances](#public-instances) @@ -516,6 +517,22 @@ server { You can then add SSL support using LetsEncrypt by following a guide such as [this one](https://www.nginx.com/blog/using-free-ssltls-certificates-from-lets-encrypt-with-nginx/). +### CAPTCHA Troubleshooting + +**Note:** The maintainer(s) of Whoogle do not endorse or recommend any +particular approach for solving CAPTCHAs. The solution outlined below was +implemented by the community, and is available to anyone who is interested. + +#### Death By Captcha ([https://deathbycaptcha.com/](https://deathbycaptcha.com/)) + +To use Death By Captcha (DBC) with Whoogle, you'll need to perform the +following steps: + +1. [Create an account with DBC](https://deathbycaptcha.com/register) +2. Install the `deathbycaptcha` pip package (`pip install deathbycaptcha`) +3. Set the `DBC_USER` environment variable to your DBC username +4. Set the `DBC_PASS` environment variable to your DBC password + ## Contributing Under the hood, Whoogle is a basic Flask app with the following structure: diff --git a/app/utils/captcha.py b/app/utils/captcha.py index 3e78ab8..d3543d3 100644 --- a/app/utils/captcha.py +++ b/app/utils/captcha.py @@ -27,7 +27,7 @@ def parse_params(response): "googlekey": "", "data-s": "", } - soup = bs(response.text) + soup = bs(response.text, "html.parser") recaptcha = soup.find(id="recaptcha") if not recaptcha: From 8bb94722a6b2959dfc8ab1ffe65b1d121ed907d5 Mon Sep 17 00:00:00 2001 From: Moist-Cat Date: Tue, 3 Jan 2023 13:13:49 -0500 Subject: [PATCH 8/8] Remove useless exception --- app/utils/captcha.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/app/utils/captcha.py b/app/utils/captcha.py index d3543d3..e7893e4 100644 --- a/app/utils/captcha.py +++ b/app/utils/captcha.py @@ -13,12 +13,6 @@ try: except ImportError: deathbycaptcha = None -class UnableToSolve(Exception): - """ - The third-party service was unable to solve the CAPTCHA - """ - - def parse_params(response): """ Parses a page with bs4 to fetch the data needed to solve the captcha.