pull/915/merge
MoistCat 1 month ago committed by GitHub
commit c83eb6a5d8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -41,6 +41,7 @@ Contents
5. [Using with Firefox Containers](#using-with-firefox-containers)
6. [Reverse Proxying](#reverse-proxying)
1. [Nginx](#nginx)
7. [CAPTCHA Troubleshooting](#captcha-troubleshooting)
7. [Contributing](#contributing)
8. [FAQ](#faq)
9. [Public Instances](#public-instances)
@ -603,6 +604,22 @@ server {
You can then add SSL support using LetsEncrypt by following a guide such as [this one](https://www.nginx.com/blog/using-free-ssltls-certificates-from-lets-encrypt-with-nginx/).
### CAPTCHA Troubleshooting
**Note:** The maintainer(s) of Whoogle do not endorse or recommend any
particular approach for solving CAPTCHAs. The solution outlined below was
implemented by the community, and is available to anyone who is interested.
#### Death By Captcha ([https://deathbycaptcha.com/](https://deathbycaptcha.com/))
To use Death By Captcha (DBC) with Whoogle, you'll need to perform the
following steps:
1. [Create an account with DBC](https://deathbycaptcha.com/register)
2. Install the `deathbycaptcha` pip package (`pip install deathbycaptcha`)
3. Set the `DBC_USER` environment variable to your DBC username
4. Set the `DBC_PASS` environment variable to your DBC password
## Contributing
Under the hood, Whoogle is a basic Flask app with the following structure:

@ -1,5 +1,6 @@
from app.models.config import Config
from app.utils.misc import read_config_bool
from app.utils import captcha
from datetime import datetime
from defusedxml import ElementTree as ET
import random
@ -341,6 +342,23 @@ class Request:
proxies=self.proxies,
headers=headers,
cookies=cookies)
if response.status_code == 429:
# google's CAPTCHA
# we have to handle it here because we filter out scripts from the page source
# later
print("WARN: CAPTCHA detected")
solved = captcha.solve(response, self.proxies, url=self.search_url + query)
if solved:
print("INFO: CAPTCHA solved. Retrying...")
response = requests.get(
(base_url or self.search_url) + query,
proxies=self.proxies,
headers=headers,
cookies=cookies
)
if response.status_code == 429:
print("ERROR: It seems our IP is still blacklisted")
# Retry query with new identity if using Tor (max 10 attempts)
if 'form id="captcha-form"' in response.text and self.tor:

@ -0,0 +1,82 @@
"""
Integration with third-party CAPTCHA solving services
"""
# Only Death By Captcha is supported at the moment.
import os
import json
import requests
from bs4 import BeautifulSoup as bs
try:
import deathbycaptcha
except ImportError:
deathbycaptcha = None
def parse_params(response):
    """
    Parses a page with bs4 to fetch the data needed to solve the captcha.

    Args:
        response: Object exposing the CAPTCHA page's HTML as ``response.text``.

    Returns:
        dict: ``"googlekey"`` (the ``data-sitekey`` attribute of the
        ``#recaptcha`` element), ``"data-s"`` (its ``data-s`` attribute) and
        ``"q"`` (the value of the hidden form input).

    Raises:
        AttributeError: If the ``#recaptcha`` element or the hidden input
            cannot be found in the page.
        KeyError: If one of the expected attributes is missing from those
            elements.
    """
    soup = bs(response.text, "html.parser")

    recaptcha = soup.find(id="recaptcha")
    if recaptcha is None:
        # i could save the page for debugging since this is usually
        # hard to reproduce
        raise AttributeError(
            "Couldn't find the element with the CAPTCHA params. "
            "Are you sure this page contains Google's reCAPTCHA v2 with callback?"
        )

    # Prefer the hidden input explicitly named "q"; fall back to the first
    # hidden element so pages that worked before keep working.
    hidden_q = soup.find("input", attrs={"type": "hidden", "name": "q"})
    if hidden_q is None:
        hidden_q = soup.find(type="hidden")
    if hidden_q is None:
        raise AttributeError(
            "Couldn't find the hidden 'q' input in the CAPTCHA form"
        )

    return {
        "googlekey": recaptcha.attrs["data-sitekey"],
        "data-s": recaptcha.attrs["data-s"],
        "q": hidden_q.attrs["value"],
    }
def solve(response, proxies, url):
    """
    Get a response with a reCAPTCHA v2 and solve it using a third-party service.

    Args:
        response: The CAPTCHA page response; its ``text`` is parsed with
            ``parse_params``.
        proxies: Mapping of proxy URLs as passed to ``requests`` (may be
            empty or None). The ``"https"`` entry, if any, is forwarded to
            the solving service.
        url: The URL that triggered the CAPTCHA; sent to the solver as
            ``pageurl`` and to Google as the ``continue`` form field.

    Returns:
        bool: True if a token was obtained and submitted to Google's
        ``/sorry/index`` endpoint, False on any failure. Failures are
        reported with ``print`` instead of being raised, so the caller can
        carry on with its normal flow.
    """
    if deathbycaptcha is None:
        print("WARN: The deathbycaptcha client is not installed")
        return False

    client = deathbycaptcha.HttpClient(
        os.environ.get("DBC_USER", "username"), os.environ.get("DBC_PASS", "password")
    )

    # A page without the expected reCAPTCHA markup must not crash the
    # request that triggered the CAPTCHA handling.
    try:
        params = parse_params(response)
    except (AttributeError, KeyError) as exc:
        print("ERROR: Unable to parse the CAPTCHA page. Original exception:", exc)
        return False

    q = params.pop("q")
    params["pageurl"] = url

    # Only describe a proxy to the solver when one is actually configured.
    proxy = (proxies or {}).get("https")
    if proxy:
        params["proxy"] = proxy
        params["proxytype"] = "HTTP"

    try:
        # NOTE(review): type=4 is presumably DBC's token-based reCAPTCHA v2
        # API -- confirm against the DBC documentation.
        token = client.decode(type=4, token_params=json.dumps(params))
    except Exception as exc:
        print(
            "ERROR: Deathbycaptcha was unable to solve the captcha. Original exception:", exc
        )
        return False

    # The correctness flag may be a real boolean or the string "false";
    # a missing flag is treated as a failure.
    is_correct = token.get("is_correct", False) if isinstance(token, dict) else False
    if not is_correct or str(is_correct).lower() == "false":
        print("ERROR: Deathbycaptcha was unable to solve the captcha")
        return False

    text = token.get("text", None)
    if not text:
        return False

    form_params = {
        "q": q,
        "continue": url,
        "g-recaptcha-response": text,
    }
    try:
        requests.post(
            "https://www.google.com/sorry/index", data=form_params, proxies=proxies
        )
    except requests.RequestException as exc:
        print("ERROR: Unable to submit the solved CAPTCHA. Original exception:", exc)
        return False
    return True

@ -0,0 +1,47 @@
"""
Test the integration with third-party CAPTCHA solving services
"""
from pathlib import Path
from argparse import Namespace
from app.utils import captcha
TEST_FILES = Path(__file__).parent / "test_files"
def test_parse():
    """
    Check that ``captcha.parse_params`` extracts the site key, the ``data-s``
    value and the hidden ``q`` value from a saved reCAPTCHA v2 page.
    """
    fixture = TEST_FILES / "recaptcha_v2_callback.html"
    with fixture.open() as handle:
        html = handle.read()

    # primitive mock: only the attributes set here exist on the object
    fake_response = Namespace(
        url="https://www.google.com/search?gbv=1&num=10&q=Liddell&safe=off",
        text=html,
    )

    parsed = captcha.parse_params(fake_response)

    data_s = "".join(
        (
            "I_wQ5kiIMUbCdcGyC1x6zzK70nD",
            "G9kViGr7TS6zaiWsIdZXcmQGoaxN",
            "hiGulX8tD_xNYFXLRkLFSkxDnrkIr",
            "5o5xSw2Sj1Z-bs5dqP2TyQFGBaTZFY",
            "sRBy3CoDJruyranhLqWoWb3mdxvgUb",
            "kpS7ZkRSFYFP_dg9WV4rIQxa6OUmrAt",
            "S6JKw_UbHN8tJ4mCpz6BKYsGB_fjyD9",
            "fuRrzmn2RK8FzsOAiLEWBc0z5Qxdltd",
            "owqO1ugNxQdSaqM39pF73cCAqWqEama",
            "RRa9iOOVflHptIHjo88",
        )
    )
    expected = {
        "googlekey": "6LfwuyUTAAAAAOAmoS0fdqijC2PbbdH4kjq62Y1b",
        "data-s": data_s,
        'q': 'EgTIadcWGIXS4pwGIjDL-1ocR_DlZgts3Rfama1w7aWKF_5y2vFWA8eORDe5SvseqGuuMVzIObjhBnZPpgAyAXI'
    }

    message = "\n".join(
        ("Results differ", f"Expected: {expected}", f"Got: {parsed}")
    )
    assert parsed == expected, message

@ -0,0 +1,28 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head><meta http-equiv="content-type" content="text/html; charset=utf-8"><meta name="viewport" content="initial-scale=1"><title>https://www.google.com/search?gbv=1&amp;num=10&amp;q=Liddell&amp;safe=off</title></head>
<body style="font-family: arial, sans-serif; background-color: #fff; color: #000; padding:20px; font-size:18px;" onload="e=document.getElementById('captcha');if(e){e.focus();} if(solveSimpleChallenge) {solveSimpleChallenge(,);}">
<div style="max-width:400px;">
<hr noshade size="1" style="color:#ccc; background-color:#ccc;"><br>
<form id="captcha-form" action="index" method="post">
<noscript>
<div style="font-size:13px;">
In order to continue, please enable javascript on your web browser.
</div>
</noscript>
<script src="https://www.google.com/recaptcha/api.js" async defer></script>
<script>var submitCallback = function(response) {document.getElementById('captcha-form').submit();};</script>
<div id="recaptcha" class="g-recaptcha" data-sitekey="6LfwuyUTAAAAAOAmoS0fdqijC2PbbdH4kjq62Y1b" data-callback="submitCallback" data-s="I_wQ5kiIMUbCdcGyC1x6zzK70nDG9kViGr7TS6zaiWsIdZXcmQGoaxNhiGulX8tD_xNYFXLRkLFSkxDnrkIr5o5xSw2Sj1Z-bs5dqP2TyQFGBaTZFYsRBy3CoDJruyranhLqWoWb3mdxvgUbkpS7ZkRSFYFP_dg9WV4rIQxa6OUmrAtS6JKw_UbHN8tJ4mCpz6BKYsGB_fjyD9fuRrzmn2RK8FzsOAiLEWBc0z5QxdltdowqO1ugNxQdSaqM39pF73cCAqWqEamaRRa9iOOVflHptIHjo88"></div>
<input type='hidden' name='q' value='EgTIadcWGIXS4pwGIjDL-1ocR_DlZgts3Rfama1w7aWKF_5y2vFWA8eORDe5SvseqGuuMVzIObjhBnZPpgAyAXI'><input type="hidden" name="continue" value="https://www.google.com/search?gbv=1&amp;num=10&amp;q=Liddell&amp;safe=off">
</form>
<hr noshade size="1" style="color:#ccc; background-color:#ccc;">
<div style="font-size:13px;">
<b>About this page</b><br><br>
Our systems have detected unusual traffic from your computer network. This page checks to see if it&#39;s really you sending the requests, and not a robot. <a href="#" onclick="document.getElementById('infoDiv').style.display='block';">Why did this happen?</a><br><br>
<div id="infoDiv" style="display:none; background-color:#eee; padding:10px; margin:0 0 15px 0; line-height:1.4em;">
This page appears when Google automatically detects requests coming from your computer network which appear to be in violation of the <a href="//www.google.com/policies/terms/">Terms of Service</a>. The block will expire shortly after those requests stop. In the meantime, solving the above CAPTCHA will let you continue to use our services.<br><br>This traffic may have been sent by malicious software, a browser plug-in, or a script that sends automated requests. If you share your network connection, ask your administrator for help &mdash; a different computer using the same IP address may be responsible. <a href="//support.google.com/websearch/answer/86640">Learn more</a><br><br>Sometimes you may be asked to solve the CAPTCHA if you are using advanced terms that robots are known to use, or sending requests very quickly.
</div>
Loading…
Cancel
Save