diff --git a/app/filter.py b/app/filter.py
index 66e9c6e..d03a112 100644
--- a/app/filter.py
+++ b/app/filter.py
@@ -1,6 +1,7 @@
 from app.request import VALID_PARAMS
 from app.utils.results import *
-from bs4.element import ResultSet
+from bs4 import BeautifulSoup
+from bs4.element import ResultSet, Tag
 from cryptography.fernet import Fernet
 import re
 import urllib.parse as urlparse
@@ -8,7 +9,7 @@
 from urllib.parse import parse_qs
 
 
 class Filter:
-    def __init__(self, user_keys: dict, mobile=False, config=None):
+    def __init__(self, user_keys: dict, mobile=False, config=None) -> None:
         if config is None:
             config = {}
@@ -29,7 +30,7 @@ class Filter:
     def elements(self):
         return self._elements
 
-    def reskin(self, page):
+    def reskin(self, page: str) -> str:
         # Aesthetic only re-skinning
         if self.dark:
             page = page.replace(
@@ -39,22 +40,22 @@ class Filter:
 
         return page
 
-    def encrypt_path(self, msg, is_element=False):
+    def encrypt_path(self, path, is_element=False) -> str:
         # Encrypts path to avoid plaintext results in logs
         if is_element:
             # Element paths are encrypted separately from text, to allow key
             # regeneration once all items have been served to the user
             enc_path = Fernet(
                 self.user_keys['element_key']
-            ).encrypt(msg.encode()).decode()
+            ).encrypt(path.encode()).decode()
             self._elements += 1
             return enc_path
 
         return Fernet(
             self.user_keys['text_key']
-        ).encrypt(msg.encode()).decode()
+        ).encrypt(path.encode()).decode()
 
-    def clean(self, soup):
+    def clean(self, soup) -> BeautifulSoup:
         self.main_divs = soup.find('div', {'id': 'main'})
         self.remove_ads()
         self.fix_question_section()
@@ -90,7 +91,12 @@
 
         return soup
 
-    def remove_ads(self):
+    def remove_ads(self) -> None:
+        """Removes ads found in the list of search result divs
+
+        Returns:
+            None (The soup object is modified directly)
+        """
         if not self.main_divs:
             return
 
@@ -99,7 +105,16 @@
                        if has_ad_content(_.text)]
             _ = div.decompose() if len(div_ads) else None
 
-    def fix_question_section(self):
+    def fix_question_section(self) -> None:
+        """Collapses the "People Also Asked" section into a "details" element
+
+        These sections are typically the only sections in the results page that
+        are structured as <div><h2>Title</h2><div>...</div></div>, so they are
+        extracted by checking all result divs for h2 children.
+
+        Returns:
+            None (The soup object is modified directly)
+        """
         if not self.main_divs:
             return
 
@@ -126,7 +141,14 @@
         for question in questions:
             question['style'] = 'padding: 10px; font-style: italic;'
 
-    def update_element_src(self, element, mime):
+    def update_element_src(self, element: Tag, mime: str) -> None:
+        """Encrypts the original src of an element and rewrites the element src
+        to use the "/element?src=" pass-through.
+
+        Returns:
+            None (The soup element is modified directly)
+
+        """
         src = element['src']
 
         if src.startswith('//'):
@@ -145,7 +167,8 @@
                 src,
                 is_element=True) + '&type=' + urlparse.quote(mime)
 
-    def update_styling(self, soup):
+    def update_styling(self, soup) -> None:
+        """Updates page styling (buttons, fonts, result widths, etc)"""
         # Remove unnecessary button(s)
         for button in soup.find_all('button'):
             button.decompose()
@@ -168,7 +191,17 @@
         except AttributeError:
             pass
 
-    def update_link(self, link):
+    def update_link(self, link: Tag) -> None:
+        """Update internal link paths with encrypted path, otherwise remove
+        unnecessary redirects and/or marketing params from the url
+
+        Args:
+            link: A bs4 Tag element to inspect and update
+
+        Returns:
+            None (the tag is updated directly)
+
+        """
         # Replace href with only the intended destination (no "utm" type tags)
         href = link['href'].replace('https://www.google.com', '')
         if 'advanced_search' in href or 'tbm=shop' in href:
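A quick illustration of the two-key scheme that `encrypt_path` implements (and that the `msg` → `path` rename clarifies). This is a standalone sketch with freshly generated keys, not the app's actual per-session `user_keys`:

```python
from cryptography.fernet import Fernet

# Hypothetical stand-in for the per-session user_keys dict
user_keys = {
    'text_key': Fernet.generate_key(),
    'element_key': Fernet.generate_key(),
}

# Element paths use a separate key so it can be regenerated once all
# elements have been served, without invalidating encrypted text paths
enc_element = Fernet(user_keys['element_key']).encrypt(b'/images/logo.png')
enc_text = Fernet(user_keys['text_key']).encrypt(b'/url?q=example')

# Decrypting with the matching key recovers the original path
assert Fernet(user_keys['element_key']).decrypt(enc_element) == b'/images/logo.png'
```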
diff --git a/app/request.py b/app/request.py
index 71eeb45..fadcc18 100644
--- a/app/request.py
+++ b/app/request.py
@@ -29,10 +29,10 @@ class TorError(Exception):
         altogether).
     """
 
-    def __init__(self, message, disable=False):
+    def __init__(self, message, disable=False) -> None:
         self.message = message
         self.disable = disable
-        super().__init__(self.message)
+        super().__init__(message)
 
 
 def send_tor_signal(signal: Signal) -> bool:
@@ -64,7 +64,7 @@ def gen_query(query, args, config, near_city=None) -> str:
 
     # Use :past(hour/day/week/month/year) if available
    # example search "new restaurants :past month"
-    sub_lang = ''
+    lang = ''
     if ':past' in query and 'tbs' not in args:
         time_range = str.strip(query.split(':past', 1)[-1])
         param_dict['tbs'] = '&tbs=' + ('qdr:' + str.lower(time_range[0]))
@@ -79,9 +79,10 @@
         # Example:
         # &tbs=qdr:h,lr:lang_1pl
         # -- the lr param needs to be extracted and remove the leading '1'
-        sub_lang = [_ for _ in result_tbs.split(',') if 'lr:' in _]
-        sub_lang = sub_lang[0][sub_lang[0].find('lr:') +
-                               3:len(sub_lang[0])] if len(sub_lang) > 0 else ''
+        result_params = [_ for _ in result_tbs.split(',') if 'lr:' in _]
+        if len(result_params) > 0:
+            result_param = result_params[0]
+            lang = result_param[result_param.find('lr:') + 3:len(result_param)]
 
     # Ensure search query is parsable
     query = urlparse.quote(query)
@@ -103,8 +104,8 @@
     if 'source' in args:
         param_dict['source'] = '&source=' + args.get('source')
         param_dict['lr'] = ('&lr=' + ''.join(
-            [_ for _ in sub_lang if not _.isdigit()]
-        )) if sub_lang else ''
+            [_ for _ in lang if not _.isdigit()]
+        )) if lang else ''
     else:
         param_dict['lr'] = (
             '&lr=' + config.lang_search
@@ -150,12 +151,12 @@
         # Set up proxy, if previously configured
         if os.environ.get('WHOOGLE_PROXY_LOC'):
             auth_str = ''
-            if os.environ.get('WHOOGLE_PROXY_USER'):
-                auth_str = os.environ.get('WHOOGLE_PROXY_USER') + \
-                    ':' + os.environ.get('WHOOGLE_PROXY_PASS')
+            if os.environ.get('WHOOGLE_PROXY_USER', ''):
+                auth_str = os.environ.get('WHOOGLE_PROXY_USER', '') + \
+                    ':' + os.environ.get('WHOOGLE_PROXY_PASS', '')
             self.proxies = {
-                'http': os.environ.get('WHOOGLE_PROXY_TYPE') + '://' +
-                auth_str + '@' + os.environ.get('WHOOGLE_PROXY_LOC'),
+                'http': os.environ.get('WHOOGLE_PROXY_TYPE', '') + '://' +
+                auth_str + '@' + os.environ.get('WHOOGLE_PROXY_LOC', ''),
             }
             self.proxies['https'] = self.proxies['http'].replace(
                 'http', 'https')
diff --git a/app/routes.py b/app/routes.py
index 006be07..35f1066 100644
--- a/app/routes.py
+++ b/app/routes.py
@@ -347,7 +347,7 @@ def window():
     return render_template('display.html', response=results)
 
 
-def run_app():
+def run_app() -> None:
     parser = argparse.ArgumentParser(
         description='Whoogle Search console runner')
     parser.add_argument(
diff --git a/app/utils/results.py b/app/utils/results.py
index 58c450f..2a9e60e 100644
--- a/app/utils/results.py
+++ b/app/utils/results.py
@@ -57,6 +57,7 @@ def get_first_link(soup: BeautifulSoup) -> str:
         # Return the first search result URL
         if 'url?q=' in a['href']:
             return filter_link_args(a['href'])
+    return ''
 
 
 def get_site_alt(link: str) -> str:
diff --git a/app/utils/search.py b/app/utils/search.py
index ee75f3f..9694d14 100644
--- a/app/utils/search.py
+++ b/app/utils/search.py
@@ -24,15 +24,24 @@ def needs_https(url: str) -> bool:
         bool: True/False representing the need to upgrade
 
     """
-    https_only = os.getenv('HTTPS_ONLY', False)
+    https_only = bool(os.getenv('HTTPS_ONLY', 0))
     is_heroku = url.endswith('.herokuapp.com')
     is_http = url.startswith('http://')
 
     return (is_heroku and is_http) or (https_only and is_http)
 
 
-def has_captcha(site_contents: str) -> bool:
-    return CAPTCHA in site_contents
+def has_captcha(results: str) -> bool:
+    """Checks to see if the search results are blocked by a captcha
+
+    Args:
+        results: The search page html as a string
+
+    Returns:
+        bool: True/False indicating if a captcha element was found
+
+    """
+    return CAPTCHA in results
 
 
 class Search:
@@ -118,23 +127,23 @@ class Search:
         """
         mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
 
-        content_filter = Filter(
-            self.session['fernet_keys'],
-            mobile=mobile,
-            config=self.config)
-        full_query = gen_query(
-            self.query,
-            self.request_params,
-            self.config,
-            content_filter.near)
+        content_filter = Filter(self.session['fernet_keys'],
+                                mobile=mobile,
+                                config=self.config)
+        full_query = gen_query(self.query,
+                               self.request_params,
+                               self.config,
+                               content_filter.near)
         get_body = g.user_request.send(query=full_query)
 
         # Produce cleanable html soup from response
         html_soup = bsoup(content_filter.reskin(get_body.text), 'html.parser')
-        html_soup.insert(
-            0,
-            bsoup(TOR_BANNER, 'html.parser')
-            if g.user_request.tor_valid else bsoup('', 'html.parser'))
+
+        # Indicate whether or not a Tor connection is active
+        tor_banner = bsoup('', 'html.parser')
+        if g.user_request.tor_valid:
+            tor_banner = bsoup(TOR_BANNER, 'html.parser')
+        html_soup.insert(0, tor_banner)
 
         if self.feeling_lucky:
             return get_first_link(html_soup), 0
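For review, the refactored `lr:` extraction in `gen_query` can be exercised in isolation. The `result_tbs` value below is a made-up example matching the comment in the diff, not captured app output:

```python
# Example tbs value from a result page; the lr param carries the language
# code with a leading '1' that needs to be dropped
result_tbs = 'qdr:h,lr:lang_1pl'

lang = ''
result_params = [_ for _ in result_tbs.split(',') if 'lr:' in _]
if len(result_params) > 0:
    result_param = result_params[0]
    lang = result_param[result_param.find('lr:') + 3:len(result_param)]

# Digits are stripped when building the final &lr= param
print('&lr=' + ''.join([_ for _ in lang if not _.isdigit()]))  # &lr=lang_pl
```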
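One behavior worth noting in the `needs_https` change: `bool()` over an environment string is truthy for any non-empty value, so `HTTPS_ONLY=0` still counts as enabled. A minimal demonstration:

```python
import os

# Any non-empty string is truthy, so HTTPS_ONLY=0 still enables the upgrade
os.environ['HTTPS_ONLY'] = '0'
print(bool(os.getenv('HTTPS_ONLY', 0)))  # True

# Only an unset (or empty) variable evaluates to False
del os.environ['HTTPS_ONLY']
print(bool(os.getenv('HTTPS_ONLY', 0)))  # False
```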
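The Tor banner rewrite trades a nested conditional expression for a plain if block. A minimal reproduction with bs4, where the banner markup and `tor_valid` flag are placeholders standing in for the app's `TOR_BANNER` template and `g.user_request.tor_valid`:

```python
from bs4 import BeautifulSoup as bsoup

# Placeholders for TOR_BANNER and g.user_request.tor_valid
TOR_BANNER = '<hr><p>Tor enabled</p><hr>'
tor_valid = True

html_soup = bsoup('<div id="main">results</div>', 'html.parser')

# Indicate whether or not a Tor connection is active
tor_banner = bsoup('', 'html.parser')
if tor_valid:
    tor_banner = bsoup(TOR_BANNER, 'html.parser')
html_soup.insert(0, tor_banner)

print(html_soup)  # the banner markup precedes the results div
```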