2022-06-09 22:30:55 +00:00
|
|
|
|
import cssutils
|
2021-03-24 19:13:52 +00:00
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
from bs4.element import ResultSet, Tag
|
2020-04-29 00:19:34 +00:00
|
|
|
|
from cryptography.fernet import Fernet
|
2021-04-05 14:37:39 +00:00
|
|
|
|
from flask import render_template
|
2022-06-24 16:51:15 +00:00
|
|
|
|
import urllib.parse as urlparse
|
|
|
|
|
from urllib.parse import parse_qs
|
|
|
|
|
import re
|
2022-06-09 22:30:55 +00:00
|
|
|
|
|
|
|
|
|
from app.models.g_classes import GClasses
|
|
|
|
|
from app.request import VALID_PARAMS, MAPS_URL
|
|
|
|
|
from app.utils.misc import get_abs_url, read_config_bool
|
2022-06-24 16:51:15 +00:00
|
|
|
|
from app.utils.results import (
|
|
|
|
|
BLANK_B64, GOOG_IMG, GOOG_STATIC, G_M_LOGO_URL, LOGO_URL, SITE_ALTS,
|
|
|
|
|
has_ad_content, filter_link_args, append_anon_view, get_site_alt,
|
|
|
|
|
)
|
|
|
|
|
from app.models.endpoint import Endpoint
|
|
|
|
|
from app.models.config import Config
|
|
|
|
|
|
2020-04-10 20:52:27 +00:00
|
|
|
|
|
2022-06-27 18:33:08 +00:00
|
|
|
|
# Query argument names that build_map_url() looks for, in order of
# preference, when extracting a location from a maps link
MAPS_ARGS = [
    'q',
    'daddr',
]

# Section titles that collapse_sections() removes entirely when minimal
# mode is enabled
minimal_mode_sections = [
    'Top stories',
    'Images',
    'People also ask',
]

# Substrings that mark a link as pointing to a Google page that is not
# supported (checked by Filter.update_link)
unsupported_g_pages = [
    'support.google.com', 'accounts.google.com', 'policies.google.com',
    'google.com/preferences', 'google.com/intl',
    'advanced_search', 'tbm=shop',
]
|
2022-02-02 19:57:05 +00:00
|
|
|
|
|
2020-06-02 18:54:47 +00:00
|
|
|
|
|
2021-05-29 16:21:20 +00:00
|
|
|
|
def extract_q(q_str: str, href: str) -> str:
    """Extracts the 'q' element from a result link. This is typically
    either the link to a result's website, or a string.

    Args:
        q_str: The query string of the result link to parse
        href: The full url to check for standalone 'q' elements first,
              rather than parsing the whole query string and then checking.

    Returns:
        str: The 'q' element of the link, or an empty string if the link
             has no 'q' element or the element is blank
    """
    if '&q=' not in href and '?q=' not in href:
        return ''

    # parse_qs() drops blank values by default, so a link such as
    # "/url?q=&sa=U" produces no 'q' key at all. Fall back to an empty
    # string rather than raising a KeyError.
    return parse_qs(q_str).get('q', [''])[0]
|
|
|
|
|
|
|
|
|
|
|
2022-06-27 18:33:08 +00:00
|
|
|
|
def build_map_url(href: str) -> str:
    """Tries to extract known args that explain the location in the url. If a
    location is found, returns the default url with it. Otherwise, returns the
    url unchanged.

    Args:
        href: The full url to check.

    Returns:
        str: The default maps url with the (url encoded) location as its 'q'
             argument, or the url unchanged if no location was found.
    """
    # Only the query component holds the arguments. Handing the whole url to
    # parse_qs() folds everything before the first '=' (scheme, host, path
    # and '?') into the name of the first argument, so that argument could
    # never match one of the known names. Strings without a query component
    # are still parsed as a raw query string, as before.
    query = urlparse.urlparse(href).query or href
    parsed_args = parse_qs(query)

    # iterate through the known parameters and try to build the url
    for param in MAPS_ARGS:
        if param in parsed_args:
            # parse_qs() returns the decoded value, so it has to be encoded
            # again before it is placed back into a query string
            location = urlparse.quote_plus(parsed_args[param][0])
            return MAPS_URL + '?q=' + location

    # query could not be extracted, returning unchanged url
    return href
|
|
|
|
|
|
|
|
|
|
|
2021-06-04 15:09:30 +00:00
|
|
|
|
def clean_query(query: str) -> str:
    """Drops the blocked site list from a query, if the query has one.

    Args:
        query: The query string

    Returns:
        str: Everything that precedes the first "-site:" filter, or the
             whole query string if it contains no such filter
    """
    # partition() returns the full string as the head when the separator
    # is missing, which covers the "no filter" case as well
    head, _separator, _filters = query.partition('-site:')
    return head
|
|
|
|
|
|
|
|
|
|
|
2022-04-13 17:29:07 +00:00
|
|
|
|
def clean_css(css: str, page_url: str) -> str:
    """Rewrites the remote URLs in a CSS string so that they are requested
    through the element endpoint. "data:" URLs are left untouched.

    Args:
        css: The CSS string
        page_url: The url of the page the CSS belongs to, passed to
                  get_abs_url() together with each URL found in the CSS

    Returns:
        str: The filtered CSS, with URLs proxied through Whoogle
    """
    sheet = cssutils.parseString(css)

    # Map every distinct URL to its proxied form. A URL that is reported
    # more than once must only be handled once, and an empty URL can't be
    # searched for at all.
    proxied = {}
    for url in cssutils.getUrls(sheet):
        if not url or url in proxied:
            continue

        abs_url = get_abs_url(url, page_url)
        if abs_url.startswith('data:'):
            continue

        proxied[url] = f'{Endpoint.element}?type=image/png&url={abs_url}'

    if not proxied:
        return css

    # Substitute in a single pass so that text produced by one replacement
    # is never rewritten again by another (the proxied form contains the
    # original URL). Longer URLs are tried first so that a URL which is a
    # prefix of another one does not shadow it.
    pattern = re.compile('|'.join(
        re.escape(url) for url in sorted(proxied, key=len, reverse=True)))
    return pattern.sub(lambda match: proxied[match.group(0)], css)
|
|
|
|
|
|
|
|
|
|
|
2020-04-15 23:41:53 +00:00
|
|
|
|
class Filter:
    """Rewrites a parsed results page before it is returned to the user.

    The main entry point is clean(), which removes ads and blocked results,
    collapses long sections, routes element sources through the element
    endpoint and rewrites result links. view_image() builds a separate page
    for image results.
    """

    # Limit used for determining if a result is a "regular" result or a list
    # type result (such as "people also asked", "related searches", etc)
    RESULT_CHILD_LIMIT = 7

    def __init__(
            self,
            user_key: str,
            config: Config,
            root_url='',
            page_url='',
            query='',
            mobile=False) -> None:
        """Stores the settings used while filtering a results page.

        Args:
            user_key: The key handed to Fernet when encrypting paths
            config: The config whose options (block lists, alts, new_tab,
                    anon_view, ...) control the filtering
            root_url: Prefix for proxied element urls; one trailing slash
                      is stripped
            page_url: The url of the page being filtered, used when
                      rewriting URLs found in <style> tags
            query: The search query, consulted by update_link()
            mobile: Whether mobile specific styling should be applied
        """
        self.soup = None
        self.config = config
        self.mobile = mobile
        self.user_key = user_key
        self.page_url = page_url
        self.query = query
        self.main_divs = ResultSet('')
        self._elements = 0
        # netlocs that already received an anonymous view link
        self._av = set()

        self.root_url = root_url[:-1] if root_url.endswith('/') else root_url

    def __getitem__(self, name):
        """Allows attributes to be read with subscript syntax."""
        return getattr(self, name)

    @property
    def elements(self):
        """The number of element paths encrypted so far."""
        return self._elements

    def encrypt_path(self, path, is_element=False) -> str:
        """Encrypts a path to avoid plaintext results in logs.

        Args:
            path: The text to encrypt
            is_element: Whether the path belongs to a page element, in
                        which case it is added to the element count

        Returns:
            str: The encrypted path
        """
        enc_path = Fernet(self.user_key).encrypt(path.encode()).decode()
        if is_element:
            # Element paths are tracked separately from text, to allow key
            # regeneration once all items have been served to the user
            self._elements += 1

        return enc_path

    def clean(self, soup) -> BeautifulSoup:
        """Runs every filtering step on a results page.

        Args:
            soup: The parsed results page (modified directly)

        Returns:
            BeautifulSoup: The same soup object, after filtering
        """
        self.soup = soup
        self.main_divs = self.soup.find('div', {'id': 'main'})
        self.remove_ads()
        self.remove_block_titles()
        self.remove_block_url()
        self.collapse_sections()
        self.update_css()
        self.update_styling()
        self.remove_block_tabs()

        for img in self.soup.find_all('img'):
            if 'src' in img.attrs:
                self.update_element_src(img, 'image/png')

        for audio in self.soup.find_all('audio'):
            if 'src' in audio.attrs:
                self.update_element_src(audio, 'audio/mpeg')

        for link in self.soup.find_all('a', href=True):
            self.update_link(link)

        if self.config.alts:
            self.site_alt_swap()

        input_form = self.soup.find('form')
        if input_form is not None:
            input_form['method'] = 'GET' if self.config.get_only else 'POST'
            # Use a relative URI for submissions
            input_form['action'] = 'search'

        # Ensure no extra scripts passed through
        for script in self.soup('script'):
            script.decompose()

        # Update default footer and header
        footer = self.soup.find('footer')
        if footer:
            # Remove divs that have multiple links beyond just page navigation
            for div in footer.find_all('div', recursive=False):
                if len(div.find_all('a', href=True)) > 3:
                    div.decompose()

            for link in footer.find_all('a', href=True):
                link['href'] = (
                    f'{link["href"]}&preferences={self.config.preferences}')

        header = self.soup.find('header')
        if header:
            header.decompose()

        self.remove_site_blocks(self.soup)
        return self.soup

    def remove_site_blocks(self, soup) -> None:
        """Strips the "-site:..." filters of the configured block list from
        any text of the page body that contains them.

        Args:
            soup: The parsed results page (modified directly)
        """
        if not self.config.block or not soup.body:
            return

        search_string = ' '.join(
            '-site:' + site for site in self.config.block.split(','))

        # The block list is plain text, not a pattern: escape it so that
        # characters such as '.', '+' or '(' are matched literally and can't
        # produce an invalid expression
        selected = soup.body.find_all(
            text=re.compile(re.escape(search_string)))

        for result in selected:
            result.string.replace_with(result.string.replace(
                search_string, ''))

    def remove_ads(self) -> None:
        """Removes ads found in the list of search result divs

        Returns:
            None (The soup object is modified directly)
        """
        if not self.main_divs:
            return

        for div in self.main_divs.find_all('div', recursive=True):
            spans = div.find_all('span', recursive=True)
            if any(has_ad_content(span.text) for span in spans):
                div.decompose()

    def remove_block_titles(self) -> None:
        """Removes every div that contains an h3 whose text matches the
        "block_title" pattern of the config.

        Returns:
            None (The soup object is modified directly)
        """
        if not self.main_divs or not self.config.block_title:
            return

        block_title = re.compile(self.config.block_title)
        for div in self.main_divs.find_all('div', recursive=True):
            titles = div.find_all('h3', recursive=True)
            if any(block_title.search(title.text) is not None
                   for title in titles):
                div.decompose()

    def remove_block_url(self) -> None:
        """Removes every div that contains a link whose href matches the
        "block_url" pattern of the config.

        Returns:
            None (The soup object is modified directly)
        """
        if not self.main_divs or not self.config.block_url:
            return

        block_url = re.compile(self.config.block_url)
        for div in self.main_divs.find_all('div', recursive=True):
            # Anchors without an href have nothing to match against (and
            # would raise a KeyError if indexed), so they are not considered
            anchors = div.find_all('a', href=True, recursive=True)
            if any(block_url.search(anchor['href']) is not None
                   for anchor in anchors):
                div.decompose()

    def remove_block_tabs(self) -> None:
        """Removes the tab divs from the page, identified by the
        GClasses.main_tbm_tab class when the main results div was found and
        by GClasses.images_tbm_tab otherwise.

        Returns:
            None (The soup object is modified directly)
        """
        if self.main_divs:
            container = self.main_divs
            tab_class = f'{GClasses.main_tbm_tab}'
        else:
            # when in images tab
            container = self.soup
            tab_class = f'{GClasses.images_tbm_tab}'

        for div in container.find_all('div', attrs={'class': tab_class}):
            div.decompose()

    def collapse_sections(self) -> None:
        """Collapses long result sections ("people also asked", "related
        searches", etc) into "details" elements

        These sections are typically the only sections in the results page that
        have more than ~5 child divs within a primary result div.

        When the WHOOGLE_MINIMAL setting is enabled, these sections (and the
        sections listed in minimal_mode_sections) are removed instead.

        Returns:
            None (The soup object is modified directly)
        """
        minimal_mode = read_config_bool('WHOOGLE_MINIMAL')

        def pull_child_divs(result_div: BeautifulSoup):
            # The direct child divs of the first direct child div, or an
            # empty list if there is no child div at all
            try:
                return result_div.findChildren(
                    'div', recursive=False
                )[0].findChildren(
                    'div', recursive=False)
            except IndexError:
                return []

        if not self.main_divs:
            return

        # Loop through results and check for the number of child divs in each
        for result in self.main_divs.find_all():
            result_children = pull_child_divs(result)
            if minimal_mode:
                if any(f">{x}</span" in str(s) for s in result_children
                       for x in minimal_mode_sections):
                    result.decompose()
                    continue

                # The result is decomposed once and then skipped; nothing
                # of it is left to collapse afterwards
                if any('Twitter ›' in str(s) for s in result_children):
                    result.decompose()
                    continue

            if len(result_children) < self.RESULT_CHILD_LIMIT:
                continue

            # Find and decompose the first element with an inner HTML text val.
            # This typically extracts the title of the section (i.e. "Related
            # Searches", "People also ask", etc)
            # If there are more than one child tags with text
            # parenthesize the rest except the first
            label = 'Collapsed Results'
            subtitle = None
            for elem in result_children:
                if elem.text:
                    content = list(elem.strings)
                    label = content[0]
                    if len(content) > 1:
                        subtitle = ' (' + ''.join(content[1:]) + ')'
                    elem.decompose()
                    break

            # Create the new details element to wrap around the result's
            # first parent
            parent = next(
                (child.parent for child in result_children if child.parent),
                None)

            factory = BeautifulSoup(features='html.parser')
            details = factory.new_tag('details')
            summary = factory.new_tag('summary')
            summary.string = label

            if subtitle:
                # Set the subtitle as text of a new tag rather than parsing
                # it as HTML, so that result text is never treated as markup
                subtitle_tag = factory.new_tag('span')
                subtitle_tag.string = subtitle
                summary.append(subtitle_tag)

            details.append(summary)

            if parent and not minimal_mode:
                parent.wrap(details)
            elif parent and minimal_mode:
                # Remove parent element from document if "minimal mode" is
                # enabled
                parent.decompose()

    def update_element_src(self, element: Tag, mime: str, attr='src') -> None:
        """Encrypts the original src of an element and rewrites the element src
        to use the "/element?src=" pass-through.

        Logo images are swapped for Whoogle's own logo, other Google images
        are blanked and "data:" sources are left untouched.

        Args:
            element: The tag to update
            mime: The mime type to request the element with
            attr: The attribute of the tag that holds the source

        Returns:
            None (The soup element is modified directly)

        """
        # Only the part before the first space is used
        src = element[attr].split(' ')[0]

        if src.startswith('//'):
            src = 'https:' + src
        elif src.startswith('data:'):
            return

        if src.startswith(LOGO_URL):
            # Re-brand with Whoogle logo
            element.replace_with(BeautifulSoup(
                render_template('logo.html'),
                features='html.parser'))
            return
        elif src.startswith(G_M_LOGO_URL):
            # Re-brand with single-letter Whoogle logo
            element['src'] = 'static/img/favicon/apple-icon.png'
            element.parent['href'] = 'home'
            return
        elif src.startswith(GOOG_IMG) or GOOG_STATIC in src:
            element['src'] = BLANK_B64
            return

        enc_src = self.encrypt_path(src, is_element=True)
        element[attr] = (
            f'{self.root_url}/{Endpoint.element}?url=' +
            enc_src + '&type=' + urlparse.quote(mime))

    def update_css(self) -> None:
        """Updates URLs used in inline styles to be proxied by Whoogle
        using the /element endpoint.

        Returns:
            None (The soup element is modified directly)

        """
        # Filter all <style> tags
        for style in self.soup.find_all('style'):
            # A tag without a single string child has no CSS to filter
            if not style.string:
                continue
            style.string = clean_css(style.string, self.page_url)

        # TODO: Convert remote stylesheets to style tags and proxy all
        # remote requests
        # for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
        #     print(link)

    def update_styling(self) -> None:
        """Adjusts the styling of the page: updates CSS classes, removes
        buttons and svg logos, and applies the mobile logo, search bar and
        images tab fixes.

        Returns:
            None (The soup object is modified directly)
        """
        # Update CSS classes for result divs
        # NOTE(review): the return value was never used; presumably the soup
        # is updated in place - confirm against GClasses
        GClasses.replace_css_classes(self.soup)

        # Remove unnecessary button(s)
        for button in self.soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in self.soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = self.soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = ('display:flex; justify-content:center; '
                             'align-items:center; color:#685e79; '
                             'font-size:18px; ')

        # Fix search bar length on mobile
        try:
            search_bar = self.soup.find('header').find('form').find('div')
            search_bar['style'] = 'width: 100%;'
        except AttributeError:
            # find() returned None somewhere along the chain
            pass

        # Fix body max width on images tab
        style = self.soup.find('style')
        div = self.soup.find('div', attrs={
            'class': f'{GClasses.images_tbm_tab}'})
        if style and div and not self.mobile:
            # A style tag without a single string child contributes no CSS
            # (formatting None would insert the text "None")
            css = style.string or ''
            css_html_tag = (
                'html{'
                'font-family: Roboto, Helvetica Neue, Arial, sans-serif;'
                'font-size: 14px;'
                'line-height: 20px;'
                'text-size-adjust: 100%;'
                'word-wrap: break-word;'
                '}'
            )
            css = f"{css_html_tag}{css}"
            css = re.sub('body{(.*?)}',
                         'body{padding:0 8px;margin:0 auto;max-width:736px;}',
                         css)
            style.string = css

    def update_link(self, link: Tag) -> None:
        """Update internal link paths with encrypted path, otherwise remove
        unnecessary redirects and/or marketing params from the url

        Args:
            link: A bs4 Tag element to inspect and update

        Returns:
            None (the tag is updated directly)

        """
        parsed_link = urlparse.urlparse(link['href'])
        if '/url?q=' in link['href']:
            link_netloc = extract_q(parsed_link.query, link['href'])
        else:
            link_netloc = parsed_link.netloc

        # Remove any elements that direct to unsupported Google pages
        if any(url in link_netloc for url in unsupported_g_pages):
            # FIXME: The "Shopping" tab requires further filtering (see #136)
            # Temporarily removing all links to that tab for now.

            # Replaces the /url google unsupported link to the direct url
            link['href'] = link_netloc
            parent = link.parent

            if 'google.com/preferences?hl=' in link_netloc:
                # Handle case where a search is performed in a different
                # language than what is configured. This usually returns a
                # div with the same classes as normal search results, but with
                # a link to configure language preferences through Google.
                # Since we want all language config done through Whoogle, we
                # can safely decompose this element.
                while parent:
                    p_cls = parent.attrs.get('class') or []
                    if f'{GClasses.result_class_a}' in p_cls:
                        parent.decompose()
                        break
                    parent = parent.parent
            else:
                # Remove cases where google links appear in the footer
                while parent:
                    p_cls = parent.attrs.get('class') or []
                    if (parent.name == 'footer'
                            or f'{GClasses.footer}' in p_cls):
                        link.decompose()
                        # The link is gone; stop here so that it can't be
                        # decomposed a second time by an outer match
                        break
                    parent = parent.parent
            return

        # Replace href with only the intended destination (no "utm" type tags)
        href = link['href'].replace('https://www.google.com', '')
        result_link = urlparse.urlparse(href)
        q = extract_q(result_link.query, href)

        if q.startswith('/') and q not in self.query and 'spell=1' not in href:
            # Internal google links (i.e. mail, maps, etc) should still
            # be forwarded to Google
            link['href'] = 'https://google.com' + q
        elif q.startswith('https://accounts.google.com'):
            # Remove Sign-in link
            link.decompose()
            return
        elif '/search?q=' in href:
            # "li:1" implies the query should be interpreted verbatim,
            # which is accomplished by wrapping the query in double quotes
            if 'li:1' in href:
                q = '"' + q + '"'
            new_search = 'search?q=' + self.encrypt_path(q)

            # Carry over the arguments that are listed in VALID_PARAMS
            query_params = parse_qs(result_link.query)
            for param in VALID_PARAMS:
                if param not in query_params:
                    continue
                param_val = query_params[param][0]
                new_search += '&' + param + '=' + param_val
            link['href'] = new_search
        elif 'url?q=' in href:
            # Strip unneeded arguments
            link['href'] = filter_link_args(q)

            # Add alternate viewing options for results,
            # if the result doesn't already have an AV link
            netloc = urlparse.urlparse(link['href']).netloc
            if self.config.anon_view and netloc not in self._av:
                self._av.add(netloc)
                append_anon_view(link, self.config)
        elif href.startswith(MAPS_URL):
            # Maps links don't work if a site filter is applied
            link['href'] = build_map_url(link['href'])
        elif href.startswith(('/?', '/search?', '/imgres?')):
            # make sure that tags can be clicked as relative URLs
            link['href'] = href[1:]
        elif href.startswith('/intl/'):
            # do nothing, keep original URL for ToS
            pass
        elif href.startswith('/preferences'):
            # there is no config specific URL, remove this
            link.decompose()
            return
        else:
            link['href'] = href

        if self.config.new_tab and (
            link["href"].startswith("http")
            or link["href"].startswith("imgres?")
        ):
            link["target"] = "_blank"

    def site_alt_swap(self) -> None:
        """Replaces link locations and page elements if "alts" config
        is enabled
        """
        # Matches a description that mentions any of the sites. It does not
        # depend on the site being processed, so it is built only once.
        any_site = re.compile('|'.join(SITE_ALTS.keys()))

        for site, alt in SITE_ALTS.items():
            for div in self.soup.find_all('div', text=re.compile(site)):
                # Use the number of words in the div string to determine if the
                # string is a result description (shouldn't replace domains used
                # in desc text).
                # Also ignore medium.com replacements since these are handled
                # specifically in the link description replacement, and medium
                # results are never given their own "card" result where this
                # replacement would make sense.
                if site == 'medium.com' or len(div.string.split(' ')) > 1:
                    continue

                div.string = div.string.replace(site, alt)

            for link in self.soup.find_all('a', href=True):
                # Search and replace all link descriptions
                # with alternative location
                link['href'] = get_site_alt(link['href'])
                matches = link.find_all(text=any_site)
                if not matches:
                    continue

                # Replace link description
                link_desc = matches[0]
                if site not in link_desc or not alt:
                    continue

                new_desc = BeautifulSoup(features='html.parser').new_tag('div')
                link_str = str(link_desc)

                # Medium links should be handled differently, since 'medium.com'
                # is a common substring of domain names, but shouldn't be
                # replaced (i.e. 'philomedium.com' should stay as it is).
                if 'medium.com' in link_str:
                    if (link_str.startswith('medium.com')
                            or '.medium.com' in link_str):
                        link_str = 'farside.link/scribe' + link_str[
                            link_str.find('medium.com') + len('medium.com'):]
                    new_desc.string = link_str
                else:
                    new_desc.string = link_str.replace(site, alt)

                link_desc.replace_with(new_desc)

    def view_image(self, soup) -> BeautifulSoup:
        """Replaces the soup with a new one that handles mobile results and
        adds the link of the image full res to the results.

        Args:
            soup: A BeautifulSoup object containing the image mobile results.

        Returns:
            BeautifulSoup: The new BeautifulSoup object
        """

        # get some tags that are unchanged between mobile and pc versions
        cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
        next_pages = soup.find('table', attrs={'class': "uZgmoc"})

        results = []
        # find results div
        results_div = soup.find('div', attrs={'class': "nQvrDb"})
        # find all the results (if any)
        results_all = []
        if results_div:
            results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})

        for item in results_all:
            # Results without a link can't be split into image/page urls
            anchor = item.find('a')
            if anchor is None or anchor.get('href') is None:
                continue

            urls = anchor['href'].split('&imgrefurl=')

            # Skip urls that are not two-element lists
            if len(urls) != 2:
                continue

            # Results without a thumbnail can't be displayed
            thumbnail = anchor.find('img')
            if thumbnail is None or thumbnail.get('src') is None:
                continue

            img_url = urlparse.unquote(urls[0].replace(
                f'/{Endpoint.imgres}?imgurl=', ''))

            # Strip out only the necessary part of the web page link
            web_page = urlparse.unquote(urls[1].split('&')[0])

            img_tbn = urlparse.unquote(thumbnail['src'])

            results.append({
                'domain': urlparse.urlparse(web_page).netloc,
                'img_url': img_url,
                'web_page': web_page,
                'img_tbn': img_tbn
            })

        soup = BeautifulSoup(render_template('imageresults.html',
                                             length=len(results),
                                             results=results,
                                             view_label="View Image"),
                             features='html.parser')

        # replace correction suggested by google object if exists
        if cor_suggested:
            placeholder = soup.find('table', attrs={'class': "By0U9"})
            if placeholder is not None:
                placeholder.replace_with(cor_suggested[0])

        # replace next page object at the bottom of the page, if the original
        # page had one (a tag can't be replaced with None)
        if next_pages is not None:
            placeholder = soup.find('table', attrs={'class': "uZgmoc"})
            if placeholder is not None:
                placeholder.replace_with(next_pages)

        return soup
|