@ -119,6 +119,7 @@ class Filter:
page_url = ' ' ,
query = ' ' ,
mobile = False ) - > None :
self . soup = None
self . config = config
self . mobile = mobile
self . user_key = user_key
@ -149,46 +150,50 @@ class Filter:
return Fernet ( self . user_key ) . encrypt ( path . encode ( ) ) . decode ( )
def clean ( self , soup ) - > BeautifulSoup :
self . main_divs = soup . find ( ' div ' , { ' id ' : ' main ' } )
self . soup = soup
self . main_divs = self . soup . find ( ' div ' , { ' id ' : ' main ' } )
self . remove_ads ( )
self . remove_block_titles ( )
self . remove_block_url ( )
self . collapse_sections ( )
self . update_css ( soup )
self . update_styling ( soup )
self . remove_block_tabs ( soup )
self . update_css ( )
self . update_styling ( )
self . remove_block_tabs ( )
for img in [ _ for _ in soup . find_all ( ' img ' ) if ' src ' in _ . attrs ] :
for img in [ _ for _ in self . soup . find_all ( ' img ' ) if ' src ' in _ . attrs ] :
self . update_element_src ( img , ' image/png ' )
for audio in [ _ for _ in soup . find_all ( ' audio ' ) if ' src ' in _ . attrs ] :
for audio in [ _ for _ in self . soup . find_all ( ' audio ' ) if ' src ' in _ . attrs ] :
self . update_element_src ( audio , ' audio/mpeg ' )
for link in soup . find_all ( ' a ' , href = True ) :
for link in self . soup . find_all ( ' a ' , href = True ) :
self . update_link ( link )
input_form = soup . find ( ' form ' )
if self . config . alts :
self . site_alt_swap ( )
input_form = self . soup . find ( ' form ' )
if input_form is not None :
input_form [ ' method ' ] = ' GET ' if self . config . get_only else ' POST '
# Use a relative URI for submissions
input_form [ ' action ' ] = ' search '
# Ensure no extra scripts passed through
for script in soup ( ' script ' ) :
for script in self . soup ( ' script ' ) :
script . decompose ( )
# Update default footer and header
footer = soup . find ( ' footer ' )
footer = self . soup . find ( ' footer ' )
if footer :
# Remove divs that have multiple links beyond just page navigation
[ _ . decompose ( ) for _ in footer . find_all ( ' div ' , recursive = False )
if len ( _ . find_all ( ' a ' , href = True ) ) > 3 ]
header = soup . find ( ' header ' )
header = self . soup . find ( ' header ' )
if header :
header . decompose ( )
self . remove_site_blocks ( soup )
return soup
self . remove_site_blocks ( self . soup )
return self . soup
def remove_site_blocks ( self , soup ) - > None :
if not self . config . block or not soup . body :
@ -233,7 +238,7 @@ class Filter:
if block_url . search ( _ . attrs [ ' href ' ] ) is not None ]
_ = div . decompose ( ) if len ( block_divs ) else None
def remove_block_tabs ( self , soup ) - > None :
def remove_block_tabs ( self ) - > None :
if self . main_divs :
for div in self . main_divs . find_all (
' div ' ,
@ -242,7 +247,7 @@ class Filter:
_ = div . decompose ( )
else :
# when in images tab
for div in soup . find_all (
for div in self . soup . find_all (
' div ' ,
attrs = { ' class ' : f ' { GClasses . images_tbm_tab } ' }
) :
@ -369,7 +374,7 @@ class Filter:
) + ' &type= ' + urlparse . quote ( mime )
)
def update_css ( self , soup ) - > None :
def update_css ( self ) - > None :
""" Updates URLs used in inline styles to be proxied by Whoogle
using the / element endpoint .
@ -378,7 +383,7 @@ class Filter:
"""
# Filter all <style> tags
for style in soup . find_all ( ' style ' ) :
for style in self . soup . find_all ( ' style ' ) :
style . string = clean_css ( style . string , self . page_url )
# TODO: Convert remote stylesheets to style tags and proxy all
@ -386,20 +391,20 @@ class Filter:
# for link in soup.find_all('link', attrs={'rel': 'stylesheet'}):
# print(link)
def update_styling ( self , soup ) - > None :
def update_styling ( self ) - > None :
# Update CSS classes for result divs
soup = GClasses . replace_css_classes ( soup )
soup = GClasses . replace_css_classes ( self . soup )
# Remove unnecessary button(s)
for button in soup . find_all ( ' button ' ) :
for button in self . soup . find_all ( ' button ' ) :
button . decompose ( )
# Remove svg logos
for svg in soup . find_all ( ' svg ' ) :
for svg in self . soup . find_all ( ' svg ' ) :
svg . decompose ( )
# Update logo
logo = soup . find ( ' a ' , { ' class ' : ' l ' } )
logo = self . soup . find ( ' a ' , { ' class ' : ' l ' } )
if logo and self . mobile :
logo [ ' style ' ] = ( ' display:flex; justify-content:center; '
' align-items:center; color:#685e79; '
@ -407,14 +412,15 @@ class Filter:
# Fix search bar length on mobile
try :
search_bar = soup . find ( ' header ' ) . find ( ' form ' ) . find ( ' div ' )
search_bar = self . soup . find ( ' header ' ) . find ( ' form ' ) . find ( ' div ' )
search_bar [ ' style ' ] = ' width: 100 % ; '
except AttributeError :
pass
# Fix body max width on images tab
style = soup . find ( ' style ' )
div = soup . find ( ' div ' , attrs = { ' class ' : f ' { GClasses . images_tbm_tab } ' } )
style = self . soup . find ( ' style ' )
div = self . soup . find ( ' div ' , attrs = {
' class ' : f ' { GClasses . images_tbm_tab } ' } )
if style and div and not self . mobile :
css = style . string
css_html_tag = (
@ -444,7 +450,6 @@ class Filter:
"""
parsed_link = urlparse . urlparse ( link [ ' href ' ] )
link_netloc = ' '
if ' /url?q= ' in link [ ' href ' ] :
link_netloc = extract_q ( parsed_link . query , link [ ' href ' ] )
else :
@ -543,25 +548,54 @@ class Filter:
) :
link [ " target " ] = " _blank "
# Replace link location if "alts" config is enabled
if self . config . alts :
# Search and replace all link descriptions
# with alternative location
link [ ' href ' ] = get_site_alt ( link [ ' href ' ] )
link_desc = link . find_all (
text = re . compile ( ' | ' . join ( SITE_ALTS . keys ( ) ) ) )
if len ( link_desc ) == 0 :
return
def site_alt_swap ( self ) - > None :
""" Replaces link locations and page elements if " alts " config
is enabled
"""
for site , alt in SITE_ALTS . items ( ) :
for div in self . soup . find_all ( ' div ' , text = re . compile ( site ) ) :
# Use the number of words in the div string to determine if the
# string is a result description (shouldn't replace domains used
# in desc text).
# Also ignore medium.com replacements since these are handled
# specifically in the link description replacement, and medium
# results are never given their own "card" result where this
# replacement would make sense.
if site == ' medium.com ' or len ( div . string . split ( ' ' ) ) > 1 :
continue
div . string = div . string . replace ( site , alt )
for link in self . soup . find_all ( ' a ' , href = True ) :
# Search and replace all link descriptions
# with alternative location
link [ ' href ' ] = get_site_alt ( link [ ' href ' ] )
link_desc = link . find_all (
text = re . compile ( ' | ' . join ( SITE_ALTS . keys ( ) ) ) )
if len ( link_desc ) == 0 :
continue
# Replace link description
link_desc = link_desc [ 0 ]
for site , alt in SITE_ALTS . items ( ) :
# Replace link description
link_desc = link_desc [ 0 ]
if site not in link_desc or not alt :
continue
new_desc = BeautifulSoup ( features = ' html.parser ' ) . new_tag ( ' div ' )
new_desc . string = str ( link_desc ) . replace ( site , alt )
link_str = str ( link_desc )
# Medium links should be handled differently, since 'medium.com'
# is a common substring of domain names, but shouldn't be
# replaced (i.e. 'philomedium.com' should stay as it is).
if ' medium.com ' in link_str :
if link_str . startswith ( ' medium.com ' ) or ' .medium.com ' in link_str :
new_desc . string = link_str . replace (
' medium.com ' , ' farside.link/scribe ' )
else :
new_desc . string = link_str
else :
new_desc . string = link_str . replace ( site , alt )
link_desc . replace_with ( new_desc )
break
def view_image ( self , soup ) - > BeautifulSoup :
""" Replaces the soup with a new one that handles mobile results and