@ -1,6 +1,7 @@
from app . request import VALID_PARAMS
from app . utils . results import *
from bs4 . element import ResultSet
from bs4 import BeautifulSoup
from bs4 . element import ResultSet , Tag
from cryptography . fernet import Fernet
import re
import urllib . parse as urlparse
@ -8,7 +9,7 @@ from urllib.parse import parse_qs
class Filter :
def __init__ ( self , user_keys : dict , mobile = False , config = None ) :
def __init__ ( self , user_keys : dict , mobile = False , config = None ) - > None :
if config is None :
config = { }
@ -29,7 +30,7 @@ class Filter:
def elements ( self ) :
return self . _elements
def reskin ( self , page ):
def reskin ( self , page : str ) - > str :
# Aesthetic only re-skinning
if self . dark :
page = page . replace (
@ -39,22 +40,22 @@ class Filter:
return page
def encrypt_path ( self , msg , is_element = False ) :
def encrypt_path ( self , path , is_element = False ) - > str :
# Encrypts path to avoid plaintext results in logs
if is_element :
# Element paths are encrypted separately from text, to allow key
# regeneration once all items have been served to the user
enc_path = Fernet (
self . user_keys [ ' element_key ' ]
) . encrypt ( msg . encode ( ) ) . decode ( )
) . encrypt ( path . encode ( ) ) . decode ( )
self . _elements + = 1
return enc_path
return Fernet (
self . user_keys [ ' text_key ' ]
) . encrypt ( msg . encode ( ) ) . decode ( )
) . encrypt ( path . encode ( ) ) . decode ( )
def clean ( self , soup ) :
def clean ( self , soup ) - > BeautifulSoup :
self . main_divs = soup . find ( ' div ' , { ' id ' : ' main ' } )
self . remove_ads ( )
self . fix_question_section ( )
@ -90,7 +91,12 @@ class Filter:
return soup
def remove_ads ( self ) :
def remove_ads ( self ) - > None :
""" Removes ads found in the list of search result divs
Returns :
None ( The soup object is modified directly )
"""
if not self . main_divs :
return
@ -99,7 +105,16 @@ class Filter:
if has_ad_content ( _ . text ) ]
_ = div . decompose ( ) if len ( div_ads ) else None
def fix_question_section ( self ) :
def fix_question_section ( self ) - > None :
""" Collapses the " People Also Asked " section into a " details " element
These sections are typically the only sections in the results page that
are structured as < div > < h2 > Title < / h2 > < div > . . . < / div > < / div > , so they are
extracted by checking all result divs for h2 children .
Returns :
None ( The soup object is modified directly )
"""
if not self . main_divs :
return
@ -126,7 +141,14 @@ class Filter:
for question in questions :
question [ ' style ' ] = ' padding: 10px; font-style: italic; '
def update_element_src ( self , element , mime ) :
def update_element_src ( self , element : Tag , mime : str ) - > None :
""" Encrypts the original src of an element and rewrites the element src
to use the " /element?src= " pass - through .
Returns :
None ( The soup element is modified directly )
"""
src = element [ ' src ' ]
if src . startswith ( ' // ' ) :
@ -145,7 +167,8 @@ class Filter:
src ,
is_element = True ) + ' &type= ' + urlparse . quote ( mime )
def update_styling ( self , soup ) :
def update_styling ( self , soup ) - > None :
""" """
# Remove unnecessary button(s)
for button in soup . find_all ( ' button ' ) :
button . decompose ( )
@ -168,7 +191,17 @@ class Filter:
except AttributeError :
pass
def update_link ( self , link ) :
def update_link ( self , link : Tag ) - > None :
""" Update internal link paths with encrypted path, otherwise remove
unnecessary redirects and / or marketing params from the url
Args :
link : A bs4 Tag element to inspect and update
Returns :
None ( the tag is updated directly )
"""
# Replace href with only the intended destination (no "utm" type tags)
href = link [ ' href ' ] . replace ( ' https://www.google.com ' , ' ' )
if ' advanced_search ' in href or ' tbm=shop ' in href :