whoogle-search/app/filter.py

from bs4 import BeautifulSoup
import re
import urllib.parse as urlparse
from urllib.parse import parse_qs

# Query-string argument names stripped from result links. An argument is
# dropped when its name equals an entry or starts with "<entry>_" (so 'utm'
# also covers 'utm_source', 'utm_medium', ...).
SKIP_ARGS = ['ref_src', 'utm']


class Filter:
    """Post-process a search results page before it is returned to the user.

    ``reskin`` performs purely cosmetic string substitutions on raw markup;
    ``clean`` mutates a parsed BeautifulSoup tree (removes ads, buttons, svgs
    and scripts, rewrites image sources and result links).

    Attributes are read from ``config`` when present:
        near:   value of config['near'], else None (not used in this class).
        dark:   enables the dark colour substitutions/styles, default False.
        nojs:   appends a "NoJS Link" to every rewritten result, default False.
        mobile: whether mobile-specific logo styling is applied.
    """

    # Colour codes replaced by ``reskin`` (matched case-insensitively).
    _BRAND_COLORS = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)

    def __init__(self, mobile=False, config=None):
        """Store the display options.

        Args:
            mobile: True to apply mobile-specific styling in ``clean``.
            config: optional container of settings; the keys 'near', 'dark'
                and 'nojs' are read if present.
        """
        if config is None:
            config = {}

        self.near = config['near'] if 'near' in config else None
        self.dark = config['dark'] if 'dark' in config else False
        self.nojs = config['nojs'] if 'nojs' in config else False
        self.mobile = mobile

    def __getitem__(self, name):
        """Allow dict-style access to attributes, e.g. ``filter['dark']``."""
        return getattr(self, name)

    def reskin(self, page):
        """Return ``page`` (a markup string) with cosmetic substitutions applied.

        Replaces the '>G<' logo text, swaps the brand colour codes for a single
        colour and, in dark mode, swaps a few light-theme colour codes.
        """
        # Aesthetic only re-skinning
        page = page.replace('>G<', '>Sh<')
        page = self._BRAND_COLORS.sub('685e79', page)
        if self.dark:
            page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')

        return page

    def clean(self, soup):
        """Strip unwanted content from ``soup`` in place and return it.

        Args:
            soup: a parsed BeautifulSoup document.

        Returns:
            The same ``soup`` object, after ads, buttons, svgs, scripts and the
            'sfooter' div are removed and images/links are rewritten.
        """
        self._remove_ads(soup)
        self._sync_images(soup)
        self._update_styling(soup)
        self._update_links(soup)
        return soup

    @staticmethod
    def _remove_ads(soup):
        """Remove direct children of div#main that carry a class attribute."""
        main_div = soup.find('div', {'id': 'main'})
        if main_div is None:
            return

        # Only ads/sponsored content use classes in the list of result divs
        for div in main_div.find_all('div', recursive=False):
            if 'class' in div.attrs:
                div.decompose()

    @staticmethod
    def _sync_images(soup):
        """Point every image source at the local '/tmp' image route."""
        # src=True skips <img> tags without a src instead of raising KeyError
        for img in soup.find_all('img', src=True):
            src = img['src']
            if src.startswith('//'):
                # Protocol-relative URL: make it absolute
                src = 'https:' + src

            img['src'] = '/tmp?image_url=' + src

    def _update_styling(self, soup):
        """Remove buttons/svgs and apply mobile and dark-mode inline styles."""
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = 'display:flex; justify-content:center; align-items:center; color:#685e79; ' \
                            'font-size:18px; '

        # Fix search bar length on mobile; any element in the chain may be absent
        header = soup.find('header')
        form = header.find('form') if header is not None else None
        search_bar = form.find('div') if form is not None else None
        if search_bar is not None:
            search_bar['style'] = 'width: 100%;'

        # Set up dark mode if active
        if self.dark:
            html = soup.find('html')
            if html is not None:
                html['style'] = 'scrollbar-color: #333 #111;'
            for input_element in soup.find_all('input'):
                input_element['style'] = 'color:#fff;'

    @staticmethod
    def _is_skipped_arg(name):
        """Return True if query argument ``name`` matches an entry in SKIP_ARGS."""
        return any(name == skip or name.startswith(skip + '_') for skip in SKIP_ARGS)

    @staticmethod
    def _filter_link_args(link):
        """Return ``link`` with the query arguments listed in SKIP_ARGS removed.

        The remaining ``name=value`` pairs are kept verbatim (original order and
        encoding), and the path and fragment are left in place. If no arguments
        remain, the '?' is dropped as well.
        """
        try:
            parsed = urlparse.urlparse(link)
        except ValueError:
            # Unparseable URL: leave it alone rather than fail the whole page
            return link

        kept_pairs = [
            pair for pair in parsed.query.split('&')
            if pair and not Filter._is_skipped_arg(urlparse.unquote_plus(pair.split('=', 1)[0]))
        ]
        return parsed._replace(query='&'.join(kept_pairs)).geturl()

    def _update_links(self, soup):
        """Rewrite redirect links to their destination and drop scripts/footer."""
        # Replace hrefs with only the intended destination (no "utm" type tags)
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/advanced_search' in href:
                a.decompose()
                continue

            if 'url?q=' not in href:
                continue

            # parse_qs omits blank values, so 'q' can be missing even though
            # 'url?q=' appears in the href
            targets = parse_qs(urlparse.urlparse(href).query).get('q')
            if not targets:
                continue

            # Strip unneeded arguments
            result_link = self._filter_link_args(targets[0])
            a['href'] = result_link

            # Add no-js option
            if self.nojs:
                nojs_link = soup.new_tag('a')
                nojs_link['href'] = '/window?location=' + result_link
                nojs_link['style'] = 'display:block;width:100%;'
                nojs_link.string = 'NoJS Link: ' + nojs_link['href']
                a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
                a.append(nojs_link)

        # Ensure no extra scripts passed through
        for script in soup.find_all('script'):
            script.decompose()

        footer = soup.find('div', id='sfooter')
        if footer is not None:
            footer.decompose()
Refactored routes, added filter class for returned results, added dockerignore 4 years ago			`from bs4 import BeautifulSoup`
			`import re`
			`import urllib.parse as urlparse`
			`from urllib.parse import parse_qs`

Improved bad url arg filtering 4 years ago			`SKIP_ARGS = ['ref_src', 'utm']`

Added testing and ci build, refactored filter class, refactored project structure 4 years ago
			`class Filter:`
			`def __init__(self, mobile=False, config=None):`
			`if config is None:`
			`config = {}`

Fixed config bug in filter, updated run script to work on mac os 4 years ago			`self.near = config['near'] if 'near' in config else None`
			`self.dark = config['dark'] if 'dark' in config else False`
Minor refactor of filter class, updated tests, fixed html/css, added ua to config 4 years ago			`self.nojs = config['nojs'] if 'nojs' in config else False`
			`self.mobile = mobile`
Added testing and ci build, refactored filter class, refactored project structure 4 years ago
Refactoring of user requests and routing Curl requests and user agent related functionality was moved to its own request class. Routes was refactored to only include strictly routing related functionality. Filter class was cleaned up (had routing/request related logic in here, which didn't make sense) 4 years ago			`def __getitem__(self, name):`
			`return getattr(self, name)`

Added testing and ci build, refactored filter class, refactored project structure 4 years ago			`def reskin(self, page):`
			`# Aesthetic only re-skinning`
			`page = page.replace('>G<', '>Sh<')`
			`pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)`
			`page = pattern.sub('685e79', page)`
Fixed config bug in filter, updated run script to work on mac os 4 years ago			`if self.dark:`
Added testing and ci build, refactored filter class, refactored project structure 4 years ago			`page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')`

			`return page`

			`def clean(self, soup):`
Added image proxying, refactored filter class Images were previously directly fetched from google search results, which was a potential privacy hazard. All image sources are now modified to be passed through shoogle's routing first, which will then fetch raw image data and pass it through to the user. Filter class was refactored to split the primary clean method into smaller, more manageable submethods. 4 years ago			`def remove_ads():`
			`main_divs = soup.find('div', {'id': 'main'})`
			`if main_divs is None:`
			`return`

Added testing and ci build, refactored filter class, refactored project structure 4 years ago			`result_divs = main_divs.findAll('div', recursive=False)`

			`# Only ads/sponsored content use classes in the list of result divs`
			`ad_divs = [ad_div for ad_div in result_divs if 'class' in ad_div.attrs]`
			`for div in ad_divs:`
			`div.decompose()`

Added image proxying, refactored filter class Images were previously directly fetched from google search results, which was a potential privacy hazard. All image sources are now modified to be passed through shoogle's routing first, which will then fetch raw image data and pass it through to the user. Filter class was refactored to split the primary clean method into smaller, more manageable submethods. 4 years ago			`def sync_images():`
			`for img in soup.find_all('img'):`
			`if img['src'].startswith('//'):`
			`img['src'] = 'https:' + img['src']`

			`img['src'] = '/tmp?image_url=' + img['src']`

			`def update_styling():`
			`# Remove unnecessary button(s)`
			`for button in soup.find_all('button'):`
			`button.decompose()`

			`# Remove svg logos`
			`for svg in soup.find_all('svg'):`
			`svg.decompose()`

			`# Update logo`
			`logo = soup.find('a', {'class': 'l'})`
			`if logo and self.mobile:`
			`logo['style'] = 'display:flex; justify-content:center; align-items:center; color:#685e79; ' \`
			`'font-size:18px; '`

			`# Fix search bar length on mobile`
			`try:`
			`search_bar = soup.find('header').find('form').find('div')`
			`search_bar['style'] = 'width: 100%;'`
			`except AttributeError:`
			`pass`

			`# Set up dark mode if active`
			`if self.dark:`
			`soup.find('html')['style'] = 'scrollbar-color: #333 #111;'`
			`for input_element in soup.findAll('input'):`
			`input_element['style'] = 'color:#fff;'`

			`def update_links():`
			`# Replace hrefs with only the intended destination (no "utm" type tags)`
			`for a in soup.find_all('a', href=True):`
			`href = a['href']`
			`if '/advanced_search' in href:`
			`a.decompose()`
			`continue`

			`if 'url?q=' in href:`
			`# Strip unneeded arguments`
			`result_link = urlparse.urlparse(href)`
			`result_link = parse_qs(result_link.query)['q'][0]`

			`parsed_link = urlparse.urlparse(result_link)`
			`link_args = parse_qs(parsed_link.query)`
			`safe_args = {}`

			`for arg in link_args.keys():`
			`if arg in SKIP_ARGS:`
			`continue`

			`safe_args[arg] = link_args[arg]`

			`# Remove original link query and replace with filtered args`
			`result_link = result_link.replace(parsed_link.query, '')`
			`if len(safe_args) > 1:`
			`result_link = result_link + urlparse.urlencode(safe_args)`
			`else:`
			`result_link = result_link.replace('?', '')`

			`a['href'] = result_link`

			`# Add no-js option`
			`if self.nojs:`
			`nojs_link = soup.new_tag('a')`
			`nojs_link['href'] = '/window?location=' + result_link`
			`nojs_link['style'] = 'display:block;width:100%;'`
			`nojs_link.string = 'NoJS Link: ' + nojs_link['href']`
			`a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))`
			`a.append(nojs_link)`

			`# Ensure no extra scripts passed through`
			`try:`
			`for script in soup('script'):`
			`script.decompose()`
			`soup.find('div', id='sfooter').decompose()`
			`except Exception:`
			`pass`

			`remove_ads()`
			`sync_images()`
			`update_styling()`
			`update_links()`
Added testing and ci build, refactored filter class, refactored project structure 4 years ago			`return soup`