Added testing and CI build, refactored filter class, refactored project structure

pull/9/head
Ben Busby 4 years ago
parent 2600f494b7
commit b5b6e64177

.gitignore

@@ -5,3 +5,4 @@ __pycache__/
*.pem
*.xml
config.json
test/static

@@ -0,0 +1,6 @@
language: python
python: 3.6
install:
- pip install -r config/requirements.txt
script:
- ./run test

@@ -3,6 +3,6 @@ FROM python:3
WORKDIR /usr/src/app
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir -r config/requirements.txt
CMD ["./run.sh"]
CMD ["./run"]

@@ -29,14 +29,26 @@ heroku open
Now you're done! This series of commands can take a while, but once you've run them, you shouldn't have to run them again. The final command, `heroku open`, will launch a tab in your web browser, where you can test out Shoogle and even [set it as your primary search engine](https://github.com/benbusby/shoogle#set-shoogle-as-your-primary-search-engine).
#### B) Using your own server, or alternative container deployment
There are other methods for deploying Docker containers that are well outlined in [this article](https://rollout.io/blog/the-shortlist-of-docker-hosting/), but there are too many to describe the setup for each one here. Generally, it should take about the same amount of effort as the Heroku deployment.
Depending on your preferences, you can also deploy the app yourself on your own infrastructure (a rough sketch follows this list). This route would require a few extra steps:
- A server (I personally recommend [Digital Ocean](https://www.digitalocean.com/pricing/) or [Linode](https://www.linode.com/pricing/), their cheapest tiers will work fine)
- Your own URL (I suppose this is optional, but recommended)
- SSL certificates (free through [Let's Encrypt](https://letsencrypt.org/getting-started/))
- A bit more experience or willingness to work through issues
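To give a rough idea of what that looks like, here's a minimal sketch of the self-hosted route. It assumes a Debian-based server, a placeholder domain `example.com` already pointing at it, `certbot` for the Let's Encrypt step, and whichever reverse proxy you prefer for terminating TLS:
```bash
# Free SSL certificates through Let's Encrypt (port 80 must be free for the challenge)
sudo apt install certbot
sudo certbot certonly --standalone -d example.com

# Build and run the container; the app listens on port 5000 by default
git clone https://github.com/benbusby/shoogle.git && cd shoogle
docker build -t shoogle .
docker run -d -p 127.0.0.1:5000:5000 shoogle

# Finally, point a reverse proxy (nginx, Caddy, etc.) at 127.0.0.1:5000
# using the certificates generated above.
```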
## Setup (Local Only)
If you want to test the app out on your own machine first, you can build it with the following instructions:
```bash
git clone https://github.com/benbusby/shoogle.git
cd shoogle
python3 -m venv venv
source venv/bin/activate
pip install -r config/requirements.txt
./run
```
## Usage
Same as most search engines, with the exception of filtering by time range.
@@ -44,7 +56,7 @@ To filter by a range of time, append ":past <time>" to the end of your search, w
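As a sketch of how this maps onto the upstream request (based on the query-building logic in `app/filter.py`, with `<your app url>` standing in for your deployment), the first letter of `<time>` becomes the value of Google's `tbs=qdr:` parameter:
```bash
# Search for "new restaurants" from the past month (%3A is the URL-encoded ":")
curl "https://<your app url>/search?q=new%20restaurants%20%3Apast%20month"

# The query is forwarded upstream with the corresponding time-range parameter,
# roughly: https://www.google.com/search?gbv=1&q=...&tbs=qdr:m
```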
## Extra Steps
### Set Shoogle as your primary search engine
1. From the main shoogle folder, run `python opensearch.py "<your app url>"`
1. From the main shoogle folder, run `python config/opensearch.py "<your app url>"`
2. Rebuild and release your updated app
- `heroku container:push web` and then `heroku container:release web`
3. Update browser settings
@@ -69,4 +81,4 @@ Part of the deal with Heroku's free tier is that you're allocated 550 hours/mont
A good solution for this is to set up a simple cronjob on any device at your home that is consistently powered on and connected to the internet (in my case, a PiHole worked perfectly). All the device needs to do is fetch app content on a consistent basis to keep the app alive in whatever ~17 hour window you want it on (17 hrs * 31 days = 527, meaning you'd still have 23 leftover hours each month if you searched outside of your target window).
For instance: `*/20 7-23 * * * curl https://<your heroku app name>.herokuapp.com > /home/<username>/shoogle-refresh` will fetch the home page of the app every 20 minutes between 7am and midnight, allowing for downtime from midnight to 7am. And again, this wouldn't be a hard limit - you'd still have plenty of remaining hours of uptime each month in case you were searching after this window has closed.

@@ -3,109 +3,118 @@ import re
import urllib.parse as urlparse
from urllib.parse import parse_qs

AD_CLASS = 'ZINbbc'
SPONS_CLASS = 'D1fz0e'


def reskin(page, dark_mode=False):
    # Aesthetic only re-skinning
    page = page.replace('>G<', '>Sh<')
    pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)
    page = pattern.sub('685e79', page)
    if dark_mode:
        page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')

    return page


def gen_query(q, args, near_city=None):
    # Use :past(hour/day/week/month/year) if available
    # example search "new restaurants :past month"
    tbs = ''
    # if 'tbs' in request.args:
    #     tbs = '&tbs=' + request.args.get('tbs')
    #     q = q.replace(q.split(':past', 1)[-1], '').replace(':past', '')
    if ':past' in q:
        time_range = str.strip(q.split(':past', 1)[-1])
        tbs = '&tbs=qdr:' + str.lower(time_range[0])

    # Ensure search query is parsable
    q = urlparse.quote(q)

    # Pass along type of results (news, images, books, etc)
    tbm = ''
    if 'tbm' in args:
        tbm = '&tbm=' + args.get('tbm')

    # Get results page start value (10 per page, ie page 2 start val = 20)
    start = ''
    if 'start' in args:
        start = '&start=' + args.get('start')

    # Grab city from config, if available
    near = ''
    if near_city:
        near = '&near=' + urlparse.quote(near_city)

    return q + tbs + tbm + start + near


def cook(soup, user_agent, nojs=False, dark_mode=False):
    # Remove all ads (TODO: Ad specific div classes probably change over time, look into a more generic method)
    main_divs = soup.find('div', {'id': 'main'})
    if main_divs is not None:
        ad_divs = main_divs.findAll('div', {'class': AD_CLASS}, recursive=False)
        sponsored_divs = main_divs.findAll('div', {'class': SPONS_CLASS}, recursive=False)
        for div in ad_divs + sponsored_divs:
            div.decompose()

    # Remove unnecessary button(s)
    for button in soup.find_all('button'):
        button.decompose()

    # Remove svg logos
    for svg in soup.find_all('svg'):
        svg.decompose()

    # Update logo
    logo = soup.find('a', {'class': 'l'})
    if logo is not None and ('Android' in user_agent or 'iPhone' in user_agent):
        logo.insert(0, 'Shoogle')
        logo['style'] = 'display: flex;justify-content: center;align-items: center;color: #685e79;font-size: 18px;'

    # Replace hrefs with only the intended destination (no "utm" type tags)
    for a in soup.find_all('a', href=True):
        href = a['href']
        if '/advanced_search' in href:
            a.decompose()
            continue

        if 'url?q=' in href:
            # Strip unneeded arguments
            href = urlparse.urlparse(href)
            href = parse_qs(href.query)['q'][0]

            # Add no-js option
            if nojs:
                nojs_link = soup.new_tag('a')
                nojs_link['href'] = '/window?location=' + href
                nojs_link['style'] = 'display:block;width:100%;'
                nojs_link.string = 'NoJS Link: ' + nojs_link['href']
                a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
                a.append(nojs_link)

    # Set up dark mode if active
    if dark_mode:
        soup.find('html')['style'] = 'scrollbar-color: #333 #111;'
        for input_element in soup.findAll('input'):
            input_element['style'] = 'color:#fff;'

    # Ensure no extra scripts passed through
    try:
        for script in soup('script'):
            script.decompose()
        soup.find('div', id='sfooter').decompose()
    except Exception:
        pass

    return soup
class Filter:
    def __init__(self, mobile=False, config=None):
        if config is None:
            config = {}

        # Pull settings out of the user config, falling back to defaults
        self.near_city = config['near'] if 'near' in config else None
        self.dark_mode = config['dark_mode'] if 'dark_mode' in config else False
        self.nojs = config['nojs'] if 'nojs' in config else False
        self.mobile = mobile
    def reskin(self, page):
        # Aesthetic only re-skinning
        page = page.replace('>G<', '>Sh<')
        pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)
        page = pattern.sub('685e79', page)
        if self.dark_mode:
            page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')

        return page

    def gen_query(self, q, args):
        # Use :past(hour/day/week/month/year) if available
        # example search "new restaurants :past month"
        tbs = ''
        if ':past' in q:
            time_range = str.strip(q.split(':past', 1)[-1])
            tbs = '&tbs=qdr:' + str.lower(time_range[0])

        # Ensure search query is parsable
        q = urlparse.quote(q)

        # Pass along type of results (news, images, books, etc)
        tbm = ''
        if 'tbm' in args:
            tbm = '&tbm=' + args.get('tbm')

        # Get results page start value (10 per page, ie page 2 start val = 20)
        start = ''
        if 'start' in args:
            start = '&start=' + args.get('start')

        # Grab city from config, if available
        near = ''
        if self.near_city:
            near = '&near=' + urlparse.quote(self.near_city)

        return q + tbs + tbm + start + near

    def clean(self, soup):
        # Remove all ads
        main_divs = soup.find('div', {'id': 'main'})
        if main_divs is not None:
            result_divs = main_divs.findAll('div', recursive=False)

            # Only ads/sponsored content use classes in the list of result divs
            ad_divs = [ad_div for ad_div in result_divs if 'class' in ad_div.attrs]
            for div in ad_divs:
                div.decompose()

        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo is not None and self.mobile:
            logo.insert(0, 'Shoogle')
            logo['style'] = 'display: flex;justify-content: center;align-items: center;color: #685e79;font-size: 18px;'

        # Replace hrefs with only the intended destination (no "utm" type tags)
        for a in soup.find_all('a', href=True):
            href = a['href']
            if '/advanced_search' in href:
                a.decompose()
                continue

            if 'url?q=' in href:
                # Strip unneeded arguments
                href = urlparse.urlparse(href)
                href = parse_qs(href.query)['q'][0]

                # Add no-js option
                if self.nojs:
                    nojs_link = soup.new_tag('a')
                    nojs_link['href'] = '/window?location=' + href
                    nojs_link['style'] = 'display:block;width:100%;'
                    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
                    a.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
                    a.append(nojs_link)

        # Set up dark mode if active
        if self.dark_mode:
            soup.find('html')['style'] = 'scrollbar-color: #333 #111;'
            for input_element in soup.findAll('input'):
                input_element['style'] = 'color:#fff;'

        # Ensure no extra scripts passed through
        try:
            for script in soup('script'):
                script.decompose()
            soup.find('div', id='sfooter').decompose()
        except Exception:
            pass

        return soup

@@ -1,4 +1,5 @@
from app import app, rhyme, filter
from app import app, rhyme
from app.filter import Filter
from bs4 import BeautifulSoup
from flask import request, redirect, render_template
from io import BytesIO
@@ -7,8 +8,8 @@ import os
import pycurl
import urllib.parse as urlparse
APP_ROOT = os.path.dirname(os.path.abspath(__file__))
STATIC_FOLDER = os.path.join(APP_ROOT, 'static')
app.config['APP_ROOT'] = os.getenv('APP_ROOT', os.path.dirname(os.path.abspath(__file__)))
app.config['STATIC_FOLDER'] = os.getenv('STATIC_FOLDER', os.path.join(app.config['APP_ROOT'], 'static'))
# Get Mozilla Firefox rhyme (important) and form a new user agent
mozilla = rhyme.get_rhyme('Mo') + 'zilla'
@@ -20,7 +21,7 @@ DESKTOP_UA = mozilla + '/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/2010010
# Base search url
SEARCH_URL = 'https://www.google.com/search?gbv=1&q='
user_config = json.load(open(STATIC_FOLDER + '/config.json'))
user_config = json.load(open(app.config['STATIC_FOLDER'] + '/config.json'))
def get_ua(user_agent):
@@ -55,29 +56,31 @@ def search():
    if q is None or len(q) <= 0:
        return render_template('error.html')

    full_query = filter.gen_query(q, request.args)
    user_agent = request.headers.get('User-Agent')
    dark_mode = 'dark' in user_config and user_config['dark']
    nojs = 'nojs' in user_config and user_config['nojs']
    mobile = 'Android' in user_agent or 'iPhone' in user_agent

    get_body = filter.reskin(send_request(
        SEARCH_URL + full_query, get_ua(user_agent)), dark_mode=dark_mode)
    soup = filter.cook(BeautifulSoup(get_body, 'html.parser'), user_agent, nojs=nojs, dark_mode=dark_mode)

    content_filter = Filter(mobile, user_config)
    full_query = content_filter.gen_query(q, request.args)
    get_body = send_request(SEARCH_URL + full_query, get_ua(user_agent))
    get_body = content_filter.reskin(get_body)
    soup = content_filter.clean(BeautifulSoup(get_body, 'html.parser'))

    return render_template('display.html', query=urlparse.unquote(q), response=soup)
@app.route('/config', methods=['POST'])
@app.route('/config', methods=['GET', 'POST'])
def config():
    global user_config
    with open(STATIC_FOLDER + '/config.json', 'w') as config_file:
        config_file.write(json.dumps(json.loads(request.data), indent=4))
        config_file.close()
    if request.method == 'GET':
        return json.dumps(user_config)
    else:
        with open(app.config['STATIC_FOLDER'] + '/config.json', 'w') as config_file:
            config_file.write(json.dumps(json.loads(request.data), indent=4))
            config_file.close()
    user_config = json.loads(request.data)
        user_config = json.loads(request.data)
    return 'New config: ' + str(request.data)
        return 'New config: ' + str(request.data)
@app.route('/url', methods=['GET'])

@@ -1,7 +1,9 @@
import os
import sys
template_path = './app/static/opensearch.template'
opensearch_path = './app/static/opensearch.xml'
script_path = os.path.dirname(os.path.realpath(__file__))
template_path = script_path + '/../app/static/opensearch.template'
opensearch_path = script_path + '/../app/static/opensearch.xml'
replace_tag = 'SHOOGLE_URL'
if len(sys.argv) != 2:

@@ -11,6 +11,7 @@ Phyme==0.0.9
pycparser==2.19
pycurl==7.43.0.4
pyOpenSSL==19.1.0
pytest==5.4.1
six==1.14.0
soupsieve==1.9.5
Werkzeug==0.16.0

run

@@ -0,0 +1,33 @@
#!/bin/bash
# Usage:
#   ./run       # Runs the full web app
#   ./run test  # Runs the testing suite

SCRIPT=`realpath $0`
SCRIPT_DIR=`dirname $SCRIPT`

# Set default port if unavailable
if [[ -z "${PORT}" ]]; then
    PORT=5000
fi

# Set directory to serve static content from
[[ ! -z $1 ]] && SUBDIR="$1" || SUBDIR="app"
export APP_ROOT=$SCRIPT_DIR/$SUBDIR
export STATIC_FOLDER=$APP_ROOT/static
mkdir -p $STATIC_FOLDER

# Create default config json if it doesn't exist
if [[ ! -f $STATIC_FOLDER/config.json ]]; then
    echo "{}" > $STATIC_FOLDER/config.json
fi

pkill flask

# Check for regular vs test run
if [[ $SUBDIR == "test" ]]; then
    pytest -sv
else
    flask run --host="0.0.0.0" --port=$PORT
fi

@@ -1,17 +0,0 @@
#!/bin/bash
SCRIPT=`realpath $0`
SCRIPT_DIR=`dirname $SCRIPT`

if [[ -z "${PORT}" ]]; then
    PORT=5000
fi

# Create config json if it doesn't exist
if [[ ! -f $SCRIPT_DIR/app/static/config.json ]]; then
    echo "{}" > $SCRIPT_DIR/app/static/config.json
fi

pkill flask
flask run --host="0.0.0.0" --port=$PORT

@@ -1 +0,0 @@
from app import app

@@ -0,0 +1,8 @@
from app import app
import pytest


@pytest.fixture
def client():
    client = app.test_client()
    yield client

@@ -0,0 +1,54 @@
from bs4 import BeautifulSoup
from app.filter import Filter
import json
from datetime import datetime
from dateutil.parser import *
from test.conftest import client


def get_search_results(data):
    soup = Filter().clean(BeautifulSoup(data, 'html.parser'))

    main_divs = soup.find('div', {'id': 'main'})
    assert len(main_divs) > 1

    result_divs = []
    for div in main_divs:
        # Result divs should only have 1 inner div
        if len(list(div.children)) != 1 or not div.findChild() or 'div' not in div.findChild().name:
            continue

        result_divs.append(div)

    return result_divs
def test_search_results(client):
    rv = client.get('/search?q=test')
    assert rv._status_code == 200
    assert len(get_search_results(rv.data)) == 10


def test_recent_results(client):
    times = {
        'pastyear': 365,
        'pastmonth': 31,
        'pastweek': 7
    }

    for time, num_days in times.items():
        rv = client.get('/search?q=test%20%3A' + time)
        result_divs = get_search_results(rv.data)

        current_date = datetime.now()
        for div in result_divs:
            date_span = div.find('span').decode_contents()
            if not date_span or len(date_span) > 15:
                continue

            try:
                date = parse(date_span)
                assert (current_date - date).days < num_days
            except ParserError:
                assert ' ago' in date_span

@@ -0,0 +1,30 @@
import json
from test.conftest import client

demo_config = {
    'near': 'Seattle',
    'dark_mode': 0,
    'nojs': 0
}


def test_main(client):
    rv = client.get('/')
    assert rv._status_code == 200


def test_search(client):
    rv = client.get('/search?q=test')
    assert rv._status_code == 200


def test_config(client):
    rv = client.post('/config', data=json.dumps(demo_config))
    assert rv._status_code == 200

    rv = client.get('/config')
    assert rv._status_code == 200

    config = json.loads(rv.data)
    for key in demo_config.keys():
        assert config[key] == demo_config[key]