whoogle-search/test/test_results.py

from bs4 import BeautifulSoup
from app.filter import Filter
from app.utils.session_utils import generate_user_keys
from datetime import datetime
from dateutil.parser import parse, ParserError


def get_search_results(data):
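    """Parse a rendered search page and return its individual result divs.

    The raw response is cleaned through app.filter.Filter using a fresh set
    of session keys, then the divs under the #main container that look like
    single results are collected.
    """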
    secret_key = generate_user_keys()
    soup = Filter(user_keys=secret_key).clean(
        BeautifulSoup(data, 'html.parser'))

    main_divs = soup.find('div', {'id': 'main'})
    assert len(main_divs) > 1

    result_divs = []
    for div in main_divs:
        # Result divs should only have 1 inner div
        if (len(list(div.children)) != 1
                or not div.findChild()
                or 'div' not in div.findChild().name):
            continue

        result_divs.append(div)

    return result_divs


def test_get_results(client):
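    """A plain GET search should return a full page (10-15) of result divs."""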
    rv = client.get('/search?q=test')
    assert rv._status_code == 200

    # Depending on the search, there can be more
    # than 10 result divs
    assert len(get_search_results(rv.data)) >= 10
    assert len(get_search_results(rv.data)) <= 15


def test_post_results(client):
    rv = client.post('/search', data=dict(q='test'))
    assert rv._status_code == 200

    # Depending on the search, there can be more
    # than 10 result divs
    assert len(get_search_results(rv.data)) >= 10
    assert len(get_search_results(rv.data)) <= 15


# TODO: Unit test the site alt method instead -- the results returned
# are too unreliable for this test in particular.
# def test_site_alts(client):
#     rv = client.post('/search', data=dict(q='twitter official account'))
#     assert b'twitter.com/Twitter' in rv.data
#
#     client.post('/config', data=dict(alts=True))
#     assert json.loads(client.get('/config').data)['alts']
#
#     rv = client.post('/search', data=dict(q='twitter official account'))
#     assert b'twitter.com/Twitter' not in rv.data
#     assert b'nitter.net/Twitter' in rv.data


def test_recent_results(client):
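    """Queries with a ':past ...' suffix should only surface recent results.

    Each result's visible date (when one is present and parseable) should
    fall within the requested window, give or take a few days of slack.
    """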
    times = {
        'past year': 365,
        'past month': 31,
        'past week': 7
    }

    for time, num_days in times.items():
        rv = client.post('/search', data=dict(q='test :' + time))
        result_divs = get_search_results(rv.data)

        current_date = datetime.now()
        for div in [_ for _ in result_divs if _.find('span')]:
            date_span = div.find('span').decode_contents()
            if not date_span or len(date_span) > 15 or len(date_span) < 7:
                continue

            try:
                date = parse(date_span)
                # Date can have a little bit of wiggle room
                assert (current_date - date).days <= (num_days + 5)
            except ParserError:
                pass