From de7822cd37917190a72b44c69b92351c9cfb19b1 Mon Sep 17 00:00:00 2001
From: emijrp
Date: Thu, 24 May 2018 13:28:12 +0200
Subject: [PATCH] duckduckgo parser; remove .zip after upload

---
Reviewer notes (this section is ignored by git-am):
- This patch was re-flowed from a whitespace-collapsed copy. Indentation of
  the main() context/added lines is inferred, not recovered; if a hunk is
  rejected try `git apply --ignore-whitespace` and verify the indentation
  of the lines added to main() afterwards.
- duckduckgo(): added a docstring, replaced the bare `except:` with
  `except Exception:` (so Ctrl-C is not reported as a search error),
  tracked seen wikis in a set, and dropped the unused loop variable.
  Hunk offsets and the diffstat were updated accordingly; the post-image
  blob id on the index line still refers to the original revision.
- main(): os.remove(wikizip) runs without checking the exit status of the
  `ia upload` call; consider deleting the .zip only when the upload
  succeeded.

 wikispaces.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/wikispaces.py b/wikispaces.py
index 76f85c7..6f063fa 100644
--- a/wikispaces.py
+++ b/wikispaces.py
@@ -21,6 +21,7 @@
 import csv
 import datetime
 import os
+import random
 import re
 import subprocess
 import sys
@@ -224,6 +225,46 @@ python3 wikispaces.py https://mywiki.wikispaces.com --upload
     print(helptext)
     sys.exit()
 
+def duckduckgo():
+    """Yield wikispaces.com wiki URLs discovered through DuckDuckGo searches.
+
+    Runs up to 99999 searches on DuckDuckGo's HTML endpoint, each for two
+    random numbers restricted to site:wikispaces.com, and yields every
+    wiki URL ('https://<name>.wikispaces.com', with a leading 'www.'
+    removed) the first time it is seen. Sleeps 5-20 seconds between
+    searches and 30 seconds after a failed one.
+
+    Note: installs a process-wide urllib opener with a browser-like
+    User-agent header.
+    """
+    opener = urllib.request.build_opener()
+    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
+    urllib.request.install_opener(opener)
+
+    wikis = set()  # wikis already yielded; a set keeps membership tests O(1)
+    for _ in range(1, 100000):
+        url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikispaces.com' % (random.randint(100, 5000), random.randint(1000, 9999))
+        print('URL search', url)
+        try:
+            html = urllib.request.urlopen(url).read().decode('utf-8')
+        except Exception:
+            # Best-effort crawl: back off and retry with a new query, but
+            # let KeyboardInterrupt/SystemExit through so Ctrl-C works.
+            print('Search error')
+            time.sleep(30)
+            continue
+        html = urllib.parse.unquote(html)
+        m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
+        for wiki in m:
+            wiki = 'https://' + wiki
+            wiki = re.sub(r'https://www\.', 'https://', wiki)
+            if wiki not in wikis:
+                wikis.add(wiki)
+                yield wiki
+        sleep = random.randint(5,20)
+        print('Sleeping %d seconds' % (sleep))
+        time.sleep(sleep)
+
 def main():
     upload = False
     isadmin = False
@@ -249,6 +290,10 @@ def main():
     wikilist = []
     if '://' in param:
         wikilist.append(param.rstrip('/'))
+    elif param.lower() == 'duckduckgo':
+        wikilist = duckduckgo()
+        #for wiki in wikilist:
+        #    print(wiki)
     else:
         with open(param, 'r') as f:
             wikilist = f.read().strip().splitlines()
@@ -350,6 +395,7 @@ def main():
                 itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''
                 subprocess.call('ia' + ' upload %s %s %s --metadata="mediatype:web" --metadata="collection:%s" --metadata="title:%s" --metadata="description:%s" --metadata="language:%s" --metadata="last-updated-date:%s" --metadata="originalurl:%s" %s %s' % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemoriginalurl, itemlicenseurl and '--metadata="licenseurl:%s"' % (itemlicenseurl) or '', itemtags_), shell=True)
                 print('You can find it in https://archive.org/details/%s' % (itemid))
+                os.remove(wikizip)
 
 if __name__ == "__main__":
     main()