pull/346/head
emijrp 5 years ago
commit aecee2dc53

.gitattributes

@ -0,0 +1,2 @@
*.com linguist-vendored
*.org linguist-vendored

@ -4,3 +4,5 @@ install:
- pip install tox
script:
- tox
notifications:
email: false

@ -1,7 +1,7 @@
# WikiTeam
### We archive wikis, from Wikipedia to the tiniest wikis
**WikiTeam software is a set of tools for archiving wikis.** They work on MediaWiki wikis, but we want to expand to other wiki engines. As of January 2016, WikiTeam has preserved more than [27,000 stand-alone wikis](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups), several wikifarms, regular Wikipedia dumps and [34 TB of Wikimedia Commons images](https://archive.org/details/wikimediacommons).
**WikiTeam software is a set of tools for archiving wikis.** They work on MediaWiki wikis, but we want to expand to other wiki engines. As of 2019, WikiTeam has preserved more than [250,000 wikis](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups), several wikifarms, regular Wikipedia dumps and [34 TB of Wikimedia Commons images](https://archive.org/details/wikimediacommons).
There are [thousands](http://wikiindex.org) of [wikis](https://wikiapiary.com) on the Internet. Every day some of them stop being publicly available and, for lack of backups, are lost forever. Millions of people download tons of media files (movies, music, books, etc.) from the Internet, serving as a kind of distributed backup. Wikis, most of them under free licenses, disappear from time to time because nobody grabbed a copy of them. That is a shame that we would like to fix.
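A typical invocation (a minimal, illustrative example; the wiki URL is hypothetical, and the flags mirror the `dumpgenerator.py` call used by `launcher.py` later in this diff):

`./dumpgenerator.py --api=http://wiki.example.org/w/api.php --xml --images`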

File diff suppressed because it is too large

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# dumpgenerator.py A generator of dumps for wikis
# Copyright (C) 2011-2016 WikiTeam developers
# Copyright (C) 2011-2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -20,7 +20,7 @@
# https://github.com/WikiTeam/wikiteam/wiki
try:
from kitchen.text.converters import getwriter
from kitchen.text.converters import getwriter, to_unicode
except ImportError:
print "Please install the kitchen module."
import cookielib
@ -39,17 +39,31 @@ except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import os
import re
import subprocess
try:
import requests
except ImportError:
print "Please install or update the Requests module."
sys.exit(1)
try:
import wikitools
except ImportError:
print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
try:
from lxml import etree
from lxml.builder import E
except ImportError:
print "Please install the lxml module if you want to use --xmlrevisions."
import time
import urllib
try:
from urlparse import urlparse, urlunparse
except ImportError:
from urllib.parse import urlparse, urlunparse
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
__VERSION__ = '0.3.0-alpha' # major, minor, micro: semver.org
__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
class PageMissingError(Exception):
def __init__(self, title, xml):
@ -150,7 +164,7 @@ def getNamespacesScraper(config={}, session=None):
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
r = session.post(
url=config['index'], data={'title': 'Special:Allpages'})
url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
raw = r.text
delay(config=config, session=session)
@ -187,33 +201,41 @@ def getNamespacesAPI(config={}, session=None):
if namespaces:
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
'format': 'json'},
timeout=30
)
result = getJSON(r)
delay(config=config, session=session)
try:
nsquery = result['query']['namespaces']
except KeyError:
print "Error: could not get namespaces from the API request"
print "HTTP %d" % r.status_code
print r.text
return None
if 'all' in namespaces:
namespaces = []
for i in result['query']['namespaces'].keys():
for i in nsquery.keys():
if int(i) < 0: # -1: Special, -2: Media, excluding
continue
namespaces.append(int(i))
namespacenames[int(i)] = result['query']['namespaces'][i]['*']
namespacenames[int(i)] = nsquery[i]['*']
else:
# check if those namespaces really exist in this wiki
namespaces2 = []
for i in result['query']['namespaces'].keys():
for i in nsquery.keys():
bi = i
i = int(i)
if i < 0: # -1: Special, -2: Media, excluding
continue
if i in namespaces:
namespaces2.append(i)
namespacenames[i] = result['query']['namespaces'][bi]['*']
namespacenames[i] = nsquery[bi]['*']
namespaces = namespaces2
else:
namespaces = [0]
@ -249,7 +271,7 @@ def getPageTitlesAPI(config={}, session=None):
retryCount = 0
while retryCount < config["retries"]:
try:
r = session.post(url=config['api'], data=params)
r = session.post(url=config['api'], data=params, timeout=30)
break
except ConnectionError as err:
print "Connection error: %s" % (str(err),)
@ -271,21 +293,27 @@ def getPageTitlesAPI(config={}, session=None):
apfrom = jsontitles['continue']['apcontinue']
elif 'apfrom' in jsontitles['continue']:
apfrom = jsontitles['continue']['apfrom']
# print apfrom
# print jsontitles
allpages = jsontitles['query']['allpages']
try:
allpages = jsontitles['query']['allpages']
except KeyError:
print "The allpages API returned nothing. Exit."
sys.exit(1)
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
for page in allpages:
yield page['title']
title = page['title']
titles.append(title)
yield title
c += len(allpages)
if len(titles) != len(set(titles)):
# probably we are in a loop, server returning dupe titles, stop
# it
print 'Probably a loop, finishing'
print 'Probably a loop, switching to next namespace. Duplicate title:'
print title
titles = list(set(titles))
apfrom = ''
@ -301,7 +329,7 @@ def getPageTitlesScraper(config={}, session=None):
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (
config['index'], namespace)
r = session.get(url=url)
r = session.get(url=url, timeout=30)
raw = r.text
raw = cleanHTML(raw)
@ -353,7 +381,7 @@ def getPageTitlesScraper(config={}, session=None):
# to avoid reload dupe subpages links
checked_suballpages.append(name)
delay(config=config, session=session)
r2 = session.get(url=url)
r2 = session.get(url=url, timeout=10)
raw2 = r2.text
raw2 = cleanHTML(raw2)
rawacum += raw2 # merge it after removed junk
@ -386,13 +414,11 @@ def getPageTitles(config={}, session=None):
titles = []
if 'api' in config and config['api']:
r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
test = getJSON(r)
if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
titles = getPageTitlesScraper(config=config, session=session)
else:
try:
titles = getPageTitlesAPI(config=config, session=session)
except:
print "Error: could not get page titles from the API"
titles = getPageTitlesScraper(config=config, session=session)
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
@ -412,7 +438,7 @@ def getPageTitles(config={}, session=None):
print '%d page titles loaded' % (c)
return titlesfilename
def getImageNames(config={}, session=None):
""" Get list of image names """
@ -436,39 +462,60 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
print config['api']
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
xml = None
try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
print 'Getting the XML header from the API'
r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
xml = r.json()['query']['export']['*']
if not xml:
r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
xml = r.text
except requests.exceptions.RetryError:
pass
else:
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
pass
try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'},
timeout=120
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml):
print 'XML export on this wiki is broken, quitting.'
logerror(u'XML export on this wiki is broken, quitting.')
sys.exit()
if config['xmlrevisions']:
# Try again the old way
print 'Export test via the API failed. Wiki too old? Trying without xmlrevisions.'
config['xmlrevisions'] = False
header, config = getXMLHeader(config=config, session=session)
else:
print 'XML export on this wiki is broken, quitting.'
logerror(u'XML export on this wiki is broken, quitting.')
sys.exit()
return header, config
@ -512,7 +559,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['pages'], wait)
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# limit = 1 from mother function)
@ -521,6 +568,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c >= maxretries:
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
if config['failfast']:
print "Exit, it will be for another time"
sys.exit()
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
@ -550,7 +600,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
r = session.post(url=config['index'], data=params, headers=headers)
r = session.post(url=config['index'], params=params, headers=headers, timeout=10)
handleStatusCode(r)
xml = fixBOM(r)
except requests.exceptions.ConnectionError as e:
@ -675,10 +725,9 @@ def cleanXML(xml=''):
def generateXMLDump(config={}, titles=[], start=None, session=None):
""" Generates a XML dump for a list of titles """
""" Generates a XML dump for a list of titles or from revision IDs """
# TODO: titles is now unused.
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header, config = getXMLHeader(config=config, session=session)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
@ -686,48 +735,189 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
config['curonly'] and 'current' or 'history')
xmlfile = ''
lock = True
if start:
print "Removing the last chunk of past XML dump: it is probably incomplete."
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
if config['xmlrevisions']:
print 'Retrieving the XML for every page from the beginning'
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
r_timestamp = r'<timestamp>([^<]+)</timestamp>'
for xml in getXMLRevisions(config=config, session=session):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, it's expected this may be less
print "%d more revisions exported" % numrevs
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
except AttributeError:
print "This wikitools module version is not working"
sys.exit()
else:
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
if start:
print "Removing the last chunk of past XML dump: it is probably incomplete."
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
xmlfile.write(footer)
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def getXMLRevisions(config={}, session=None, allpages=False):
site = wikitools.wiki.Wiki(config['api'])
if not 'all' in config['namespaces']:
namespaces = config['namespaces']
else:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
try:
for namespace in namespaces:
print "Trying to export all revisions from namespace %s" % namespace
arvparams = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 500,
'arvnamespace': namespace
}
if not config['curonly']:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
arvrequest = wikitools.api.APIRequest(site, arvparams)
results = arvrequest.queryGen()
for result in results:
for page in result['query']['allrevisions']:
yield makeXmlFromPage(page)
else:
# Just cycle through revision IDs and use the XML as is
arvparams['arvprop'] = 'ids'
arvrequest = wikitools.api.APIRequest(site, arvparams)
arvresults = arvrequest.queryGen()
for result in arvresults:
revids = []
for page in result['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
print "%d more revisions listed, until %s" % (len(revids), revids[-1])
exportparams = {
'action': 'query',
'revids': '|'.join(revids),
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
except KeyError:
print "Warning. Could not use allrevisions, wiki too old."
if config['curonly']:
for title in readTitles(config):
exportparams = {
'action': 'query',
'titles': title,
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
else:
for title in readTitles(config):
pparams = {
'action': 'query',
'titles': title,
'prop': 'revisions',
'rvlimit': 'max',
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
'rawcontinue': 'yes'
}
prequest = wikitools.api.APIRequest(site, pparams)
try:
results = prequest.query()
pages = results['query']['pages']
except KeyError:
raise PageMissingError(title, xml='')
for page in pages:
try:
xml = makeXmlFromPage(pages[page])
except PageMissingError:
logerror(
config=config,
text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
)
continue
yield xml
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
sys.exit()
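# Editor's sketch (not part of this patch): the same allrevisions + export
# flow expressed directly with the requests session imported above, for wikis
# where wikitools is unavailable. It assumes a MediaWiki 1.27+ api.php URL
# (the requirement stated in the --xmlrevisions help text) and the modern
# 'continue' continuation format; parameter names follow the code above.
def getXMLRevisionsSketch(api, session):
    arvparams = {'action': 'query', 'list': 'allrevisions',
                 'arvlimit': 50, 'arvprop': 'ids', 'format': 'json'}
    while True:
        result = session.get(api, params=arvparams, timeout=30).json()
        # Collect this batch's revision IDs, as the wikitools loop above does
        revids = [str(rev['revid'])
                  for page in result['query']['allrevisions']
                  for rev in page['revisions']]
        if revids:
            # action=query&export puts the XML in ['query']['export']['*'],
            # the same path getXMLHeader() reads above
            exported = session.get(api, params={
                'action': 'query', 'export': '1',
                'revids': '|'.join(revids), 'format': 'json'}, timeout=30).json()
            yield exported['query']['export']['*']
        if 'continue' in result:
            arvparams.update(result['continue'])
        else:
            break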
def makeXmlFromPage(page):
""" Output an XML document as a string from a page as in the API JSON """
try:
p = E.page(
E.title(page['title']),
E.ns(to_unicode(page['ns'])),
E.id(to_unicode(page['pageid'])),
)
for rev in page['revisions']:
revision = E.revision(
E.id(to_unicode(rev['revid'])),
E.parentid(to_unicode(rev['parentid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(to_unicode(rev['userid'])),
E.username(to_unicode(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
)
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
# The sha1 may not have been backfilled on older wikis, or may be missing for other reasons (Wikia).
if 'sha1' in rev:
revision.append(E.sha1(rev['sha1']))
p.append(revision)
except KeyError:
raise PageMissingError(page['title'], '')
return etree.tostring(p, pretty_print=True)
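# Editor's note, illustrative only: makeXmlFromPage() expects the page dict
# shape returned by prop=revisions / list=allrevisions above. A hypothetical
# minimal input (all values made up) would be:
#
#   page = {'title': u'Main Page', 'ns': 0, 'pageid': 1,
#           'revisions': [{'revid': 10, 'parentid': 0,
#                          'timestamp': u'2018-01-01T00:00:00Z',
#                          'userid': 1, 'user': u'Admin', 'comment': u'init',
#                          'size': 5, 'contentmodel': u'wikitext',
#                          'sha1': u'0123456789abcdef0123456789abcdef01234567',
#                          '*': u'Hello'}]}
#   makeXmlFromPage(page)  # -> a '<page>...</page>' string serialized by lxml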
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
@ -863,10 +1053,11 @@ def getImageNamesScraper(config={}, session=None):
# http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
r = session.post(
url=config['index'],
data={
params={
'title': 'Special:Imagelist',
'limit': limit,
'offset': offset})
'offset': offset},
timeout=30)
raw = r.text
delay(config=config, session=session)
# delicate wiki
@ -967,7 +1158,7 @@ def getImageNamesAPI(config={}, session=None):
'format': 'json',
'ailimit': 500}
# FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params)
r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1025,7 +1216,7 @@ def getImageNamesAPI(config={}, session=None):
'iiprop': 'user|url',
'format': 'json'}
# FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params)
r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1112,10 +1303,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
# saving description if any
try:
title = u'Image:%s' % (filename)
xmlfiledesc = getXMLFileDesc(
config=config,
title=title,
session=session) # use Image: for backwards compatibility
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
xmlfiledesc = r.text
else:
xmlfiledesc = getXMLFileDesc(
config=config,
title=title,
session=session) # use Image: for backwards compatibility
except PageMissingError:
xmlfiledesc = ''
logerror(
@ -1170,7 +1365,7 @@ def domain2prefix(config={}, session=None):
domain = config['index']
domain = domain.lower()
domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
domain = re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '', domain)
domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
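# Editor's note, illustrative only: the widened pattern also strips anything
# that follows index.php/api.php (query strings, page titles). For a
# hypothetical URL:
#   re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '',
#          'http://wiki.example.org/w/api.php?action=query')  # -> 'wiki.example.org/w'
# whereas the old pattern left '?action=query' behind; the substitutions above
# then reduce it to the prefix 'wikiexampleorg_w'.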
@ -1211,8 +1406,9 @@ def welcome():
message += ''
message += "\n"
message += "#" * 73
message += "\n"
message += "# Copyright (C) 2011-%d WikiTeam developers #\n" % (datetime.datetime.now().year)
message += """
# Copyright (C) 2011-2014 WikiTeam #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
@ -1299,7 +1495,9 @@ def getParameters(params=[]):
action='store_true',
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages')
help='store only the current version of pages')
groupDownload.add_argument('--xmlrevisions', action='store_true',
help='download all revisions from an API generator. MediaWiki 1.27+ only.')
groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump")
groupDownload.add_argument(
@ -1319,6 +1517,10 @@ def getParameters(params=[]):
'--get-wiki-engine',
action='store_true',
help="returns the wiki engine")
groupMeta.add_argument(
'--failfast',
action='store_true',
help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.")
args = parser.parse_args()
# print args
@ -1350,11 +1552,22 @@ def getParameters(params=[]):
print 'Using cookies from %s' % args.cookies
session = requests.Session()
try:
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
# Courtesy datashaman https://stackoverflow.com/a/35504626
__retries__ = Retry(total=5,
backoff_factor=2,
status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=__retries__))
session.mount('http://', HTTPAdapter(max_retries=__retries__))
except:
# Our urllib3/requests is too old
pass
session.cookies = cj
session.headers.update({'User-Agent': getUserAgent()})
if args.user and args.password:
session.auth = (args.user, args.password)
# session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
# check URLs
for url in [args.api, args.index, args.wiki]:
@ -1392,6 +1605,7 @@ def getParameters(params=[]):
retry = 0
maxretries = args.retries
retrydelay = 20
check = None
while retry < maxretries:
try:
check = checkAPI(api=api, session=session)
@ -1427,15 +1641,20 @@ def getParameters(params=[]):
session=session):
print 'index.php is OK'
else:
index = '/'.join(index.split('/')[:-1])
try:
index = '/'.join(index.split('/')[:-1])
except AttributeError:
index = None
if index and checkIndex(
index=index,
cookies=args.cookies,
session=session):
print 'index.php is OK'
else:
print 'Error in index.php, please, provide a correct path to index.php'
sys.exit(1)
print 'Error in index.php.'
if not args.xmlrevisions:
print 'Please, provide a correct path to index.php or use --xmlrevisions. Terminating.'
sys.exit(1)
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
@ -1483,10 +1702,12 @@ def getParameters(params=[]):
'curonly': args.curonly,
'date': datetime.datetime.now().strftime('%Y%m%d'),
'api': api,
'failfast': args.failfast,
'index': index,
'images': args.images,
'logs': False,
'xml': args.xml,
'xmlrevisions': args.xmlrevisions,
'namespaces': namespaces,
'exnamespaces': exnamespaces,
'path': args.path and os.path.normpath(args.path) or '',
@ -1520,18 +1741,23 @@ def checkAPI(api=None, session=None):
data={
'action': 'query',
'meta': 'siteinfo',
'format': 'json'}
'format': 'json'},
timeout=30
)
if r.url == api:
if r.status_code == 200:
break
else:
api = r.url
elif r.status_code < 400:
p = urlparse(r.url)
api = urlunparse([p.scheme, p.netloc, p.path, '', '', ''])
elif r.status_code > 400:
print "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
return False
if "MediaWiki API is not enabled for this site." in r.text:
return False
try:
result = getJSON(r)
index = None
if result['query']:
if result:
try:
index = result['query']['general']['server'] + \
result['query']['general']['script']
@ -1548,7 +1774,7 @@ def checkAPI(api=None, session=None):
def checkIndex(index=None, cookies=None, session=None):
""" Checking index.php availability """
r = session.post(url=index, data={'title': 'Special:Version'})
r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
raw = r.text
print 'Checking index.php...', index
# Workaround for issue 71
@ -1587,7 +1813,11 @@ def getJSON(request):
"""Strip Unicode BOM"""
if request.text.startswith(u'\ufeff'):
request.encoding = 'utf-8-sig'
return request.json()
try:
return request.json()
except:
# Maybe an older API version which did not return correct JSON
return {}
def fixBOM(request):
@ -1633,6 +1863,8 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
else:
print 'XML dump seems to be corrupted.'
reply = ''
if config['failfast']:
reply = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
if reply.lower() in ['yes', 'y']:
@ -1679,7 +1911,7 @@ def resumePreviousDump(config={}, other={}):
if lasttitle == '':
lasttitle=lasttitles.next()
except:
pass # probably file does not exists
lasttitle = '' # probably the file does not exist
if lasttitle == '--END--':
# titles list is complete
print 'Title list was completed in the previous session'
@ -1810,7 +2042,7 @@ def saveSpecialVersion(config={}, session=None):
else:
print 'Downloading Special:Version with extensions and other related info'
r = session.post(
url=config['index'], data={'title': 'Special:Version'})
url=config['index'], params={'title': 'Special:Version'}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@ -1825,14 +2057,13 @@ def saveIndexPHP(config={}, session=None):
print 'index.html exists, do not overwrite'
else:
print 'Downloading index.php (Main Page) as index.html'
r = session.post(url=config['index'], data={})
r = session.post(url=config['index'], params={}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
with open('%s/index.html' % (config['path']), 'w') as outfile:
outfile.write(raw.encode('utf-8'))
def saveSiteInfo(config={}, session=None):
""" Save a file with site info """
@ -1845,30 +2076,33 @@ def saveSiteInfo(config={}, session=None):
# MediaWiki 1.13+
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
'sinumberingroup': 1,
'format': 'json'})
'format': 'json'},
timeout=10)
# MediaWiki 1.11-1.12
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
'format': 'json'})
'format': 'json'},
timeout=10)
# MediaWiki 1.8-1.10
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces',
'format': 'json'})
'format': 'json'},
timeout=10)
result = getJSON(r)
delay(config=config, session=session)
with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
@ -1879,10 +2113,14 @@ def avoidWikimediaProjects(config={}, other={}):
""" Skip Wikimedia projects and redirect to the dumps website """
# notice about wikipedia dumps
url = ''
if config['api']:
url = url + config['api']
if config['index']:
url = url + config['index']
if re.findall(
r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
config['api'] +
config['index']):
url):
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
print 'Download the dumps from http://dumps.wikimedia.org'
if not other['force']:
@ -1895,9 +2133,9 @@ def getWikiEngine(url=''):
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url)
r = session.post(url=url, timeout=30)
if r.status_code == 405 or r.text == '':
r = session.get(url=url)
r = session.get(url=url, timeout=120)
result = r.text
wikiengine = 'Unknown'
@ -1980,7 +2218,7 @@ def mwGetAPIAndIndex(url=''):
index = ''
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url)
r = session.post(url=url, timeout=120)
result = r.text
# API
@ -2042,6 +2280,8 @@ def main(params=[]):
while not other['resume'] and os.path.isdir(config['path']):
print '\nWarning!: "%s" path exists' % (config['path'])
reply = ''
if config['failfast']:
reply = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input(
'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %

@ -6,12 +6,12 @@
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
@ -30,11 +30,11 @@ def main():
if len(sys.argv) < 2:
print 'python script.py file-with-apis.txt'
sys.exit()
print 'Reading list of APIs from', sys.argv[1]
wikis = open(sys.argv[1], 'r').read().splitlines()
print '%d APIs found' % (len(wikis))
for wiki in wikis:
print "#"*73
print "# Downloading", wiki
@ -42,17 +42,15 @@ def main():
wiki = wiki.lower()
# Make the prefix in standard way; api and index must be defined, not important which is which
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
# Check if compressed; in that case the dump was finished previously
compressed = False
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for f in filenames:
if f.startswith(prefix) and f.endswith('.7z'):
compressed = True
zipfilename = f
for f in os.listdir('.'):
if f.startswith(prefix) and f.endswith('.7z'):
compressed = True
zipfilename = f
break # stop searching, do not explore subdirectories
if compressed:
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
# Get the archive's file list.
@ -67,18 +65,17 @@ def main():
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
# TODO: Find a way like grep -q below without doing a 7z l multiple times?
continue
#download
started = False #was this wiki download started before? then resume
wikidir = ''
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for d in dirnames:
if d.startswith(prefix):
wikidir = d
started = True
for f in os.listdir('.'):
# Does not find numbered wikidumps nor verify that entries are directories
if f.startswith(prefix) and f.endswith('wikidump'):
wikidir = f
started = True
break # stop searching, do not explore subdirectories
# time.sleep(60)
# Uncomment the sleep above and add --delay=60 to the dumpgenerator.py calls below for broken wiki farms
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
@ -90,15 +87,14 @@ def main():
subprocess.call('./dumpgenerator.py --api=%s --xml --images --delay=1' % wiki, shell=True)
started = True
#save wikidir now
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for d in dirnames:
if d.startswith(prefix):
wikidir = d
for f in os.listdir('.'):
# Does not find numbered wikidumps nor verify that entries are directories
if f.startswith(prefix) and f.endswith('wikidump'):
wikidir = f
break # stop searching, do not explore subdirectories
prefix = wikidir.split('-wikidump')[0]
finished = False
if started and wikidir and prefix:
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
@ -107,7 +103,7 @@ def main():
finished = True
# You can also issue this on your working directory to find all incomplete dumps:
# tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"
#compress
if finished:
time.sleep(1)

@ -3048,7 +3048,7 @@ http://vai.uibk.ac.at/dadp/doku.php
http://vak.ru/doku.php
http://val.bmstu.ru/dokuwiki/doku.php
http://valk.mave.jp/doku.php
http://vancouver.hackspace.ca/doku.php
http://vanhack.ca/doku.php
http://vanets.vuse.vanderbilt.edu/dokuwiki/doku.php
http://vaslor.net/doku.php
http://vbraun.name/cms/doku.php
@ -4957,7 +4957,6 @@ http://www.minkhollow.ca/becker/doku.php
http://www.minkhollow.ca/mhf/doku.php
http://www.minkhollow.ca/MHF/doku.php
http://www.minkhollow.ca/Thesis07/doku.php
http://www.mirkosertic.de/doku.php
http://www.mirmer.su/wiki/doku.php
http://www.mixshare.com/wiki/doku.php
http://www.mixxx.org/wiki/doku.php

File diff suppressed because it is too large

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2014 WikiTeam developers
# Copyright (C) 2014-2017 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -26,9 +26,10 @@ def main():
url = 'https://meta.miraheze.org/wiki/Special:SiteMatrix'
r = requests.get(url, headers=headers)
raw = r.text
m = re.findall(ur'<tr><td><a href="https://([^>]+?)/">[^<]+</a></td></tr>', raw)
m = re.findall(ur'<tr><td>(<del>)?<a href="https://([^>]+?)/">[^<]+</a>', raw)
m.sort()
for i in m:
print 'https://' + i + '/w/api.php'
print 'https://' + i[1] + '/w/api.php'
if __name__ == '__main__':
main()

File diff suppressed because it is too large

@ -1,5 +1,5 @@
Wikifarm: https://meta.miraheze.org/wiki/Miraheze
Last update: 2015-09-29
Last update: 2017-06-30
Details:

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2014 WikiTeam developers
# Copyright (C) 2014-2017 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -27,6 +27,7 @@ def main():
r = requests.get(url, headers=headers)
raw = r.text
m = re.findall(ur'<li><a href=\'([^>]+?)/wiki/\'>', raw)
m.sort()
for i in m:
print i + '/w/api.php'

@ -2,8 +2,6 @@ http://24.neoseeker.com/w/api.php
http://aceattorney.neoseeker.com/w/api.php
http://advancewars.neoseeker.com/w/api.php
http://adventuretime.neoseeker.com/w/api.php
http://alanwake.neoseeker.com/w/api.php
http://alienbreed.neoseeker.com/w/api.php
http://animalcrossing.neoseeker.com/w/api.php
http://attackontitan.neoseeker.com/w/api.php
http://avatar.neoseeker.com/w/api.php
@ -17,9 +15,9 @@ http://boktai.neoseeker.com/w/api.php
http://bond.neoseeker.com/w/api.php
http://borderlands.neoseeker.com/w/api.php
http://boundbyflame.neoseeker.com/w/api.php
http://bravely.neoseeker.com/w/api.php
http://breathoffire.neoseeker.com/w/api.php
http://brink.neoseeker.com/w/api.php
http://bulletstorm.neoseeker.com/w/api.php
http://callofduty.neoseeker.com/w/api.php
http://castlecrashers.neoseeker.com/w/api.php
http://castlevania.neoseeker.com/w/api.php
@ -35,13 +33,10 @@ http://danganronpa.neoseeker.com/w/api.php
http://darksouls.neoseeker.com/w/api.php
http://deadisland.neoseeker.com/w/api.php
http://deadoralive.neoseeker.com/w/api.php
http://deadspace.neoseeker.com/w/api.php
http://deathnote.neoseeker.com/w/api.php
http://demonssouls.neoseeker.com/w/api.php
http://destiny.neoseeker.com/w/api.php
http://deusex.neoseeker.com/w/api.php
http://devilmaycry.neoseeker.com/w/api.php
http://diablo3.neoseeker.com/w/api.php
http://digimon.neoseeker.com/w/api.php
http://disgaea.neoseeker.com/w/api.php
http://doctorwho.neoseeker.com/w/api.php
@ -57,21 +52,17 @@ http://dynastywarriors.neoseeker.com/w/api.php
http://elderscrolls.neoseeker.com/w/api.php
http://endlessocean.neoseeker.com/w/api.php
http://evangelion.neoseeker.com/w/api.php
http://eveonline.neoseeker.com/w/api.php
http://fable.neoseeker.com/w/api.php
http://fairytail.neoseeker.com/w/api.php
http://fallout4.neoseeker.com/w/api.php
http://fallout.neoseeker.com/w/api.php
http://fallout4.neoseeker.com/w/api.php
http://familyguy.neoseeker.com/w/api.php
http://farcry.neoseeker.com/w/api.php
http://fatalfury.neoseeker.com/w/api.php
http://fifa.neoseeker.com/w/api.php
http://finalfantasy.neoseeker.com/w/api.php
http://fireemblem.neoseeker.com/w/api.php
http://footballmanager.neoseeker.com/w/api.php
http://formula1.neoseeker.com/w/api.php
http://forza.neoseeker.com/w/api.php
http://friends.neoseeker.com/w/api.php
http://fullmetalalchemist.neoseeker.com/w/api.php
http://futurama.neoseeker.com/w/api.php
http://fzero.neoseeker.com/w/api.php
@ -81,11 +72,9 @@ http://glee.neoseeker.com/w/api.php
http://godofwar.neoseeker.com/w/api.php
http://goldensun.neoseeker.com/w/api.php
http://granturismo.neoseeker.com/w/api.php
http://greysanatomy.neoseeker.com/w/api.php
http://growlanser.neoseeker.com/w/api.php
http://gta5.neoseeker.com/w/api.php
http://gta.neoseeker.com/w/api.php
http://guildwars2.neoseeker.com/w/api.php
http://gta5.neoseeker.com/w/api.php
http://guildwars.neoseeker.com/w/api.php
http://guitarhero.neoseeker.com/w/api.php
http://gundam.neoseeker.com/w/api.php
@ -106,7 +95,6 @@ http://inuyasha.neoseeker.com/w/api.php
http://jakdaxter.neoseeker.com/w/api.php
http://kairosoft.neoseeker.com/w/api.php
http://kidicarus.neoseeker.com/w/api.php
http://kingdomcome.neoseeker.com/w/api.php
http://kingdomhearts.neoseeker.com/w/api.php
http://kirby.neoseeker.com/w/api.php
http://knack.neoseeker.com/w/api.php
@ -115,8 +103,6 @@ http://layton.neoseeker.com/w/api.php
http://leagueoflegends.neoseeker.com/w/api.php
http://legendofdragoon.neoseeker.com/w/api.php
http://littlebigplanet.neoseeker.com/w/api.php
http://lmamanager.neoseeker.com/w/api.php
http://lordsofthefallen.neoseeker.com/w/api.php
http://lotr.neoseeker.com/w/api.php
http://mafia.neoseeker.com/w/api.php
http://magicalstarsign.neoseeker.com/w/api.php
@ -128,7 +114,6 @@ http://megaman.neoseeker.com/w/api.php
http://megamitensei.neoseeker.com/w/api.php
http://metalgear.neoseeker.com/w/api.php
http://metroid.neoseeker.com/w/api.php
http://mightandmagic.neoseeker.com/w/api.php
http://minecraft.neoseeker.com/w/api.php
http://monsterhunter.neoseeker.com/w/api.php
http://mortalkombat.neoseeker.com/w/api.php
@ -140,7 +125,6 @@ http://ncis.neoseeker.com/w/api.php
http://needforspeed.neoseeker.com/w/api.php
http://ninjagaiden.neoseeker.com/w/api.php
http://ninokuni.neoseeker.com/w/api.php
http://nintendogs.neoseeker.com/w/api.php
http://okami.neoseeker.com/w/api.php
http://onepiece.neoseeker.com/w/api.php
http://persona.neoseeker.com/w/api.php
@ -160,14 +144,12 @@ http://rockband.neoseeker.com/w/api.php
http://rpgmaker.neoseeker.com/w/api.php
http://runefactory.neoseeker.com/w/api.php
http://runescape.neoseeker.com/w/api.php
http://runesofmagic.neoseeker.com/w/api.php
http://sandbox.neoseeker.com/w/api.php
http://scottpilgrim.neoseeker.com/w/api.php
http://scrapmetal.neoseeker.com/w/api.php
http://scribblenauts.neoseeker.com/w/api.php
http://shadowofthecolossus.neoseeker.com/w/api.php
http://shadowrunreturns.neoseeker.com/w/api.php
http://shank.neoseeker.com/w/api.php
http://shenmue.neoseeker.com/w/api.php
http://simpsons.neoseeker.com/w/api.php
http://skate.neoseeker.com/w/api.php
@ -183,7 +165,6 @@ http://southpark.neoseeker.com/w/api.php
http://spiderman.neoseeker.com/w/api.php
http://spongebob.neoseeker.com/w/api.php
http://spyro.neoseeker.com/w/api.php
http://starbound.neoseeker.com/w/api.php
http://starcraft.neoseeker.com/w/api.php
http://starfox.neoseeker.com/w/api.php
http://stargate.neoseeker.com/w/api.php
@ -196,9 +177,7 @@ http://tales.neoseeker.com/w/api.php
http://tekken.neoseeker.com/w/api.php
http://terraria.neoseeker.com/w/api.php
http://thedarkness.neoseeker.com/w/api.php
http://thedivision.neoseeker.com/w/api.php
http://thelastofus.neoseeker.com/w/api.php
http://theorder.neoseeker.com/w/api.php
http://thesecretworld.neoseeker.com/w/api.php
http://thesims.neoseeker.com/w/api.php
http://thewarriors.neoseeker.com/w/api.php
@ -206,9 +185,7 @@ http://theworldendswithyou.neoseeker.com/w/api.php
http://thief.neoseeker.com/w/api.php
http://timesplitters.neoseeker.com/w/api.php
http://tonyhawk.neoseeker.com/w/api.php
http://torchlight2.neoseeker.com/w/api.php
http://toriko.neoseeker.com/w/api.php
http://transformers.neoseeker.com/w/api.php
http://twilight.neoseeker.com/w/api.php
http://twistedmetal.neoseeker.com/w/api.php
http://uncharted.neoseeker.com/w/api.php
@ -217,12 +194,9 @@ http://vivapinata.neoseeker.com/w/api.php
http://wakfu.neoseeker.com/w/api.php
http://warcraft.neoseeker.com/w/api.php
http://warhammer.neoseeker.com/w/api.php
http://wasteland2.neoseeker.com/w/api.php
http://watchdogs.neoseeker.com/w/api.php
http://whiteknightchronicles.neoseeker.com/w/api.php
http://wikiguides.neoseeker.com/w/api.php
http://witcher3.neoseeker.com/w/api.php
http://worldoftanks.neoseeker.com/w/api.php
http://wow.neoseeker.com/w/api.php
http://xenoblade.neoseeker.com/w/api.php
http://yugioh.neoseeker.com/w/api.php

@ -1,5 +1,5 @@
Wikifarm: http://neowiki.neoseeker.com/wiki/Main_Page
Last update: 2015-10-07
Last update: 2017-06-30
Details:

File diff suppressed because it is too large

@ -23,7 +23,7 @@ import subprocess
import re
from wikitools import wiki, api
def getlist(wikia, wkfrom = 1, wkto = 1000):
def getlist(wikia, wkfrom = 1, wkto = 100):
params = {'action': 'query', 'list': 'wkdomains', 'wkactive': '1', 'wkfrom': wkfrom, 'wkto': wkto,}
request = api.APIRequest(wikia, params)
return request.query()['query']['wkdomains']
@ -31,8 +31,9 @@ def getlist(wikia, wkfrom = 1, wkto = 1000):
def getall():
wikia = wiki.Wiki('http://community.wikia.com/api.php')
offset = 0
limit = 1000
limit = 100
domains = {}
empty = 0
# This API module has no query continuation facility
print 'Getting list of active domains...'
while True:
@ -40,13 +41,21 @@ def getall():
if list:
print offset
domains = dict(domains.items() + list.items() )
offset += 1000
empty = 0
else:
empty += 1
offset += limit
if empty > 100:
# Hopefully we don't have more than 10k wikis deleted in a row
break
return domains
def main():
domains = getall()
with open('wikia.com', 'w') as out:
out.write('\n'.join(str(domains[i]['domain']) for i in domains))
undumped = []
# Or we could iterate over each sublist while we get it?
for i in domains:
@ -55,21 +64,21 @@ def main():
print dbname
first = dbname[0]
# There are one-letter dbnames; the second letter is replaced by an underscore
# http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.gz
# http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.7z
try:
second = dbname[1]
except:
second = '_'
base = 'http://s3.amazonaws.com/wikia_xml_dumps/' + first + '/' \
+ first + second + '/' + dbname
full = base + '_pages_full.xml.gz'
full = base + '_pages_full.xml.7z'
print full
current = base + '_pages_current.xml.gz'
current = base + '_pages_current.xml.7z'
images = base + '_images.tar'
try:
#subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
# Use this instead, and comment out the next try, to only list.
subprocess.check_call(['curl', '-I', '--fail', full])
subprocess.call(['curl', '-I', '--fail', full])
except subprocess.CalledProcessError as e:
# We added --fail for this https://superuser.com/a/854102/283120
if e.returncode == 22:
@ -81,7 +90,9 @@ def main():
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images])
#except:
# pass
print '\n'.join(str(dump) for dump in undumped)
with open('wikia.com-unarchived', 'w+') as out:
out.write('\n'.join(str(domain) for domain in undumped))
if __name__ == '__main__':
main()
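# Editor's note, illustrative only: the S3 layout built above keys each dump on
# the first letter and first two letters of the dbname. For a hypothetical
# dbname 'muppet' the URLs checked would be:
#   http://s3.amazonaws.com/wikia_xml_dumps/m/mu/muppet_pages_full.xml.7z
#   http://s3.amazonaws.com/wikia_xml_dumps/m/mu/muppet_pages_current.xml.7z
#   http://s3.amazonaws.com/wikia_xml_dumps/m/mu/muppet_images.tar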

@ -0,0 +1,61 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.1')]
urllib.request.install_opener(opener)
wikis = []
for i in range(1, 100000):
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikidot.com' % (random.randint(100, 5000), random.randint(1000, 9999))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikidot\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikidot-duckduckgo.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https?://www\.', 'http://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

@ -0,0 +1,65 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
wikis = []
with open('wikidot-spider.txt', 'r') as f:
wikis = f.read().strip().splitlines()
for i in range(1, 1000000):
url = random.choice(wikis)
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikidot\.com)', html)
for wiki in m:
wiki = 'http://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikidot-spider.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https?://www\.', 'http://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(1,5)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

@ -0,0 +1,871 @@
http://007.wikidot.com
http://025002.wikidot.com
http://05centraal.wikidot.com
http://05command-ja.wikidot.com
http://05command.wikidot.com
http://05zentrale.wikidot.com
http://101.wikidot.com
http://16thfleet.wikidot.com
http://2012hoax.wikidot.com
http://56wrtg1150.wikidot.com
http://5edndwiki.wikidot.com
http://E-H-S.wikidot.com
http://F90in15Minutes.wikidot.com
http://Health-Matters.wikidot.com
http://Herbis.wikidot.com
http://INCL.wikidot.com
http://a4art.wikidot.com
http://abarrelfull.wikidot.com
http://academicwriting.wikidot.com
http://ad3.wikidot.com
http://admindevelopement.wikidot.com
http://advent-ro.wikidot.com
http://adventuresintherealms.wikidot.com
http://aepassociation.wikidot.com
http://aepsandbox.wikidot.com
http://afterthecomet-v2.wikidot.com
http://ageofascension.wikidot.com
http://ageofheroesmux.wikidot.com
http://airchairbuild.wikidot.com
http://albums-template.wikidot.com
http://alfamedia.wikidot.com
http://algadon.wikidot.com
http://alicebot.wikidot.com
http://alveslima-edu.wikidot.com
http://amawal.wikidot.com
http://amen.wikidot.com
http://amiii.wikidot.com
http://analyticscamp.wikidot.com
http://android0.wikidot.com
http://androidalchemy.wikidot.com
http://angarmegia-creadores.wikidot.com
http://angarmegia-publicaciones.wikidot.com
http://angarmegia-valores.wikidot.com
http://angarmegia.wikidot.com
http://angry-mage-games.wikidot.com
http://anime-planet.wikidot.com
http://apmoderneuro.wikidot.com
http://applebyandwyman.wikidot.com
http://aprendiendo.wikidot.com
http://aq-3d.wikidot.com
http://aqw-swf.wikidot.com
http://aqwwiki.wikidot.com
http://arcana.wikidot.com
http://arcblade.wikidot.com
http://artemachia.wikidot.com
http://artniyet.wikidot.com
http://asen.wikidot.com
http://asoh.wikidot.com
http://aspnet.wikidot.com
http://astrobhadauria.wikidot.com
http://astrobhadauria1414.wikidot.com
http://astroveda.wikidot.com
http://astroyogas.wikidot.com
http://asu-csf.wikidot.com
http://audioprodukcja.wikidot.com
http://avendar.wikidot.com
http://aviationknowledge.wikidot.com
http://avoidglow.wikidot.com
http://azentia.wikidot.com
http://babel-template.wikidot.com
http://backpharma.wikidot.com
http://backupstorage.wikidot.com
http://badwebcomics.wikidot.com
http://balchipedia.wikidot.com
http://barakus.wikidot.com
http://battlestargenesis.wikidot.com
http://bcp.wikidot.com
http://beadersresourceguide.wikidot.com
http://beargod.wikidot.com
http://benitachell-bowls-club.wikidot.com
http://bhg.wikidot.com
http://bibles.wikidot.com
http://bilbreyapwh.wikidot.com
http://biol-117.wikidot.com
http://biol252-biol319.wikidot.com
http://bioproject.wikidot.com
http://bisgmit.wikidot.com
http://blackbelt.wikidot.com
http://blackberrystorm.wikidot.com
http://blackmarches.wikidot.com
http://blank-template.wikidot.com
http://bleachitp.wikidot.com
http://blender0.wikidot.com
http://blender1.wikidot.com
http://blmodding.wikidot.com
http://blog-template.wikidot.com
http://blog.wikidot.com
http://blogs-template.wikidot.com
http://bloodborne.wikidot.com
http://bni-ine.wikidot.com
http://book-template.wikidot.com
http://booriley.wikidot.com
http://bootstrap-playground.wikidot.com
http://borderlands.wikidot.com
http://borradores-insurgencia-del-caos.wikidot.com
http://borradores-scp-es.wikidot.com
http://bozic-nation.wikidot.com
http://brmehta12.wikidot.com
http://brtff.wikidot.com
http://brydz.wikidot.com
http://bua581.wikidot.com
http://bua581beerworks.wikidot.com
http://bua581hallelibraryfinalproject.wikidot.com
http://bugs-template.wikidot.com
http://bugs.wikidot.com
http://burntlands.wikidot.com
http://bvs.wikidot.com
http://bx-community.wikidot.com
http://bzhlab.wikidot.com
http://c4fsharp.wikidot.com
http://calu.wikidot.com
http://campusconfidential.wikidot.com
http://cancer-control.wikidot.com
http://caosinsurgente.wikidot.com
http://carpenoctemstaff.wikidot.com
http://castleage.wikidot.com
http://caughtnotsleeping.wikidot.com
http://ccckmit.wikidot.com
http://ccpd.wikidot.com
http://cctest.wikidot.com
http://ccyms.wikidot.com
http://ccymsevangelization.wikidot.com
http://ccymsfoundations.wikidot.com
http://ccymsjustice.wikidot.com
http://ccymslounge.wikidot.com
http://ccymspastoral.wikidot.com
http://ccymspractices.wikidot.com
http://ccymsprayer.wikidot.com
http://ccymsprinciples.wikidot.com
http://ccymsskills.wikidot.com
http://ccymsstudents.wikidot.com
http://cdaworldhistory.wikidot.com
http://cellworld.wikidot.com
http://celtic-heroes.wikidot.com
http://cf-vanguard.wikidot.com
http://cgp.wikidot.com
http://chaoscomplexityineducation.wikidot.com
http://chat-template.wikidot.com
http://chatroom.wikidot.com
http://chavezbraintrust.wikidot.com
http://chcc.wikidot.com
http://chessvariants.wikidot.com
http://chimiex-bicaz.wikidot.com
http://ci-sandbox.wikidot.com
http://ci-visualdocuments.wikidot.com
http://ci-wiki.wikidot.com
http://circservices.wikidot.com
http://ciscotr.wikidot.com
http://cityofangels.wikidot.com
http://cleanias.wikidot.com
http://cmbeta.wikidot.com
http://coffeetime.wikidot.com
http://coffeetimex.wikidot.com
http://colbycriminaljustice.wikidot.com
http://columbiacity.wikidot.com
http://comando05.wikidot.com
http://comando05ptbr.wikidot.com
http://commandement-alpha.wikidot.com
http://commandemento5.wikidot.com
http://communicity.wikidot.com
http://communicity2010.wikidot.com
http://community-playground.wikidot.com
http://community.wikidot.com
http://computer0.wikidot.com
http://comux.wikidot.com
http://connorscampaigns.wikidot.com
http://connorscentral.wikidot.com
http://connorsgmnotes.wikidot.com
http://connorssettings.wikidot.com
http://consumerpsych2009.wikidot.com
http://convert.wikidot.com
http://copernicon.wikidot.com
http://corvidcollege.wikidot.com
http://corwyn.wikidot.com
http://cpp-wiki.wikidot.com
http://cquniversity.wikidot.com
http://crashfeverwikitw.wikidot.com
http://crimjobs2010-2011.wikidot.com
http://crm-iseg.wikidot.com
http://crm-template.wikidot.com
http://crosswindsgarou.wikidot.com
http://crypsis-net.wikidot.com
http://cs0.wikidot.com
http://cs1.wikidot.com
http://cs101c.wikidot.com
http://cs124project-2009.wikidot.com
http://csc180.wikidot.com
http://csi.wikidot.com
http://css-competition.wikidot.com
http://css-sandbox.wikidot.com
http://css.wikidot.com
http://css3.wikidot.com
http://css3themes.wikidot.com
http://cst133a.wikidot.com
http://ctwiki.wikidot.com
http://cuarteldelo5.wikidot.com
http://cubesat.wikidot.com
http://cuiltheory.wikidot.com
http://cunefa2.wikidot.com
http://cunefb2.wikidot.com
http://cunefc2.wikidot.com
http://cunefe2.wikidot.com
http://cyclods.wikidot.com
http://daeren.wikidot.com
http://darksouls.wikidot.com
http://darksouls2.wikidot.com
http://darksouls3.wikidot.com
http://dawnofanewage.wikidot.com
http://dcernst-teaching.wikidot.com
http://dcernst.wikidot.com
http://ddscat.wikidot.com
http://defa.wikidot.com
http://default-template.wikidot.com
http://defunct-elitequestworlds.wikidot.com
http://demonssouls.wikidot.com
http://denver.wikidot.com
http://desenvolvimentodejogos.wikidot.com
http://design-illustration.wikidot.com
http://destiny.wikidot.com
http://detailed-customer-management.wikidot.com
http://dndis.wikidot.com
http://docpl.wikidot.com
http://dokument-uz.wikidot.com
http://dotflow.wikidot.com
http://downsfolk.wikidot.com
http://dowodztwo.wikidot.com
http://dragon-trees.wikidot.com
http://dreamprogram.wikidot.com
http://dreamteam.wikidot.com
http://dresdenfiles.wikidot.com
http://ds09.wikidot.com
http://ds10.wikidot.com
http://ds2009a.wikidot.com
http://ds2010a.wikidot.com
http://dwd.wikidot.com
http://e-h-s.wikidot.com
http://earlychildhood.wikidot.com
http://eberronunlimited.wikidot.com
http://ecadmin.wikidot.com
http://ecctimeline.wikidot.com
http://echobazaar.wikidot.com
http://ecomind.wikidot.com
http://editor.wikidot.com
http://editora.wikidot.com
http://edmw.wikidot.com
http://educ400-401.wikidot.com
http://education-template.wikidot.com
http://efepereth.wikidot.com
http://eime.wikidot.com
http://eitriggcrafting.wikidot.com
http://ejs-in-india.wikidot.com
http://eldritch00.wikidot.com
http://elishapeterson.wikidot.com
http://elsirvale.wikidot.com
http://elunesjustice.wikidot.com
http://emchina2010.wikidot.com
http://enchantedbros.wikidot.com
http://encyclowiki.wikidot.com
http://energyclub.wikidot.com
http://energyclub4samvedna.wikidot.com
http://energyfuture.wikidot.com
http://eng1d1.wikidot.com
http://eng270.wikidot.com
http://epimreth.wikidot.com
http://epitome.wikidot.com
http://esperanto.wikidot.com
http://estudianteseconomiauned.wikidot.com
http://eventidemush.wikidot.com
http://everydaymagicalgirls.wikidot.com
http://evilhat.wikidot.com
http://execs.wikidot.com
http://exploringsciencewiki.wikidot.com
http://extrabees.wikidot.com
http://f650cs.wikidot.com
http://fairfieldproject.wikidot.com
http://falchionvalley.wikidot.com
http://fallout2online.wikidot.com
http://faq.wikidot.com
http://fearschemistry.wikidot.com
http://fed20.wikidot.com
http://feedback-template.wikidot.com
http://feedback.wikidot.com
http://fifa360.wikidot.com
http://fifabeapro360.wikidot.com
http://fightcorruption.wikidot.com
http://figmentregistry.wikidot.com
http://fillionempire.wikidot.com
http://finalfantasy14fr.wikidot.com
http://first-steps.wikidot.com
http://flyclear.wikidot.com
http://fmi.wikidot.com
http://fmiseria3.wikidot.com
http://fondationscp.wikidot.com
http://fondationscpsandbox.wikidot.com
http://fondazionescp.wikidot.com
http://fortean.wikidot.com
http://forum-template.wikidot.com
http://forum.wikidot.com
http://fourthwallgames.wikidot.com
http://fpt.wikidot.com
http://freevoddler.wikidot.com
http://fretsonfire.wikidot.com
http://futaba8fg.wikidot.com
http://gagetowngaming.wikidot.com
http://galacticunity.wikidot.com
http://game-maker.wikidot.com
http://gamedesign.wikidot.com
http://gamemaker.wikidot.com
http://gasbags.wikidot.com
http://gd28.wikidot.com
http://gdnd.wikidot.com
http://gdt2009.wikidot.com
http://gear-sandbox.wikidot.com
http://geararc.wikidot.com
http://genderbinary.wikidot.com
http://generals.wikidot.com
http://ginnungagap.wikidot.com
http://globalseminarhealth.wikidot.com
http://goddardtech.wikidot.com
http://gorszy.wikidot.com
http://greatestfilipino.wikidot.com
http://green-house.wikidot.com
http://guitarzero.wikidot.com
http://gurpswiki.wikidot.com
http://h205.wikidot.com
http://hackersderede.wikidot.com
http://halfmoonbay.wikidot.com
http://hammer-template.wikidot.com
http://handbook.wikidot.com
http://harvey-capital-lectures.wikidot.com
http://health-matters.wikidot.com
http://herbis.wikidot.com
http://heroes.wikidot.com
http://heroesmush.wikidot.com
http://heroesofalvena.wikidot.com
http://heroessincity.wikidot.com
http://hestia.wikidot.com
http://hfwiki.wikidot.com
http://hiddenprojectwiki.wikidot.com
http://himetop.wikidot.com
http://historynewmedia.wikidot.com
http://hkcentral.wikidot.com
http://hogwarts2092.wikidot.com
http://hopkinswhpg.wikidot.com
http://housegames.wikidot.com
http://hp-intothefire.wikidot.com
http://hrpg.wikidot.com
http://hscwizards.wikidot.com
http://hswiki.wikidot.com
http://html50.wikidot.com
http://iaac-readings.wikidot.com
http://iatkos.wikidot.com
http://ibhistory.wikidot.com
http://ibi-apedia.wikidot.com
http://ibiz.wikidot.com
http://ibmathstuff.wikidot.com
http://ibphysicsstuff.wikidot.com
http://ibstuffqa.wikidot.com
http://iceal.wikidot.com
http://idrumaaps.wikidot.com
http://ifs.wikidot.com
http://igen.wikidot.com
http://igor.wikidot.com
http://imocamp.wikidot.com
http://incl.wikidot.com
http://inctr-news.wikidot.com
http://inctr-palliative-care-handbook.wikidot.com
http://inctr.wikidot.com
http://indexhibit.wikidot.com
http://insomniacramblings.wikidot.com
http://installer.wikidot.com
http://insurrection-du-chaos-sandbox.wikidot.com
http://insurrection-du-chaos.wikidot.com
http://inter-irc.wikidot.com
http://internationalbatesoninstitute.wikidot.com
http://internetior.wikidot.com
http://involo.wikidot.com
http://ipr10.wikidot.com
http://ipr11.wikidot.com
http://ipr12.wikidot.com
http://iracing.wikidot.com
http://irc.wikidot.com
http://irongiant.wikidot.com
http://irunath.wikidot.com
http://is2216.wikidot.com
http://ischool.wikidot.com
http://isocentre.wikidot.com
http://issuetracker-template.wikidot.com
http://istar.wikidot.com
http://istb-winter2010.wikidot.com
http://istep-sandbox.wikidot.com
http://itb322uap.wikidot.com
http://ivm.wikidot.com
http://jakilinux.wikidot.com
http://java.wikidot.com
http://jayashree.wikidot.com
http://jccict.wikidot.com
http://johnmerritt.wikidot.com
http://join.wikidot.com
http://jquery-easyui.wikidot.com
http://jslibrary.wikidot.com
http://jsukfpsd.wikidot.com
http://kalgati.wikidot.com
http://kannadanudi.wikidot.com
http://karma-lab.wikidot.com
http://kdiprivateequity.wikidot.com
http://keramik.wikidot.com
http://kf59.wikidot.com
http://kfmapdb.wikidot.com
http://khaidoan.wikidot.com
http://kharon.wikidot.com
http://kindiy.wikidot.com
http://kingsway.wikidot.com
http://kingswayeap.wikidot.com
http://kingswayelem.wikidot.com
http://kingswayielts.wikidot.com
http://kingswayint.wikidot.com
http://kingswaypreint.wikidot.com
http://kingswayupper.wikidot.com
http://klps.wikidot.com
http://kmhouse.wikidot.com
http://kmk.wikidot.com
http://knightswrite.wikidot.com
http://kodo.wikidot.com
http://koty.wikidot.com
http://ksemoudania.wikidot.com
http://ladyhood66.wikidot.com
http://lafundacionscp.wikidot.com
http://languagearts8.wikidot.com
http://lapidaria.wikidot.com
http://lasthaiku.wikidot.com
http://latindictionary.wikidot.com
http://latmari.wikidot.com
http://leplouc.wikidot.com
http://lepszy.wikidot.com
http://level1wiki.wikidot.com
http://libevents.wikidot.com
http://liblivadia.wikidot.com
http://librarylab.wikidot.com
http://lightworks.wikidot.com
http://linux0.wikidot.com
http://livesupport.wikidot.com
http://lmtoelf.wikidot.com
http://loosepages.wikidot.com
http://ltt.wikidot.com
http://lulu.wikidot.com
http://m5snapoli.wikidot.com
http://ma4140.wikidot.com
http://machines-history.wikidot.com
http://machinima138.wikidot.com
http://mactutorial.wikidot.com
http://maegica.wikidot.com
http://magiamesterei.wikidot.com
http://mainframes.wikidot.com
http://majjhima.wikidot.com
http://makeyourbot.wikidot.com
http://malkavian.wikidot.com
http://managerzonemexico.wikidot.com
http://maratona.wikidot.com
http://marblehornets.wikidot.com
http://margopedia.wikidot.com
http://marketplace-template.wikidot.com
http://marvelreborn.wikidot.com
http://marvelrevolution.wikidot.com
http://masonic.wikidot.com
http://math453fall2008.wikidot.com
http://mathaerobics4samvedna.wikidot.com
http://mathonline.wikidot.com
http://mathroughguides.wikidot.com
http://mbitcoin.wikidot.com
http://mc-21.wikidot.com
http://mcdt25e.wikidot.com
http://me1065.wikidot.com
http://measurementcamp.wikidot.com
http://media.wikidot.com
http://miedzymorze.wikidot.com
http://minahaplo.wikidot.com
http://mis213-2.wikidot.com
http://mk2k.wikidot.com
http://mkworld.wikidot.com
http://mnprek-3.wikidot.com
http://monacobayweyr.wikidot.com
http://monobook-template.wikidot.com
http://monobook.wikidot.com
http://monodot-template.wikidot.com
http://morningside-genetics.wikidot.com
http://morningsidemicro.wikidot.com
http://morphopedics.wikidot.com
http://mpm.wikidot.com
http://mukesh381.wikidot.com
http://multiverse-crisis.wikidot.com
http://musicgames.wikidot.com
http://my-pride.wikidot.com
http://mybookworld.wikidot.com
http://myslimchatroom.wikidot.com
http://myvineyard.wikidot.com
http://nanorodsa.wikidot.com
http://nanorodthermo.wikidot.com
http://narutoitp.wikidot.com
http://narutomushrivalry.wikidot.com
http://nauticoamager.wikidot.com
http://neo-dimension.wikidot.com
http://neosteam.wikidot.com
http://neozone.wikidot.com
http://newapprequirements.wikidot.com
http://news.wikidot.com
http://nightskysymbology.wikidot.com
http://nimin.wikidot.com
http://ninjaproxy.wikidot.com
http://nirn.wikidot.com
http://nnhs-science-restrictedaccess.wikidot.com
http://nnhs-science.wikidot.com
http://noblebeastwars.wikidot.com
http://nomyslamps.wikidot.com
http://norron.wikidot.com
http://notebook-template.wikidot.com
http://notebooks.wikidot.com
http://nre509.wikidot.com
http://nsb.wikidot.com
http://ntumed96.wikidot.com
http://nucularelectronics.wikidot.com
http://o5command-int.wikidot.com
http://o5command-th.wikidot.com
http://oblivionshard.wikidot.com
http://offtopicarium.wikidot.com
http://old-template.wikidot.com
http://oneeleventwentyten.wikidot.com
http://opend6.wikidot.com
http://opensource-template.wikidot.com
http://opensuse.wikidot.com
http://oppt-sa.wikidot.com
http://oregonamhi.wikidot.com
http://osx86.wikidot.com
http://oversoulgame.wikidot.com
http://ozradonc.wikidot.com
http://packages.wikidot.com
http://pagi.wikidot.com
http://pandora-saga.wikidot.com
http://papercraft.wikidot.com
http://paperworks.wikidot.com
http://paradiserpg.wikidot.com
http://paradoxhaze.wikidot.com
http://paralelo.wikidot.com
http://parented.wikidot.com
http://passatb5.wikidot.com
http://pathtogolarion.wikidot.com
http://patriot-box-office.wikidot.com
http://patterns.wikidot.com
http://pbbg.wikidot.com
http://pcg.wikidot.com
http://pcif.wikidot.com
http://pedhemoncreview.wikidot.com
http://perchelinux.wikidot.com
http://pernworld.wikidot.com
http://personal-template.wikidot.com
http://petition-template.wikidot.com
http://pfcuq.wikidot.com
http://pfseconddarkness.wikidot.com
http://phikappatau.wikidot.com
http://philosophia.wikidot.com
http://philosophiesoflife.wikidot.com
http://photo-gallery-template.wikidot.com
http://phylo.wikidot.com
http://pl.wikidot.com
http://playstation3hacksandmods.wikidot.com
http://pofomultiquiz.wikidot.com
http://pogon.wikidot.com
http://polls.wikidot.com
http://porphyrarpg.wikidot.com
http://porsche.wikidot.com
http://pottersarmy.wikidot.com
http://predev.wikidot.com
http://private-template.wikidot.com
http://processexcel.wikidot.com
http://professorallred.wikidot.com
http://profiles.wikidot.com
http://project-template.wikidot.com
http://projects.wikidot.com
http://ps3indexhelp.wikidot.com
http://psi-ppwg.wikidot.com
http://psms.wikidot.com
http://psrboregon.wikidot.com
http://psyc101.wikidot.com
http://psychjobsearch.wikidot.com
http://psychotronicsdivision.wikidot.com
http://pt851.wikidot.com
http://puddincupcss.wikidot.com
http://puppet.wikidot.com
http://pw7890o.wikidot.com
http://pylint-messages.wikidot.com
http://qttabbar.wikidot.com
http://quiat.wikidot.com
http://r.wikidot.com
http://radonc.wikidot.com
http://railgunitp.wikidot.com
http://ravenmarches.wikidot.com
http://realestate-template.wikidot.com
http://redirect-template.wikidot.com
http://redsite.wikidot.com
http://renegadesofpw.wikidot.com
http://reshme.wikidot.com
http://reskitchen.wikidot.com
http://retrolegends.wikidot.com
http://retrowiki.wikidot.com
http://reykjavikmanifesto.wikidot.com
http://rhetoricalgoddess.wikidot.com
http://rmitvnim2007b.wikidot.com
http://roadmap.wikidot.com
http://roboticsclubucla.wikidot.com
http://roboticspedia.wikidot.com
http://rock-xproject.wikidot.com
http://rtd1261.wikidot.com
http://rxwiki.wikidot.com
http://s7s.wikidot.com
http://sacwwiki.wikidot.com
http://salamander724.wikidot.com
http://saludintegral.wikidot.com
http://samvedna.wikidot.com
http://sandboxscpfr.wikidot.com
http://sasana.wikidot.com
http://sasi555.wikidot.com
http://savagetidewithfiretrolls.wikidot.com
http://scala.wikidot.com
http://schoolsteachersparents.wikidot.com
http://schrijven.wikidot.com
http://scienceonlinelondon.wikidot.com
http://scion-mmp.wikidot.com
http://scp-et.wikidot.com
http://scp-field-work.wikidot.com
http://scp-foundation-origins.wikidot.com
http://scp-he.wikidot.com
http://scp-hu.wikidot.com
http://scp-int-sandbox.wikidot.com
http://scp-int.wikidot.com
http://scp-international.wikidot.com
http://scp-jp-admin.wikidot.com
http://scp-jp-archive.wikidot.com
http://scp-jp-sandbox2.wikidot.com
http://scp-jp-sandbox3.wikidot.com
http://scp-jp.wikidot.com
http://scp-ko-15c.wikidot.com
http://scp-kr.wikidot.com
http://scp-la.wikidot.com
http://scp-nd.wikidot.com
http://scp-nl.wikidot.com
http://scp-pl-sandbox.wikidot.com
http://scp-pl.wikidot.com
http://scp-pt-br.wikidot.com
http://scp-pt.wikidot.com
http://scp-ru.wikidot.com
http://scp-sandbox-3.wikidot.com
http://scp-sandbox-la.wikidot.com
http://scp-spqr.wikidot.com
http://scp-template.wikidot.com
http://scp-th-sandbox.wikidot.com
http://scp-th.wikidot.com
http://scp-tw.wikidot.com
http://scp-ukrainian.wikidot.com
http://scp-un.wikidot.com
http://scp-vn.wikidot.com
http://scp-wiki-cn.wikidot.com
http://scp-wiki-de.wikidot.com
http://scp-wiki.wikidot.com
http://scpalex-fh.wikidot.com
http://scpclassic.wikidot.com
http://scpexplained.wikidot.com
http://scpjp-fansite.wikidot.com
http://scpkoreahq.wikidot.com
http://scpminecraft.wikidot.com
http://scpsandbox-jp.wikidot.com
http://scpsandbox-pl.wikidot.com
http://scpsandbox-ua.wikidot.com
http://scpsandbox2.wikidot.com
http://scpsandboxbr.wikidot.com
http://scpsandboxcn.wikidot.com
http://scpsandboxde.wikidot.com
http://scpsandboxit.wikidot.com
http://scpsandboxnl.wikidot.com
http://scpvakfi.wikidot.com
http://scpvakfisandbox.wikidot.com
http://scpvnsandbox.wikidot.com
http://scratch4samvedna.wikidot.com
http://serpents-hand.wikidot.com
http://sfi.wikidot.com
http://sfugamedev.wikidot.com
http://shadow4e.wikidot.com
http://sharecokecodes.wikidot.com
http://shop.wikidot.com
http://sicurezzapubblica.wikidot.com
http://sidowegraty.wikidot.com
http://signaturbogen.wikidot.com
http://siluria.wikidot.com
http://simtrackipedia.wikidot.com
http://sistdig.wikidot.com
http://siteclone.wikidot.com
http://sky852751.wikidot.com
http://skyangel.wikidot.com
http://slaythespire.wikidot.com
http://sliscomps.wikidot.com
http://slownik-geologiczny.wikidot.com
http://small-steps.wikidot.com
http://smofficer.wikidot.com
http://smsalgebra.wikidot.com
http://sniktbub.wikidot.com
http://snippets.wikidot.com
http://snow-template.wikidot.com
http://snowleopard.wikidot.com
http://sociatecture.wikidot.com
http://sociatectureblog.wikidot.com
http://socjobs.wikidot.com
http://socjobs2011.wikidot.com
http://soctech.wikidot.com
http://softwarecraftsmanship.wikidot.com
http://solariapedia.wikidot.com
http://solodarydar.wikidot.com
http://solpadeinehelp.wikidot.com
http://sortibrige.wikidot.com
http://soulslore.wikidot.com
http://soymilkls.wikidot.com
http://sp1.wikidot.com
http://spambotdeathwall.wikidot.com
http://sparks.wikidot.com
http://sped.wikidot.com
http://splinterverse.wikidot.com
http://spolecznosc.wikidot.com
http://srm.wikidot.com
http://st-phelpers.wikidot.com
http://stallmanism.wikidot.com
http://standard-template.wikidot.com
http://starwarsmadness.wikidot.com
http://static.wikidot.com
http://steelandstone.wikidot.com
http://storychip.wikidot.com
http://string-theory.wikidot.com
http://studiocomments.wikidot.com
http://studiolynn.wikidot.com
http://suffadv.wikidot.com
http://summer350.wikidot.com
http://summerisle.wikidot.com
http://sunnybrook-academy.wikidot.com
http://superjet.wikidot.com
http://surreal64ce.wikidot.com
http://sw-gis.wikidot.com
http://swietomuzyki.wikidot.com
http://swwotc.wikidot.com
http://talesofhonor.wikidot.com
http://talkingpadproject.wikidot.com
http://task-management.wikidot.com
http://tasker.wikidot.com
http://tauren.wikidot.com
http://tech-racingcars.wikidot.com
http://techblog-template.wikidot.com
http://techcomm.wikidot.com
http://ten-sb.wikidot.com
http://terrasdeportugal.wikidot.com
http://tex.wikidot.com
http://textanalytics.wikidot.com
http://the-nexus.wikidot.com
http://theanarchstate.wikidot.com
http://theblightedworld.wikidot.com
http://thecollaboratory.wikidot.com
http://thegamerdome.wikidot.com
http://thekingkillerchronicle.wikidot.com
http://thelaststory.wikidot.com
http://themes.wikidot.com
http://thep-serc.wikidot.com
http://therafim.wikidot.com
http://therafimrpg.wikidot.com
http://thesimsonline.wikidot.com
http://theskyremains.wikidot.com
http://theunforgotten.wikidot.com
http://thewake.wikidot.com
http://theweird.wikidot.com
http://theweirdwest.wikidot.com
http://ti-iseg-t12.wikidot.com
http://ti-iseg-t19.wikidot.com
http://tibasicdev.wikidot.com
http://timidgirls.wikidot.com
http://tlug.wikidot.com
http://tlumaczenia.wikidot.com
http://tmduc.wikidot.com
http://tradewithsaint.wikidot.com
http://translate.wikidot.com
http://translators-forum.wikidot.com
http://trb-mux.wikidot.com
http://triathematician.wikidot.com
http://trueblood-dallas.wikidot.com
http://try.wikidot.com
http://ttu-dom.wikidot.com
http://tyf.wikidot.com
http://typesets.wikidot.com
http://ubmedicinefaqs.wikidot.com
http://ucsdgrads.wikidot.com
http://ukcw.wikidot.com
http://ultimatemutantsofgagetown.wikidot.com
http://umassenglishgrad.wikidot.com
http://uml.wikidot.com
http://underworldlarp.wikidot.com
http://uniofbeds.wikidot.com
http://urbanmobile.wikidot.com
http://uscta.wikidot.com
http://user-gemeinschaft.wikidot.com
http://usma387.wikidot.com
http://valeofcallus.wikidot.com
http://veritasbatheo.wikidot.com
http://videoart.wikidot.com
http://viotikoskosmos.wikidot.com
http://virtualwargamer.wikidot.com
http://viscomclass.wikidot.com
http://visual-records.wikidot.com
http://vitalusers.wikidot.com
http://vocaro.wikidot.com
http://vs-tcg.wikidot.com
http://vtls-vital.wikidot.com
http://vusb.wikidot.com
http://vwinterop.wikidot.com
http://vyprmedia.wikidot.com
http://w24.wikidot.com
http://wanderers-library-ko.wikidot.com
http://wanderers-library.wikidot.com
http://wanderers-sandbox.wikidot.com
http://warsztatywww.wikidot.com
http://web0.wikidot.com
http://webcomicauthority.wikidot.com
http://wfh.wikidot.com
http://whanethewhip.wikidot.com
http://whatever.wikidot.com
http://wherearethejoneses.wikidot.com
http://wikidot.com
http://wikiedresearch.wikidot.com
http://wikiethica.wikidot.com
http://wikim5s.wikidot.com
http://wikinorm.wikidot.com
http://wikiofscience.wikidot.com
http://wikirhye.wikidot.com
http://wikirmaphil.wikidot.com
http://wikistoriaenciclopedia.wikidot.com
http://wikitipsgr.wikidot.com
http://windycity.wikidot.com
http://wiwimush.wikidot.com
http://world.wikidot.com
http://wow-arrakis.wikidot.com
http://wpts.wikidot.com
http://wqa.wikidot.com
http://writ-111-office-hour-sign-up.wikidot.com
http://writingoneeleven.wikidot.com
http://wrtg1150.wikidot.com
http://wtg.wikidot.com
http://www-old.wikidot.com
http://wychwood.wikidot.com
http://xanadu.wikidot.com
http://y31.wikidot.com
http://ye-olde-music-industrapedia.wikidot.com
http://yo801106.wikidot.com
http://yyp.wikidot.com
http://zeroshell.wikidot.com
http://zmk.wikidot.com
http://zodiac-ffrpg.wikidot.com
http://zodiac-monster-manual.wikidot.com
http://zombiecafe.wikidot.com
http://zorya.wikidot.com

@ -0,0 +1,214 @@
arte
cine
lengua
literatura
matematicas
ingles
frances
aleman
ruso
idiomas
geografia
historia
secundaria
bachillerato
examen
examenes
profesor
educacion
profesores
historias
extremadura
andalucia
iberia
oceano
cultura
periodico
television
radio
italiano
polaco
chino
japones
coreano
musica
mozart
beethoven
asimov
newton
kilogramo
teoria
fisica
deporte
cancion
futbol
astronomia
telescopio
cuaderno
libro
texto
pizarra
descartes
galileo
fosiles
paisaje
fosil
paisajes
mar
oceano
espacio
meteorologia
nubes
religion
bandera
lengua
politica
biologia
quimica
medicina
tecnologia
diagrama
mapa
mapas
dibujos
pronunciacion
arquitectura
compositor
pintor
pintura
escultura
museo
biblioteca
museos
bibliotecas
enciclopedia
diccionario
filosofia
filosofos
feminismo
sociologia
leyes
coche
barco
avion
transporte
teatro
europa
america
africa
asia
oceania
australia
atlantico
mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,86 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
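# Discover wikispaces.com wikis by running randomized DuckDuckGo searches seeded with
# the words in words.txt, and keep the deduplicated results in wikispaces-duckduckgo.txt.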
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
words = []
with open('words.txt', 'r') as f:
words = f.read().strip().splitlines()
random.shuffle(words)
print('Loaded %d words from file' % (len(words)))
#words = words + ['%d' % (i) for i in range(1900, 1980, 10)]
wikis = []
with open('wikispaces-duckduckgo.txt', 'r') as f:
wikis = f.read().strip().splitlines()
wikis.sort()
print('Loaded %d wikis from file' % (len(wikis)))
for i in range(1, 100):
random.shuffle(words)
for word in words:
print('Word', word)
word_ = re.sub(' ', '+', word)
url = ''
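# Pick one of several query shapes at random so repeated searches surface different result pages.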
r = random.randint(0, 10)
if r == 0:
url = 'https://duckduckgo.com/html/?q=%s%%20site:wikispaces.com' % (word_)
elif r == 1:
url = 'https://duckduckgo.com/html/?q=%s%%20wikispaces.com' % (word_)
elif r == 2:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
elif r == 3:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (random.randint(100, 3000), word_)
else:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
sys.exit()
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikispaces-duckduckgo.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https://www\.', 'https://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

File diff suppressed because it is too large Load Diff

@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import random
import re
import time
import urllib2
@ -88,6 +89,8 @@ def getWikis(user):
return {}
def main():
sleep = 0.1
rand = 10
users = loadUsers()
wikis = loadWikis()
@ -112,11 +115,16 @@ def main():
c += 1
print 'Found %s new users' % (c)
if c > 0:
saveUsers(users)
users = loadUsers()
saveWikis(wikis)
time.sleep(1)
if random.randint(0,rand) == 0:
saveUsers(users)
users = loadUsers()
if random.randint(0,rand) == 0:
saveWikis(wikis)
time.sleep(sleep)
saveWikis(wikis)
wikis = loadWikis()
saveUsers(users)
users = loadUsers()
# find more wikis
print 'Scanning users for more wikis'
@ -133,10 +141,15 @@ def main():
c += 1
print 'Found %s new wikis' % (c)
if c > 0:
saveWikis(wikis)
wikis = loadWikis()
saveUsers(users)
time.sleep(1)
if random.randint(0,rand) == 0:
saveWikis(wikis)
wikis = loadWikis()
if random.randint(0,rand) == 0:
saveUsers(users)
time.sleep(sleep)
saveWikis(wikis)
wikis = loadWikis()
saveUsers(users)
users = loadUsers()
print '\nSummary:'

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,214 @@
arte
cine
lengua
literatura
matematicas
ingles
frances
aleman
ruso
idiomas
geografia
historia
secundaria
bachillerato
examen
examenes
profesor
educacion
profesores
historias
extremadura
andalucia
iberia
oceano
cultura
periodico
television
radio
italiano
polaco
chino
japones
coreano
musica
mozart
beethoven
asimov
newton
kilogramo
teoria
fisica
deporte
cancion
futbol
astronomia
telescopio
cuaderno
libro
texto
pizarra
descartes
galileo
fosiles
paisaje
fosil
paisajes
mar
oceano
espacio
meteorologia
nubes
religion
bandera
lengua
politica
biologia
quimica
medicina
tecnologia
diagrama
mapa
mapas
dibujos
pronunciacion
arquitectura
compositor
pintor
pintura
escultura
museo
biblioteca
museos
bibliotecas
enciclopedia
diccionario
filosofia
filosofos
feminismo
sociologia
leyes
coche
barco
avion
transporte
teatro
europa
america
africa
asia
oceania
australia
atlantico
mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria

@ -62,14 +62,14 @@ class TestDumpgenerator(unittest.TestCase):
tests = [
# Alone wikis
#['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],
['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
#['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],
# Editthis wikifarm
# It has a page view limit
# Gamepedia wikifarm
['http://dawngate.gamepedia.com/index.php', 'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'],
#['http://dawngate.gamepedia.com/index.php', 'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'],
# Neoseeker wikifarm
#['http://digimon.neoseeker.com/w/index.php', 'http://digimon.neoseeker.com/w/api.php', u'Ogremon card.png'],
@ -78,13 +78,13 @@ class TestDumpgenerator(unittest.TestCase):
#['http://mc.orain.org/w/index.php', 'http://mc.orain.org/w/api.php', u'Mojang logo.svg'],
# Referata wikifarm
['http://wikipapers.referata.com/w/index.php', 'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'],
#['http://wikipapers.referata.com/w/index.php', 'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'],
# ShoutWiki wikifarm
['http://commandos.shoutwiki.com/w/index.php', 'http://commandos.shoutwiki.com/w/api.php', u'Night of the Wolves loading.png'],
#['http://commandos.shoutwiki.com/w/index.php', 'http://commandos.shoutwiki.com/w/api.php', u'Night of the Wolves loading.png'],
# Wiki-site wikifarm
['http://minlingo.wiki-site.com/index.php', 'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'],
#['http://minlingo.wiki-site.com/index.php', 'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'],
# Wikkii wikifarm
# It seems offline
@ -146,8 +146,8 @@ class TestDumpgenerator(unittest.TestCase):
print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
tests = [
# Alone wikis
['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'],
['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],
['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'April Fools\' Day'],
#['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],
# Test old allpages API behaviour
#['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'],
@ -261,7 +261,11 @@ class TestDumpgenerator(unittest.TestCase):
]
for wiki, engine in tests:
print 'Testing', wiki
guess_engine = getWikiEngine(wiki)
try:
guess_engine = getWikiEngine(wiki)
except ConnectionError:
print "%s failed to load, skipping..." % (wiki)
continue
print 'Got: %s, expected: %s' % (guess_engine, engine)
self.assertEqual(guess_engine, engine)
@ -269,14 +273,14 @@ class TestDumpgenerator(unittest.TestCase):
print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73
tests = [
# Alone wikis
['http://archiveteam.org', 'http://archiveteam.org/api.php', 'http://archiveteam.org/index.php'],
['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],
['https://www.archiveteam.org', 'https://www.archiveteam.org/api.php', 'https://www.archiveteam.org/index.php'],
#['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],
# Editthis wikifarm
# It has a page view limit
# Gamepedia wikifarm
['http://dawngate.gamepedia.com', 'http://dawngate.gamepedia.com/api.php', 'http://dawngate.gamepedia.com/index.php'],
#['http://dawngate.gamepedia.com', 'http://dawngate.gamepedia.com/api.php', 'http://dawngate.gamepedia.com/index.php'],
# Neoseeker wikifarm
#['http://digimon.neoseeker.com', 'http://digimon.neoseeker.com/w/api.php', 'http://digimon.neoseeker.com/w/index.php'],
@ -288,7 +292,7 @@ class TestDumpgenerator(unittest.TestCase):
# ['http://wikipapers.referata.com', 'http://wikipapers.referata.com/w/api.php', 'http://wikipapers.referata.com/w/index.php'],
# ShoutWiki wikifarm
['http://commandos.shoutwiki.com', 'http://commandos.shoutwiki.com/w/api.php', 'http://commandos.shoutwiki.com/w/index.php'],
#['http://commandos.shoutwiki.com', 'http://commandos.shoutwiki.com/w/api.php', 'http://commandos.shoutwiki.com/w/index.php'],
# Wiki-site wikifarm
#['http://minlingo.wiki-site.com', 'http://minlingo.wiki-site.com/api.php', 'http://minlingo.wiki-site.com/index.php'],

@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import getopt
import argparse
import os
import re
import subprocess
@ -30,89 +31,41 @@ from internetarchive import get_item
import dumpgenerator
# Configuration goes here
# You need a file named keys.txt with access and secret keys, in two different lines
accesskey = open('keys.txt', 'r').readlines()[0].strip()
secretkey = open('keys.txt', 'r').readlines()[1].strip()
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = 'opensource'
# Nothing to change below
convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
listfile = sys.argv[1]
uploadeddumps = []
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
def getParameters(params=[]):
if not params:
params = sys.argv[2:]
config = {
'prune-directories': False,
'prune-wikidump': False,
'collection': collection,
'update': False,
}
#console params
try:
opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin", "update"])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
for o, a in opts:
if o in ("-h","--help"):
usage()
sys.exit()
elif o in ("--prune-directories"):
config['prune-directories'] = True
elif o in ("--prune-wikidump"):
config['prune-wikidump'] = True
elif o in ("--admin"):
config['collection'] = "wikiteam"
elif o in ("--update"):
config['update'] = True
return config
def usage():
""" """
print """uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help."""
def log(wiki, dump, msg):
f = open('uploader-%s.log' % (listfile), 'a')
def log(wiki, dump, msg, config={}):
f = open('uploader-%s.log' % (config.listfile), 'a')
f.write('\n%s;%s;%s' % (wiki, dump, msg))
f.close()
def upload(wikis, config={}):
def upload(wikis, config={}, uploadeddumps=[]):
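# Upload each wiki's -wikidump.7z / -history.xml.7z files from the dump directory to its
# archive.org item, skipping (and optionally pruning) dumps already logged as uploaded.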
headers = {'User-Agent': dumpgenerator.getUserAgent()}
dumpdir = config.wikidump_dir
filelist = os.listdir(dumpdir)
for wiki in wikis:
print "#"*73
print "# Uploading", wiki
print "#"*73
wiki = wiki.lower()
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
configtemp = config
try:
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
except KeyError:
print "ERROR: could not produce the prefix for %s" % wiki
config = configtemp
wikiname = prefix.split('-')[0]
dumps = []
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for f in filenames:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
dumps.append(f)
for f in filelist:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
print "%s found" % f
dumps.append(f)
break
c = 0
@ -120,30 +73,33 @@ def upload(wikis, config={}):
wikidate = dump.split('-')[1]
item = get_item('wiki-' + wikiname)
if dump in uploadeddumps:
if config['prune-directories']:
if config.prune_directories:
rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
# With -f the deletion might have happened before and we won't know
if not os.system(rmline):
print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
if config.prune_wikidump and dump.endswith('wikidump.7z'):
# Simplistic quick&dirty check for the presence of this file in the item
stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
print "Checking content in previously uploaded files"
stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
dumphash = re.sub(' +.+\n?', '', stdout)
if dumphash in map(lambda x: x['md5'], item.files):
log(wiki, dump, 'verified')
rmline='rm -rf %s' % dump
log(wiki, dump, 'verified', config)
rmline='rm -rf %s' % dumpdir + '/' + dump
if not os.system(rmline):
print 'DELETED ' + dump
print 'DELETED ' + dumpdir + '/' + dump
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print 'ERROR: The online item misses ' + dump
log(wiki, dump, 'missing')
log(wiki, dump, 'missing', config)
# We'll exit this if and go upload the dump
else:
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print '%s was not uploaded before' % dump
time.sleep(0.1)
wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
@ -155,7 +111,7 @@ def upload(wikis, config={}):
# Logo path
logourl = ''
if ismissingitem or config['update']:
if ismissingitem or config.update:
#get metadata from api.php
#first sitename and base url
params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
@ -163,7 +119,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -198,7 +154,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -214,7 +170,7 @@ def upload(wikis, config={}):
raw = ''
try:
f = urllib.urlopen(baseurl)
f = urllib.urlopen(baseurl, timeout=10)
raw = f.read()
f.close()
except:
@ -238,7 +194,6 @@ def upload(wikis, config={}):
logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
except:
pass
print logourl
#retrieve some info from the wiki
wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
@ -264,7 +219,7 @@ def upload(wikis, config={}):
# Item metadata
md = {
'mediatype': 'web',
'collection': config['collection'],
'collection': config.collection,
'title': wikititle,
'description': wikidesc,
'language': lang,
@ -277,25 +232,54 @@ def upload(wikis, config={}):
#Upload files and update metadata
try:
item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
item.modify_metadata(md) # update
print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
uploadeddumps.append(dump)
log(wiki, dump, 'ok', config)
if logourl:
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl), timeout=10).read())
logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
uploadeddumps.append(dump)
log(wiki, dump, 'ok')
except:
print wiki, dump, 'error when uploading?'
except Exception as e:
print wiki, dump, 'Error when uploading?'
print e.message
c += 1
def main(params=[]):
config = getParameters(params=params)
parser = argparse.ArgumentParser("""uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help.""")
parser.add_argument('-pd', '--prune_directories', action='store_true')
parser.add_argument('-pw', '--prune_wikidump', action='store_true')
parser.add_argument('-a', '--admin', action='store_true')
parser.add_argument('-c', '--collection', default='opensource')
parser.add_argument('-wd', '--wikidump_dir', default='.')
parser.add_argument('-u', '--update', action='store_true')
parser.add_argument('listfile')
config = parser.parse_args()
if config.admin:
config.collection = 'wikiteam'
uploadeddumps = []
listfile = config.listfile
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
wikis = open(listfile, 'r').read().strip().splitlines()
upload(wikis, config)
upload(wikis, config, uploadeddumps)
if __name__ == "__main__":
main()

@ -24,7 +24,7 @@ def main():
site = pywikibot.Site('wikiapiary', 'wikiapiary')
catname = 'Category:Website'
cat = pywikibot.Category(site, catname)
gen = pagegenerators.CategorizedPageGenerator(cat, start='Spyropedia')
gen = pagegenerators.CategorizedPageGenerator(cat, start='!')
pre = pagegenerators.PreloadingGenerator(gen)
for page in pre:
@ -52,7 +52,8 @@ def main():
print('No API found in WikiApiary, skipping')
continue
urliasearch = 'https://archive.org/search.php?query=originalurl:"%s"' % (apiurl)
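# Search archive.org for both the api.php and index.php forms of the wiki URL.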
indexurl = 'index.php'.join(apiurl.rsplit('api.php', 1))
urliasearch = 'https://archive.org/search.php?query=originalurl:"%s" OR originalurl:"%s"' % (apiurl, indexurl)
f = urllib.request.urlopen(urliasearch)
raw = f.read().decode('utf-8')
if re.search(r'(?i)Your search did not match any items', raw):

@ -0,0 +1,458 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com
import csv
import datetime
import os
import random
import re
import subprocess
import sys
import time
import urllib.request
#from internetarchive import get_item
# Requirements:
# zip command (apt-get install zip)
# ia command (pip install internetarchive, and configured properly)
"""
# You need a file with access and secret keys, in two different lines
iakeysfilename = '%s/.iakeys' % (os.path.expanduser('~'))
if os.path.exists(iakeysfilename):
accesskey = open(iakeysfilename, 'r').readlines()[0].strip()
secretkey = open(iakeysfilename, 'r').readlines()[1].strip()
else:
print('Error, no %s file with S3 keys for Internet Archive account' % (iakeysfilename))
sys.exit()
"""
def saveURL(wikidomain='', url='', filename='', path='', overwrite=False, iteration=1):
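# Fetch url into <wikidomain>/<path>/<filename>; on errors retry with growing delays,
# and re-download pages whose content looks like a Wikispaces maintenance placeholder.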
filename2 = '%s/%s' % (wikidomain, filename)
if path:
filename2 = '%s/%s/%s' % (wikidomain, path, filename)
if os.path.exists(filename2):
if not overwrite:
print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
return
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
try:
urllib.request.urlretrieve(url, filename2)
except:
sleep = 10 # seconds
maxsleep = 30
while sleep <= maxsleep:
try:
print('Error while retrieving: %s' % (url))
print('Retry in %s seconds...' % (sleep))
time.sleep(sleep)
urllib.request.urlretrieve(url, filename2)
return
except:
sleep = sleep * 2
print('Download failed')
# Sometimes Wikispaces returns invalid data; re-download in those cases.
# Only for 'pages': 'files' binaries are a pain to open and check.
if (os.path.exists(filename2) and 'pages' in path) or \
(os.path.exists(filename2) and path == '' and filename2.split('.')[-1] in ['xml', 'html', 'csv']):
sleep2 = 60 * iteration
raw = ''
try:
with open(filename2, 'r', encoding='utf-8') as f:
raw = f.read()
except:
with open(filename2, 'r', encoding='latin-1') as f:
raw = f.read()
if re.findall(r'(?im)<title>TES and THE Status</title>', raw):
print('Warning: invalid content. Waiting %d seconds and re-downloading' % (sleep2))
time.sleep(sleep2)
saveURL(wikidomain=wikidomain, url=url, filename=filename, path=path, overwrite=overwrite, iteration=iteration+1)
def undoHTMLEntities(text=''):
""" Undo some HTML codes """
# i guess only < > & " ' need conversion
# http://www.w3schools.com/html/html_entities.asp
text = re.sub('&lt;', '<', text)
text = re.sub('&gt;', '>', text)
text = re.sub('&amp;', '&', text)
text = re.sub('&quot;', '"', text)
text = re.sub('&#039;', '\'', text)
return text
def convertHTML2Wikitext(wikidomain='', filename='', path=''):
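# Strip the HTML wrapper returned by /page/code/ and keep only the decoded wikitext.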
wikitext = ''
wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
if not os.path.exists(wikitextfile):
print('Error retrieving wikitext; the page is probably a redirect')
return
with open(wikitextfile, 'r') as f:
wikitext = f.read()
with open(wikitextfile, 'w') as f:
m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
if m:
try:
wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
wikitext = undoHTMLEntities(text=wikitext)
except:
pass
f.write(wikitext)
def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
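# Save a page's current revision as HTML and wikitext, plus a CSV with its edit history.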
pagenameplus = re.sub(' ', '+', pagename)
pagename_ = urllib.parse.quote(pagename)
#page current revision (html & wikitext)
pageurl = '%s/%s' % (wikiurl, pagename_)
filename = '%s.html' % (pagenameplus)
print('Downloading page: %s' % (filename))
saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
filename2 = '%s.wikitext' % (pagenameplus)
print('Downloading page: %s' % (filename2))
saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')
#csv with page history
csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
csvfilename = '%s.history.csv' % (pagenameplus)
print('Downloading page: %s' % (csvfilename))
saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)
def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
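# Save the full-resolution file plus a CSV with its history.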
filenameplus = re.sub(' ', '+', filename)
filename_ = urllib.parse.quote(filename)
#file full resolution
fileurl = '%s/file/view/%s' % (wikiurl, filename_)
filename = filenameplus
print('Downloading file: %s' % (filename))
saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)
#csv with file history
csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
csvfilename = '%s.history.csv' % (filenameplus)
print('Downloading file: %s' % (csvfilename))
saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)
def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
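# Fetch the wiki's content listing as CSV and download every page and file it lists.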
print('Downloading Pages and Files from %s' % (wikiurl))
#csv all pages and files
csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
#download every page and file
totallines = 0
with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
totallines = len(f.read().splitlines()) - 1
with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
filesc = 0
pagesc = 0
print('This wiki has %d pages and files' % (totallines))
rows = csv.reader(csvfile, delimiter=',', quotechar='"')
for row in rows:
if row[0] == 'file':
filesc += 1
filename = row[1]
downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
elif row[0] == 'page':
pagesc += 1
pagename = row[1]
downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
if (filesc + pagesc) % 10 == 0:
print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
print('Downloaded %d pages' % (pagesc))
print('Downloaded %d files' % (filesc))
def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
print('Downloading sitemap.xml')
saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)
def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
print('Downloading index.html')
saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)
def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
index = '%s/index.html' % (wikidomain)
if os.path.exists(index):
raw = ''
try:
with open(index, 'r', encoding='utf-8') as f:
raw = f.read()
except:
with open(index, 'r', encoding='latin-1') as f:
raw = f.read()
m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', raw)
if m:
logourl = m[0]
logofilename = logourl.split('/')[-1]
print('Downloading logo')
saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
return logofilename
return ''
def printhelp():
helptext = """This script downloads (and uploads) WikiSpaces wikis.
Parameters available:
--upload: upload compressed file with downloaded wiki
--admin: add item to WikiTeam collection (if you are an admin in that collection)
--overwrite: download again even if files exist locally
--overwrite-ia: upload again to Internet Archive even if item exists there
--help: prints this help text
Examples:
python3 wikispaces.py https://mywiki.wikispaces.com
It downloads that wiki
python3 wikispaces.py wikis.txt
It downloads a list of wikis (file format is a URL per line)
python3 wikispaces.py https://mywiki.wikispaces.com --upload
It downloads that wiki, compresses it and uploads it to the Internet Archive
"""
print(helptext)
sys.exit()
def duckduckgo():
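# Yield wikispaces.com wiki URLs discovered through randomized DuckDuckGo searches.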
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
wikis = []
ignorewikis = [
'https://wikispaces.com',
'https://www.wikispaces.com',
'https://wikispaces.net',
'https://www.wikispaces.net',
]
for i in range(1, 100000):
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikispaces.com' % (random.randint(100, 5000), random.randint(1000, 9999))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
wiki = re.sub(r'https://www\.', 'https://', wiki)
if not wiki in wikis and not wiki in ignorewikis:
wikis.append(wiki)
yield wiki
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
def main():
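# Parse options, build the list of wikis to archive, download each wiki (sitemap, index,
# pages, files, logo) and optionally zip the dump and upload it with the ia command-line tool.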
upload = False
isadmin = False
overwrite = False
overwriteia = False
if len(sys.argv) < 2:
printhelp()
param = sys.argv[1]
if not param:
printhelp()
if len(sys.argv) > 2:
if '--upload' in sys.argv:
upload = True
if '--admin' in sys.argv:
isadmin = True
if '--overwrite' in sys.argv:
overwrite = True
if '--overwrite-ia' in sys.argv:
overwriteia = True
if '--help' in sys.argv:
printhelp()
wikilist = []
if '://' in param:
wikilist.append(param.rstrip('/'))
elif param.lower() == 'duckduckgo':
wikilist = duckduckgo()
#for wiki in wikilist:
# print(wiki)
else:
with open(param, 'r') as f:
wikilist = f.read().strip().splitlines()
wikilist2 = []
for wiki in wikilist:
wikilist2.append(wiki.rstrip('/'))
wikilist = wikilist2
for wikiurl in wikilist:
wikidomain = wikiurl.split('://')[1].split('/')[0]
print('\n')
print('#'*40,'\n Downloading:', wikiurl)
print('#'*40,'\n')
if upload and not overwriteia:
itemid = 'wiki-%s' % (wikidomain)
try:
iahtml = ''
try:
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
except:
time.sleep(10)
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
if iahtml and not re.findall(r'(?im)Item cannot be found', iahtml):
if not overwriteia:
print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
print('You can find it in https://archive.org/details/%s' % (itemid))
time.sleep(1)
continue
except:
pass
dirfiles = '%s/files' % (wikidomain)
if not os.path.exists(dirfiles):
print('Creating directory %s' % (dirfiles))
os.makedirs(dirfiles)
dirpages = '%s/pages' % (wikidomain)
if not os.path.exists(dirpages):
print('Creating directory %s' % (dirpages))
os.makedirs(dirpages)
sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
if not os.path.exists('%s/sitemap.xml' % (wikidomain)):
print('Error, wiki was probably deleted. Skipping wiki...')
continue
else:
sitemapraw = ''
try:
with open('%s/sitemap.xml' % (wikidomain), encoding='utf-8') as g:
sitemapraw = g.read()
except:
with open('%s/sitemap.xml' % (wikidomain), encoding='latin-1') as g:
sitemapraw = g.read()
if re.search(r'(?im)<h1>This wiki has been deactivated</h1>', sitemapraw):
print('Error, wiki was deactivated. Skipping wiki...')
continue
downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
if not os.path.exists('%s/index.html' % (wikidomain)):
print('Error, wiki was probably deleted or expired. Skipping wiki...')
continue
else:
indexraw = ''
try:
with open('%s/index.html' % (wikidomain), encoding='utf-8') as g:
indexraw = g.read()
except:
with open('%s/index.html' % (wikidomain), encoding='latin-1') as g:
indexraw = g.read()
if re.search(r'(?im)<h1>Subscription Expired</h1>', indexraw):
print('Error, wiki subscription expired. Skipping wiki...')
continue
downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
if upload:
itemid = 'wiki-%s' % (wikidomain)
print('\nCompressing dump...')
wikidir = wikidomain
os.chdir(wikidir)
print('Changed directory to', os.getcwd())
wikizip = '%s.zip' % (wikidomain)
subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
os.chdir('..')
print('Changed directory to', os.getcwd())
print('\nUploading to Internet Archive...')
indexfilename = '%s/index.html' % (wikidir)
if not os.path.exists(indexfilename):
print('\nError: dump incomplete, skipping upload\n')
continue
indexhtml = ''
try:
with open(indexfilename, 'r', encoding='utf-8') as f:
indexhtml = f.read()
except:
with open(indexfilename, 'r', encoding='latin-1') as f:
indexhtml = f.read()
wikititle = ''
try:
wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
except:
wikititle = wikidomain
if not wikititle:
wikititle = wikidomain
wikititle = wikititle.replace("\\'", " ")
wikititle = wikititle.replace('\\"', " ")
itemtitle = 'Wiki - %s' % wikititle
itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
itemoriginalurl = wikiurl
itemlicenseurl = ''
m = ''
try:
m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
except:
m = ''
if m:
itemlicenseurl = m[0]
if not itemlicenseurl:
itemtags.append('unknowncopyright')
itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
itemcollection = isadmin and 'wikiteam' or 'opensource'
itemlang = 'Unknown'
itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''
callplain = "ia upload %s %s %s --metadata='mediatype:web' --metadata='collection:%s' --metadata='title:%s' --metadata='description:%s' --metadata='language:%s' --metadata='last-updated-date:%s' --metadata='originalurl:%s' %s %s" % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemoriginalurl, itemlicenseurl and "--metadata='licenseurl:%s'" % (itemlicenseurl) or '', itemtags_)
print(callplain)
subprocess.call(callplain, shell=True)
"""
md = {
'mediatype': 'web',
'collection': itemcollection,
'title': itemtitle,
'description': itemdesc,
'language': itemlang,
'last-updated-date': itemdate,
'subject': '; '.join(itemtags),
'licenseurl': itemlicenseurl,
'originalurl': itemoriginalurl,
}
item = get_item(itemid)
item.upload(wikizip, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
item.modify_metadata(md)
if itemlogo:
item.upload(itemlogo, access_key=accesskey, secret_key=secretkey, verbose=True)
"""
print('You can find it in https://archive.org/details/%s' % (itemid))
os.remove(wikizip)
if __name__ == "__main__":
main()

@ -228,7 +228,11 @@ def mwGetImageNamesAPI(config={}):
url = mwCurateImageURL(config=config, url=url)
# encoding to ascii is needed to work around this horrible bug:
# http://bugs.python.org/issue8136
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
if 'mwapi' in config and '.wikia.com' in config['mwapi']:
#to avoid latest?cb=20120816112532 in filenames
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')
else:
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
uploader = re.sub('_', ' ', image['user'])
imagenames.append([filename, url, uploader])
else:
