pull/346/head
emijrp 5 years ago
commit aecee2dc53

.gitattributes

@ -0,0 +1,2 @@
*.com linguist-vendored
*.org linguist-vendored

@ -4,3 +4,5 @@ install:
- pip install tox
script:
- tox
notifications:
email: false

@ -1,7 +1,7 @@
# WikiTeam
### We archive wikis, from Wikipedia to tiniest wikis
**WikiTeam software is a set of tools for archiving wikis.** They work on MediaWiki wikis, but we want to expand to other wiki engines. As of January 2016, WikiTeam has preserved more than [27,000 stand-alone wikis](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups), several wikifarms, regular Wikipedia dumps and [34 TB of Wikimedia Commons images](https://archive.org/details/wikimediacommons).
**WikiTeam software is a set of tools for archiving wikis.** They work on MediaWiki wikis, but we want to expand to other wiki engines. As of 2019, WikiTeam has preserved more than [250,000 wikis](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups), several wikifarms, regular Wikipedia dumps and [34 TB of Wikimedia Commons images](https://archive.org/details/wikimediacommons).
There are [thousands](http://wikiindex.org) of [wikis](https://wikiapiary.com) in the Internet. Every day some of them are no longer publicly available and, due to lack of backups, lost forever. Millions of people download tons of media files (movies, music, books, etc) from the Internet, serving as a kind of distributed backup. Wikis, most of them under free licenses, disappear from time to time because nobody grabbed a copy of them. That is a shame that we would like to solve.
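For context, dumpgenerator.py is the usual entry point for these tools. A minimal sketch of how it can be driven from Python, in the style of launcher.py further down in this changeset, follows; the wiki URL is a placeholder and only flags that already exist in this repository are used.

# Minimal sketch: drive dumpgenerator.py the way launcher.py does.
# The API URL below is a placeholder, not a real target.
import subprocess

wiki = 'https://wiki.example.org/w/api.php'
subprocess.call('./dumpgenerator.py --api=%s --xml --images --delay=1' % wiki, shell=True)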

File diff suppressed because it is too large

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# dumpgenerator.py A generator of dumps for wikis
- # Copyright (C) 2011-2016 WikiTeam developers
+ # Copyright (C) 2011-2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -20,7 +20,7 @@
# https://github.com/WikiTeam/wikiteam/wiki
try:
- from kitchen.text.converters import getwriter
+ from kitchen.text.converters import getwriter, to_unicode
except ImportError:
print "Please install the kitchen module."
import cookielib
@ -39,17 +39,31 @@ except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import os
import re
+ import subprocess
try:
import requests
except ImportError:
print "Please install or update the Requests module."
sys.exit(1)
try:
import wikitools
except ImportError:
print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
try:
from lxml import etree
from lxml.builder import E
except ImportError:
print "Please install the lxml module if you want to use --xmlrevisions."
import time
import urllib
+ try:
+ from urlparse import urlparse, urlunparse
+ except ImportError:
+ from urllib.parse import urlparse, urlunparse
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
- __VERSION__ = '0.3.0-alpha' # major, minor, micro: semver.org
+ __VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
class PageMissingError(Exception):
def __init__(self, title, xml):
@ -150,7 +164,7 @@ def getNamespacesScraper(config={}, session=None):
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
r = session.post(
- url=config['index'], data={'title': 'Special:Allpages'})
+ url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
raw = r.text
delay(config=config, session=session)
@ -187,33 +201,41 @@ def getNamespacesAPI(config={}, session=None):
if namespaces:
r = session.post(
url=config['api'],
- data={
+ params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
- 'format': 'json'}
+ 'format': 'json'},
+ timeout=30
)
result = getJSON(r)
delay(config=config, session=session)
try:
nsquery = result['query']['namespaces']
except KeyError:
print "Error: could not get namespaces from the API request"
print "HTTP %d" % r.status_code
print r.text
return None
if 'all' in namespaces:
namespaces = []
- for i in result['query']['namespaces'].keys():
+ for i in nsquery.keys():
if int(i) < 0: # -1: Special, -2: Media, excluding
continue
namespaces.append(int(i))
- namespacenames[int(i)] = result['query']['namespaces'][i]['*']
+ namespacenames[int(i)] = nsquery[i]['*']
else:
# check if those namespaces really exist in this wiki
namespaces2 = []
- for i in result['query']['namespaces'].keys():
+ for i in nsquery.keys():
bi = i
i = int(i)
if i < 0: # -1: Special, -2: Media, excluding
continue
if i in namespaces:
namespaces2.append(i)
- namespacenames[i] = result['query']['namespaces'][bi]['*']
+ namespacenames[i] = nsquery[bi]['*']
namespaces = namespaces2
else:
namespaces = [0]
@ -249,7 +271,7 @@ def getPageTitlesAPI(config={}, session=None):
retryCount = 0
while retryCount < config["retries"]:
try:
- r = session.post(url=config['api'], data=params)
+ r = session.post(url=config['api'], data=params, timeout=30)
break
except ConnectionError as err:
print "Connection error: %s" % (str(err),)
@ -271,21 +293,27 @@ def getPageTitlesAPI(config={}, session=None):
apfrom = jsontitles['continue']['apcontinue']
elif 'apfrom' in jsontitles['continue']:
apfrom = jsontitles['continue']['apfrom']
# print apfrom
# print jsontitles
- allpages = jsontitles['query']['allpages']
+ try:
+ allpages = jsontitles['query']['allpages']
+ except KeyError:
+ print "The allpages API returned nothing. Exit."
+ sys.exit(1)
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
for page in allpages:
- yield page['title']
+ title = page['title']
+ titles.append(title)
+ yield title
c += len(allpages)
if len(titles) != len(set(titles)):
- # probably we are in a loop, server returning dupe titles, stop
- # it
- print 'Probably a loop, finishing'
+ print 'Probably a loop, switching to next namespace. Duplicate title:'
+ print title
titles = list(set(titles))
apfrom = ''
@ -301,7 +329,7 @@ def getPageTitlesScraper(config={}, session=None):
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (
config['index'], namespace)
- r = session.get(url=url)
+ r = session.get(url=url, timeout=30)
raw = r.text
raw = cleanHTML(raw)
@ -353,7 +381,7 @@ def getPageTitlesScraper(config={}, session=None):
# to avoid reload dupe subpages links
checked_suballpages.append(name)
delay(config=config, session=session)
- r2 = session.get(url=url)
+ r2 = session.get(url=url, timeout=10)
raw2 = r2.text
raw2 = cleanHTML(raw2)
rawacum += raw2 # merge it after removed junk
@ -386,13 +414,11 @@ def getPageTitles(config={}, session=None):
titles = []
if 'api' in config and config['api']:
- r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
- test = getJSON(r)
- if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
- and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
- titles = getPageTitlesScraper(config=config, session=session)
- else:
+ try:
titles = getPageTitlesAPI(config=config, session=session)
+ except:
+ print "Error: could not get page titles from the API"
+ titles = getPageTitlesScraper(config=config, session=session)
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
@ -412,7 +438,7 @@ def getPageTitles(config={}, session=None):
print '%d page titles loaded' % (c)
return titlesfilename
def getImageNames(config={}, session=None):
""" Get list of image names """
@ -436,39 +462,60 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
- try:
- xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
- except PageMissingError as pme:
- # The <page> does not exist. Not a problem, if we get the <siteinfo>.
- xml = pme.xml
- # Issue 26: Account for missing "Special" namespace.
- # Hope the canonical special name has not been removed.
- # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
- except ExportAbortedError:
- try:
- if config['api']:
- print "Trying the local name for the Special namespace instead"
- r = session.post(
- url=config['api'],
- data={
- 'action': 'query',
- 'meta': 'siteinfo',
- 'siprop': 'namespaces',
- 'format': 'json'}
- )
- config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
- + ':Export'
- xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
- except PageMissingError as pme:
- xml = pme.xml
- except ExportAbortedError:
- pass
+ print config['api']
+ if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+ xml = None
+ try:
+ print 'Getting the XML header from the API'
+ r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
+ xml = r.json()['query']['export']['*']
+ if not xml:
+ r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
+ xml = r.text
+ except requests.exceptions.RetryError:
+ pass
+ else:
+ try:
+ xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+ except PageMissingError as pme:
+ # The <page> does not exist. Not a problem, if we get the <siteinfo>.
+ xml = pme.xml
+ # Issue 26: Account for missing "Special" namespace.
+ # Hope the canonical special name has not been removed.
+ # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
+ except ExportAbortedError:
+ try:
+ if config['api']:
+ print "Trying the local name for the Special namespace instead"
+ r = session.post(
+ url=config['api'],
+ params={
+ 'action': 'query',
+ 'meta': 'siteinfo',
+ 'siprop': 'namespaces',
+ 'format': 'json'},
+ timeout=120
+ )
+ config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ + ':Export'
+ xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+ except PageMissingError as pme:
+ xml = pme.xml
+ except ExportAbortedError:
+ pass
header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml):
- print 'XML export on this wiki is broken, quitting.'
- logerror(u'XML export on this wiki is broken, quitting.')
- sys.exit()
+ if config['xmlrevisions']:
+ # Try again the old way
+ print 'Export test via the API failed. Wiki too old? Trying without xmlrevisions.'
+ config['xmlrevisions'] = False
+ header, config = getXMLHeader(config=config, session=session)
+ else:
+ print 'XML export on this wiki is broken, quitting.'
+ logerror(u'XML export on this wiki is broken, quitting.')
+ sys.exit()
return header, config
@ -512,7 +559,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
- print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['pages'], wait)
+ print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# limit = 1 from mother function)
@ -521,6 +568,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c >= maxretries:
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
+ if config['failfast']:
+ print "Exit, it will be for another time"
+ sys.exit()
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
@ -550,7 +600,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
- r = session.post(url=config['index'], data=params, headers=headers)
+ r = session.post(url=config['index'], params=params, headers=headers, timeout=10)
handleStatusCode(r)
xml = fixBOM(r)
except requests.exceptions.ConnectionError as e:
@ -675,10 +725,9 @@ def cleanXML(xml=''):
def generateXMLDump(config={}, titles=[], start=None, session=None):
- """ Generates a XML dump for a list of titles """
+ """ Generates a XML dump for a list of titles or from revision IDs """
# TODO: titles is now unused.
- print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header, config = getXMLHeader(config=config, session=session)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
@ -686,48 +735,189 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
config['curonly'] and 'current' or 'history')
xmlfile = ''
lock = True
- if start:
- print "Removing the last chunk of past XML dump: it is probably incomplete."
- for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
- pass
- else:
- # requested complete xml dump
- lock = False
- xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
- xmlfile.write(header.encode('utf-8'))
- xmlfile.close()
- xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
- c = 1
- for title in readTitles(config, start):
- if not title.strip():
- continue
- if title == start: # start downloading from start, included
- lock = False
- if lock:
- continue
- delay(config=config, session=session)
- if c % 10 == 0:
- print 'Downloaded %d pages' % (c)
- try:
- for xml in getXMLPage(config=config, title=title, session=session):
- xml = cleanXML(xml=xml)
- xmlfile.write(xml.encode('utf-8'))
- except PageMissingError:
- logerror(
- config=config,
- text=u'The page "%s" was missing in the wiki (probably deleted)' %
- (title.decode('utf-8'))
- )
- # here, XML is a correct <page> </page> chunk or
- # an empty string due to a deleted page (logged in errors log) or
- # an empty string due to an error while retrieving the page from server
- # (logged in errors log)
- c += 1
+ if config['xmlrevisions']:
+ print 'Retrieving the XML for every page from the beginning'
+ xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+ xmlfile.write(header.encode('utf-8'))
+ try:
+ r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+ for xml in getXMLRevisions(config=config, session=session):
+ numrevs = len(re.findall(r_timestamp, xml))
+ # Due to how generators work, it's expected this may be less
+ print "%d more revisions exported" % numrevs
+ xml = cleanXML(xml=xml)
+ xmlfile.write(xml.encode('utf-8'))
+ except AttributeError:
+ print "This wikitools module version is not working"
+ sys.exit()
+ else:
+ print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
+ if start:
+ print "Removing the last chunk of past XML dump: it is probably incomplete."
+ for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
+ pass
+ else:
+ # requested complete xml dump
+ lock = False
+ xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+ xmlfile.write(header.encode('utf-8'))
+ xmlfile.close()
+ xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
+ c = 1
+ for title in readTitles(config, start):
+ if not title.strip():
+ continue
+ if title == start: # start downloading from start, included
+ lock = False
+ if lock:
+ continue
+ delay(config=config, session=session)
+ if c % 10 == 0:
+ print 'Downloaded %d pages' % (c)
+ try:
+ for xml in getXMLPage(config=config, title=title, session=session):
+ xml = cleanXML(xml=xml)
+ xmlfile.write(xml.encode('utf-8'))
+ except PageMissingError:
+ logerror(
+ config=config,
+ text=u'The page "%s" was missing in the wiki (probably deleted)' %
+ (title.decode('utf-8'))
+ )
+ # here, XML is a correct <page> </page> chunk or
+ # an empty string due to a deleted page (logged in errors log) or
+ # an empty string due to an error while retrieving the page from server
+ # (logged in errors log)
+ c += 1
xmlfile.write(footer)
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def getXMLRevisions(config={}, session=None, allpages=False):
site = wikitools.wiki.Wiki(config['api'])
if not 'all' in config['namespaces']:
namespaces = config['namespaces']
else:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
try:
for namespace in namespaces:
print "Trying to export all revisions from namespace %s" % namespace
arvparams = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 500,
'arvnamespace': namespace
}
if not config['curonly']:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
arvrequest = wikitools.api.APIRequest(site, arvparams)
results = arvrequest.queryGen()
for result in results:
for page in result['query']['allrevisions']:
yield makeXmlFromPage(page)
else:
# Just cycle through revision IDs and use the XML as is
arvparams['arvprop'] = 'ids'
arvrequest = wikitools.api.APIRequest(site, arvparams)
arvresults = arvrequest.queryGen()
for result in arvresults:
revids = []
for page in result['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
print "%d more revisions listed, until %s" % (len(revids), revids[-1])
exportparams = {
'action': 'query',
'revids': '|'.join(revids),
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
except KeyError:
print "Warning. Could not use allrevisions, wiki too old."
if config['curonly']:
for title in readTitles(config):
exportparams = {
'action': 'query',
'titles': title,
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
else:
for title in readTitles(config):
pparams = {
'action': 'query',
'titles': title,
'prop': 'revisions',
'rvlimit': 'max',
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
'rawcontinue': 'yes'
}
prequest = wikitools.api.APIRequest(site, pparams)
try:
results = prequest.query()
pages = results['query']['pages']
except KeyError:
raise PageMissingError(title, xml='')
for page in pages:
try:
xml = makeXmlFromPage(pages[page])
except PageMissingError:
logerror(
config=config,
text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
)
continue
yield xml
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
sys.exit()
def makeXmlFromPage(page):
""" Output an XML document as a string from a page as in the API JSON """
try:
p = E.page(
E.title(page['title']),
E.ns(to_unicode(page['ns'])),
E.id(to_unicode(page['pageid'])),
)
for rev in page['revisions']:
revision = E.revision(
E.id(to_unicode(rev['revid'])),
E.parentid(to_unicode(rev['parentid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(to_unicode(rev['userid'])),
E.username(to_unicode(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
)
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
# The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
if 'sha1' in rev:
revision.append(E.sha1(rev['sha1']))
p.append(revision)
except KeyError:
raise PageMissingError(page['title'], '')
return etree.tostring(p, pretty_print=True)
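For reference, the API traffic that getXMLRevisions drives through wikitools can be sketched with plain requests: list revision IDs with list=allrevisions, then ask the API to export them. The endpoint below is a placeholder and the sketch only reuses parameters already visible above (arvlimit, arvprop, export, exportnowrap); it is illustrative, not part of this commit.

# Hedged sketch of the --xmlrevisions round trip using requests only.
import requests

api = 'https://wiki.example.org/w/api.php'  # placeholder endpoint
session = requests.Session()

# 1) Enumerate some revision IDs in the main namespace.
arv = session.get(api, params={
    'action': 'query',
    'list': 'allrevisions',
    'arvlimit': 50,
    'arvprop': 'ids',
    'arvnamespace': 0,
    'format': 'json'}, timeout=30).json()
revids = [str(rev['revid'])
          for page in arv['query']['allrevisions']
          for rev in page['revisions']]

# 2) Ask the API to export those revisions as <mediawiki> XML.
export = session.get(api, params={
    'action': 'query',
    'revids': '|'.join(revids),
    'export': '1',
    'exportnowrap': '1'}, timeout=30)
print(export.text[:200])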
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
@ -863,10 +1053,11 @@ def getImageNamesScraper(config={}, session=None):
# http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
r = session.post(
url=config['index'],
- data={
+ params={
'title': 'Special:Imagelist',
'limit': limit,
- 'offset': offset})
+ 'offset': offset},
+ timeout=30)
raw = r.text
delay(config=config, session=session)
# delicate wiki
@ -967,7 +1158,7 @@ def getImageNamesAPI(config={}, session=None):
'format': 'json',
'ailimit': 500}
# FIXME Handle HTTP Errors HERE
- r = session.post(url=config['api'], data=params)
+ r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1025,7 +1216,7 @@ def getImageNamesAPI(config={}, session=None):
'iiprop': 'user|url',
'format': 'json'}
# FIXME Handle HTTP Errors HERE
- r = session.post(url=config['api'], data=params)
+ r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1112,10 +1303,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
# saving description if any
try:
title = u'Image:%s' % (filename)
- xmlfiledesc = getXMLFileDesc(
- config=config,
- title=title,
- session=session) # use Image: for backwards compatibility
+ if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+ r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
+ xmlfiledesc = r.text
+ else:
+ xmlfiledesc = getXMLFileDesc(
+ config=config,
+ title=title,
+ session=session) # use Image: for backwards compatibility
except PageMissingError:
xmlfiledesc = ''
logerror(
@ -1170,7 +1365,7 @@ def domain2prefix(config={}, session=None):
domain = config['index']
domain = domain.lower()
- domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
+ domain = re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '', domain)
domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
@ -1211,8 +1406,9 @@ def welcome():
message += ''
message += "\n"
message += "#" * 73
+ message += "\n"
+ message += "# Copyright (C) 2011-%d WikiTeam developers #\n" % (datetime.datetime.now().year)
message += """
- # Copyright (C) 2011-2014 WikiTeam #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
@ -1299,7 +1495,9 @@ def getParameters(params=[]):
action='store_true',
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages')
+ groupDownload.add_argument('--xmlrevisions', action='store_true',
+ help='download all revisions from an API generator. MediaWiki 1.27+ only.')
groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump")
groupDownload.add_argument(
@ -1319,6 +1517,10 @@ def getParameters(params=[]):
'--get-wiki-engine',
action='store_true',
help="returns the wiki engine")
+ groupMeta.add_argument(
+ '--failfast',
+ action='store_true',
+ help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.")
args = parser.parse_args()
# print args
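The two switches added above combine with the existing flags; a hedged example of such an invocation, launcher-style, with a placeholder API URL:

# Hypothetical run exercising the new --xmlrevisions and --failfast switches.
import subprocess

subprocess.call(
    './dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --xmlrevisions --failfast',
    shell=True)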
@ -1350,11 +1552,22 @@ def getParameters(params=[]):
print 'Using cookies from %s' % args.cookies
session = requests.Session()
try:
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
# Courtesy datashaman https://stackoverflow.com/a/35504626
__retries__ = Retry(total=5,
backoff_factor=2,
status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=__retries__))
session.mount('http://', HTTPAdapter(max_retries=__retries__))
except:
# Our urllib3/requests is too old
pass
session.cookies = cj
session.headers.update({'User-Agent': getUserAgent()})
if args.user and args.password:
session.auth = (args.user, args.password)
- # session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
# check URLs
for url in [args.api, args.index, args.wiki]:
@ -1392,6 +1605,7 @@ def getParameters(params=[]):
retry = 0
maxretries = args.retries
retrydelay = 20
+ check = None
while retry < maxretries:
try:
check = checkAPI(api=api, session=session)
@ -1427,15 +1641,20 @@ def getParameters(params=[]):
session=session):
print 'index.php is OK'
else:
- index = '/'.join(index.split('/')[:-1])
+ try:
+ index = '/'.join(index.split('/')[:-1])
+ except AttributeError:
+ index = None
if index and checkIndex(
index=index,
cookies=args.cookies,
session=session):
print 'index.php is OK'
else:
- print 'Error in index.php, please, provide a correct path to index.php'
- sys.exit(1)
+ print 'Error in index.php.'
+ if not args.xmlrevisions:
+ print 'Please, provide a correct path to index.php or use --xmlrevisions. Terminating.'
+ sys.exit(1)
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
@ -1483,10 +1702,12 @@ def getParameters(params=[]):
'curonly': args.curonly,
'date': datetime.datetime.now().strftime('%Y%m%d'),
'api': api,
+ 'failfast': args.failfast,
'index': index,
'images': args.images,
'logs': False,
'xml': args.xml,
+ 'xmlrevisions': args.xmlrevisions,
'namespaces': namespaces,
'exnamespaces': exnamespaces,
'path': args.path and os.path.normpath(args.path) or '',
@ -1520,18 +1741,23 @@ def checkAPI(api=None, session=None):
data={
'action': 'query',
'meta': 'siteinfo',
- 'format': 'json'}
+ 'format': 'json'},
+ timeout=30
)
- if r.url == api:
+ if r.status_code == 200:
break
- else:
- api = r.url
+ elif r.status_code < 400:
+ p = r.url
+ api = urlunparse([p.scheme, p.netloc, p.path, '', '', ''])
+ elif r.status_code > 400:
+ print "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
+ return False
if "MediaWiki API is not enabled for this site." in r.text:
return False
try:
result = getJSON(r)
index = None
- if result['query']:
+ if result:
try:
index = result['query']['general']['server'] + \
result['query']['general']['script']
@ -1548,7 +1774,7 @@ def checkAPI(api=None, session=None):
def checkIndex(index=None, cookies=None, session=None):
""" Checking index.php availability """
- r = session.post(url=index, data={'title': 'Special:Version'})
+ r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
raw = r.text
print 'Checking index.php...', index
# Workaround for issue 71
@ -1587,7 +1813,11 @@ def getJSON(request):
"""Strip Unicode BOM""" """Strip Unicode BOM"""
if request.text.startswith(u'\ufeff'): if request.text.startswith(u'\ufeff'):
request.encoding = 'utf-8-sig' request.encoding = 'utf-8-sig'
return request.json() try:
return request.json()
except:
# Maybe an older API version which did not return correct JSON
return {}
def fixBOM(request):
@ -1633,6 +1863,8 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
else:
print 'XML dump seems to be corrupted.'
reply = ''
+ if config['failfast']:
+ reply = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
if reply.lower() in ['yes', 'y']:
@ -1679,7 +1911,7 @@ def resumePreviousDump(config={}, other={}):
if lasttitle == '':
lasttitle=lasttitles.next()
except:
- pass # probably file does not exists
+ lasttitle = '' # probably file does not exists
if lasttitle == '--END--':
# titles list is complete
print 'Title list was completed in the previous session'
@ -1810,7 +2042,7 @@ def saveSpecialVersion(config={}, session=None):
else:
print 'Downloading Special:Version with extensions and other related info'
r = session.post(
- url=config['index'], data={'title': 'Special:Version'})
+ url=config['index'], params={'title': 'Special:Version'}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@ -1825,14 +2057,13 @@ def saveIndexPHP(config={}, session=None):
print 'index.html exists, do not overwrite'
else:
print 'Downloading index.php (Main Page) as index.html'
- r = session.post(url=config['index'], data={})
+ r = session.post(url=config['index'], params={}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
with open('%s/index.html' % (config['path']), 'w') as outfile:
outfile.write(raw.encode('utf-8'))
def saveSiteInfo(config={}, session=None):
""" Save a file with site info """
@ -1845,30 +2076,33 @@ def saveSiteInfo(config={}, session=None):
# MediaWiki 1.13+
r = session.post(
url=config['api'],
- data={
+ params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
'sinumberingroup': 1,
- 'format': 'json'})
+ 'format': 'json'},
+ timeout=10)
# MediaWiki 1.11-1.12
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
- data={
+ params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
- 'format': 'json'})
+ 'format': 'json'},
+ timeout=10)
# MediaWiki 1.8-1.10
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
- data={
+ params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces',
- 'format': 'json'})
+ 'format': 'json'},
+ timeout=10)
result = getJSON(r)
delay(config=config, session=session)
with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
@ -1879,10 +2113,14 @@ def avoidWikimediaProjects(config={}, other={}):
""" Skip Wikimedia projects and redirect to the dumps website """ """ Skip Wikimedia projects and redirect to the dumps website """
# notice about wikipedia dumps # notice about wikipedia dumps
url = ''
if config['api']:
url = url + config['api']
if config['index']:
url = url + config['index']
if re.findall( if re.findall(
r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
config['api'] + url):
config['index']):
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!' print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
print 'Download the dumps from http://dumps.wikimedia.org' print 'Download the dumps from http://dumps.wikimedia.org'
if not other['force']: if not other['force']:
@ -1895,9 +2133,9 @@ def getWikiEngine(url=''):
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
- r = session.post(url=url)
+ r = session.post(url=url, timeout=30)
if r.status_code == 405 or r.text == '':
- r = session.get(url=url)
+ r = session.get(url=url, timeout=120)
result = r.text
wikiengine = 'Unknown'
@ -1980,7 +2218,7 @@ def mwGetAPIAndIndex(url=''):
index = ''
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
- r = session.post(url=url)
+ r = session.post(url=url, timeout=120)
result = r.text
# API
@ -2042,6 +2280,8 @@ def main(params=[]):
while not other['resume'] and os.path.isdir(config['path']):
print '\nWarning!: "%s" path exists' % (config['path'])
reply = ''
+ if config['failfast']:
+ retry = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input(
'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %

@ -6,12 +6,12 @@
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
@ -30,11 +30,11 @@ def main():
if len(sys.argv) < 2:
print 'python script.py file-with-apis.txt'
sys.exit()
print 'Reading list of APIs from', sys.argv[1]
wikis = open(sys.argv[1], 'r').read().splitlines()
print '%d APIs found' % (len(wikis))
for wiki in wikis:
print "#"*73
print "# Downloading", wiki
@ -42,17 +42,15 @@ def main():
wiki = wiki.lower()
# Make the prefix in standard way; api and index must be defined, not important which is which
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
#check if compressed, in that case dump was finished previously
compressed = False
- for dirname, dirnames, filenames in os.walk('.'):
- if dirname == '.':
- for f in filenames:
- if f.startswith(prefix) and f.endswith('.7z'):
- compressed = True
- zipfilename = f
+ for f in os.listdir('.'):
+ if f.startswith(prefix) and f.endswith('.7z'):
+ compressed = True
+ zipfilename = f
break #stop searching, dot not explore subdirectories
if compressed:
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
# Get the archive's file list.
@ -67,18 +65,17 @@ def main():
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+." print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
# TODO: Find a way like grep -q below without doing a 7z l multiple times? # TODO: Find a way like grep -q below without doing a 7z l multiple times?
continue continue
#download #download
started = False #was this wiki download started before? then resume started = False #was this wiki download started before? then resume
wikidir = '' wikidir = ''
for dirname, dirnames, filenames in os.walk('.'): for f in os.listdir('.'):
if dirname == '.': # Does not find numbered wikidumps not verify directories
for d in dirnames: if f.startswith(prefix) and f.endswith('wikidump'):
if d.startswith(prefix): wikidir = f
wikidir = d started = True
started = True
break #stop searching, dot not explore subdirectories break #stop searching, dot not explore subdirectories
# time.sleep(60) # time.sleep(60)
# Uncomment what above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms # Uncomment what above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed; # such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
@ -90,15 +87,14 @@ def main():
subprocess.call('./dumpgenerator.py --api=%s --xml --images --delay=1' % wiki, shell=True)
started = True
#save wikidir now
- for dirname, dirnames, filenames in os.walk('.'):
- if dirname == '.':
- for d in dirnames:
- if d.startswith(prefix):
- wikidir = d
+ for f in os.listdir('.'):
+ # Does not find numbered wikidumps not verify directories
+ if f.startswith(prefix) and f.endswith('wikidump'):
+ wikidir = f
break #stop searching, dot not explore subdirectories
prefix = wikidir.split('-wikidump')[0]
finished = False
if started and wikidir and prefix:
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
@ -107,7 +103,7 @@ def main():
finished = True
# You can also issue this on your working directory to find all incomplete dumps:
# tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"
#compress
if finished:
time.sleep(1)

@ -3048,7 +3048,7 @@ http://vai.uibk.ac.at/dadp/doku.php
http://vak.ru/doku.php
http://val.bmstu.ru/dokuwiki/doku.php
http://valk.mave.jp/doku.php
- http://vancouver.hackspace.ca/doku.php
+ http://vanhack.ca/doku.php
http://vanets.vuse.vanderbilt.edu/dokuwiki/doku.php
http://vaslor.net/doku.php
http://vbraun.name/cms/doku.php
@ -4957,7 +4957,6 @@ http://www.minkhollow.ca/becker/doku.php
http://www.minkhollow.ca/mhf/doku.php
http://www.minkhollow.ca/MHF/doku.php
http://www.minkhollow.ca/Thesis07/doku.php
- http://www.mirkosertic.de/doku.php
http://www.mirmer.su/wiki/doku.php
http://www.mixshare.com/wiki/doku.php
http://www.mixxx.org/wiki/doku.php

File diff suppressed because it is too large

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
- # Copyright (C) 2014 WikiTeam developers
+ # Copyright (C) 2014-2017 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -26,9 +26,10 @@ def main():
url = 'https://meta.miraheze.org/wiki/Special:SiteMatrix'
r = requests.get(url, headers=headers)
raw = r.text
- m = re.findall(ur'<tr><td><a href="https://([^>]+?)/">[^<]+</a></td></tr>', raw)
+ m = re.findall(ur'<tr><td>(<del>)?<a href="https://([^>]+?)/">[^<]+</a>', raw)
+ m.sort()
for i in m:
- print 'https://' + i + '/w/api.php'
+ print 'https://' + i[1] + '/w/api.php'
if __name__ == '__main__':
main()

File diff suppressed because it is too large

@ -1,5 +1,5 @@
Wikifarm: https://meta.miraheze.org/wiki/Miraheze
- Last update: 2015-09-29
+ Last update: 2017-06-30
Details:

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
- # Copyright (C) 2014 WikiTeam developers
+ # Copyright (C) 2014-2017 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -27,6 +27,7 @@ def main():
r = requests.get(url, headers=headers)
raw = r.text
m = re.findall(ur'<li><a href=\'([^>]+?)/wiki/\'>', raw)
+ m.sort()
for i in m:
print i + '/w/api.php'

@ -2,8 +2,6 @@ http://24.neoseeker.com/w/api.php
http://aceattorney.neoseeker.com/w/api.php
http://advancewars.neoseeker.com/w/api.php
http://adventuretime.neoseeker.com/w/api.php
- http://alanwake.neoseeker.com/w/api.php
- http://alienbreed.neoseeker.com/w/api.php
http://animalcrossing.neoseeker.com/w/api.php
http://attackontitan.neoseeker.com/w/api.php
http://avatar.neoseeker.com/w/api.php
@ -17,9 +15,9 @@ http://boktai.neoseeker.com/w/api.php
http://bond.neoseeker.com/w/api.php
http://borderlands.neoseeker.com/w/api.php
http://boundbyflame.neoseeker.com/w/api.php
- http://bravely.neoseeker.com/w/api.php
http://breathoffire.neoseeker.com/w/api.php
http://brink.neoseeker.com/w/api.php
- http://bulletstorm.neoseeker.com/w/api.php
http://callofduty.neoseeker.com/w/api.php
http://castlecrashers.neoseeker.com/w/api.php
http://castlevania.neoseeker.com/w/api.php
@ -35,13 +33,10 @@ http://danganronpa.neoseeker.com/w/api.php
http://darksouls.neoseeker.com/w/api.php
http://deadisland.neoseeker.com/w/api.php
http://deadoralive.neoseeker.com/w/api.php
- http://deadspace.neoseeker.com/w/api.php
http://deathnote.neoseeker.com/w/api.php
http://demonssouls.neoseeker.com/w/api.php
http://destiny.neoseeker.com/w/api.php
- http://deusex.neoseeker.com/w/api.php
http://devilmaycry.neoseeker.com/w/api.php
- http://diablo3.neoseeker.com/w/api.php
http://digimon.neoseeker.com/w/api.php
http://disgaea.neoseeker.com/w/api.php
http://doctorwho.neoseeker.com/w/api.php
@ -57,21 +52,17 @@ http://dynastywarriors.neoseeker.com/w/api.php
http://elderscrolls.neoseeker.com/w/api.php
http://endlessocean.neoseeker.com/w/api.php
http://evangelion.neoseeker.com/w/api.php
- http://eveonline.neoseeker.com/w/api.php
http://fable.neoseeker.com/w/api.php
http://fairytail.neoseeker.com/w/api.php
- http://fallout4.neoseeker.com/w/api.php
http://fallout.neoseeker.com/w/api.php
+ http://fallout4.neoseeker.com/w/api.php
http://familyguy.neoseeker.com/w/api.php
- http://farcry.neoseeker.com/w/api.php
http://fatalfury.neoseeker.com/w/api.php
http://fifa.neoseeker.com/w/api.php
http://finalfantasy.neoseeker.com/w/api.php
http://fireemblem.neoseeker.com/w/api.php
http://footballmanager.neoseeker.com/w/api.php
http://formula1.neoseeker.com/w/api.php
- http://forza.neoseeker.com/w/api.php
- http://friends.neoseeker.com/w/api.php
http://fullmetalalchemist.neoseeker.com/w/api.php
http://futurama.neoseeker.com/w/api.php
http://fzero.neoseeker.com/w/api.php
@ -81,11 +72,9 @@ http://glee.neoseeker.com/w/api.php
http://godofwar.neoseeker.com/w/api.php
http://goldensun.neoseeker.com/w/api.php
http://granturismo.neoseeker.com/w/api.php
- http://greysanatomy.neoseeker.com/w/api.php
http://growlanser.neoseeker.com/w/api.php
- http://gta5.neoseeker.com/w/api.php
http://gta.neoseeker.com/w/api.php
- http://guildwars2.neoseeker.com/w/api.php
+ http://gta5.neoseeker.com/w/api.php
http://guildwars.neoseeker.com/w/api.php
http://guitarhero.neoseeker.com/w/api.php
http://gundam.neoseeker.com/w/api.php
@ -106,7 +95,6 @@ http://inuyasha.neoseeker.com/w/api.php
http://jakdaxter.neoseeker.com/w/api.php
http://kairosoft.neoseeker.com/w/api.php
http://kidicarus.neoseeker.com/w/api.php
- http://kingdomcome.neoseeker.com/w/api.php
http://kingdomhearts.neoseeker.com/w/api.php
http://kirby.neoseeker.com/w/api.php
http://knack.neoseeker.com/w/api.php
@ -115,8 +103,6 @@ http://layton.neoseeker.com/w/api.php
http://leagueoflegends.neoseeker.com/w/api.php
http://legendofdragoon.neoseeker.com/w/api.php
http://littlebigplanet.neoseeker.com/w/api.php
- http://lmamanager.neoseeker.com/w/api.php
- http://lordsofthefallen.neoseeker.com/w/api.php
http://lotr.neoseeker.com/w/api.php
http://mafia.neoseeker.com/w/api.php
http://magicalstarsign.neoseeker.com/w/api.php
@ -128,7 +114,6 @@ http://megaman.neoseeker.com/w/api.php
http://megamitensei.neoseeker.com/w/api.php
http://metalgear.neoseeker.com/w/api.php
http://metroid.neoseeker.com/w/api.php
- http://mightandmagic.neoseeker.com/w/api.php
http://minecraft.neoseeker.com/w/api.php
http://monsterhunter.neoseeker.com/w/api.php
http://mortalkombat.neoseeker.com/w/api.php
@ -140,7 +125,6 @@ http://ncis.neoseeker.com/w/api.php
http://needforspeed.neoseeker.com/w/api.php http://needforspeed.neoseeker.com/w/api.php
http://ninjagaiden.neoseeker.com/w/api.php http://ninjagaiden.neoseeker.com/w/api.php
http://ninokuni.neoseeker.com/w/api.php http://ninokuni.neoseeker.com/w/api.php
http://nintendogs.neoseeker.com/w/api.php
http://okami.neoseeker.com/w/api.php http://okami.neoseeker.com/w/api.php
http://onepiece.neoseeker.com/w/api.php http://onepiece.neoseeker.com/w/api.php
http://persona.neoseeker.com/w/api.php http://persona.neoseeker.com/w/api.php
@ -160,14 +144,12 @@ http://rockband.neoseeker.com/w/api.php
http://rpgmaker.neoseeker.com/w/api.php http://rpgmaker.neoseeker.com/w/api.php
http://runefactory.neoseeker.com/w/api.php http://runefactory.neoseeker.com/w/api.php
http://runescape.neoseeker.com/w/api.php http://runescape.neoseeker.com/w/api.php
http://runesofmagic.neoseeker.com/w/api.php
http://sandbox.neoseeker.com/w/api.php http://sandbox.neoseeker.com/w/api.php
http://scottpilgrim.neoseeker.com/w/api.php http://scottpilgrim.neoseeker.com/w/api.php
http://scrapmetal.neoseeker.com/w/api.php http://scrapmetal.neoseeker.com/w/api.php
http://scribblenauts.neoseeker.com/w/api.php http://scribblenauts.neoseeker.com/w/api.php
http://shadowofthecolossus.neoseeker.com/w/api.php http://shadowofthecolossus.neoseeker.com/w/api.php
http://shadowrunreturns.neoseeker.com/w/api.php http://shadowrunreturns.neoseeker.com/w/api.php
http://shank.neoseeker.com/w/api.php
http://shenmue.neoseeker.com/w/api.php http://shenmue.neoseeker.com/w/api.php
http://simpsons.neoseeker.com/w/api.php http://simpsons.neoseeker.com/w/api.php
http://skate.neoseeker.com/w/api.php http://skate.neoseeker.com/w/api.php
@ -183,7 +165,6 @@ http://southpark.neoseeker.com/w/api.php
http://spiderman.neoseeker.com/w/api.php http://spiderman.neoseeker.com/w/api.php
http://spongebob.neoseeker.com/w/api.php http://spongebob.neoseeker.com/w/api.php
http://spyro.neoseeker.com/w/api.php http://spyro.neoseeker.com/w/api.php
http://starbound.neoseeker.com/w/api.php
http://starcraft.neoseeker.com/w/api.php http://starcraft.neoseeker.com/w/api.php
http://starfox.neoseeker.com/w/api.php http://starfox.neoseeker.com/w/api.php
http://stargate.neoseeker.com/w/api.php http://stargate.neoseeker.com/w/api.php
@ -196,9 +177,7 @@ http://tales.neoseeker.com/w/api.php
http://tekken.neoseeker.com/w/api.php http://tekken.neoseeker.com/w/api.php
http://terraria.neoseeker.com/w/api.php http://terraria.neoseeker.com/w/api.php
http://thedarkness.neoseeker.com/w/api.php http://thedarkness.neoseeker.com/w/api.php
http://thedivision.neoseeker.com/w/api.php
http://thelastofus.neoseeker.com/w/api.php http://thelastofus.neoseeker.com/w/api.php
http://theorder.neoseeker.com/w/api.php
http://thesecretworld.neoseeker.com/w/api.php http://thesecretworld.neoseeker.com/w/api.php
http://thesims.neoseeker.com/w/api.php http://thesims.neoseeker.com/w/api.php
http://thewarriors.neoseeker.com/w/api.php http://thewarriors.neoseeker.com/w/api.php
@ -206,9 +185,7 @@ http://theworldendswithyou.neoseeker.com/w/api.php
http://thief.neoseeker.com/w/api.php http://thief.neoseeker.com/w/api.php
http://timesplitters.neoseeker.com/w/api.php http://timesplitters.neoseeker.com/w/api.php
http://tonyhawk.neoseeker.com/w/api.php http://tonyhawk.neoseeker.com/w/api.php
http://torchlight2.neoseeker.com/w/api.php
http://toriko.neoseeker.com/w/api.php http://toriko.neoseeker.com/w/api.php
http://transformers.neoseeker.com/w/api.php
http://twilight.neoseeker.com/w/api.php http://twilight.neoseeker.com/w/api.php
http://twistedmetal.neoseeker.com/w/api.php http://twistedmetal.neoseeker.com/w/api.php
http://uncharted.neoseeker.com/w/api.php http://uncharted.neoseeker.com/w/api.php
@ -217,12 +194,9 @@ http://vivapinata.neoseeker.com/w/api.php
http://wakfu.neoseeker.com/w/api.php http://wakfu.neoseeker.com/w/api.php
http://warcraft.neoseeker.com/w/api.php http://warcraft.neoseeker.com/w/api.php
http://warhammer.neoseeker.com/w/api.php http://warhammer.neoseeker.com/w/api.php
http://wasteland2.neoseeker.com/w/api.php
http://watchdogs.neoseeker.com/w/api.php http://watchdogs.neoseeker.com/w/api.php
http://whiteknightchronicles.neoseeker.com/w/api.php http://whiteknightchronicles.neoseeker.com/w/api.php
http://wikiguides.neoseeker.com/w/api.php http://wikiguides.neoseeker.com/w/api.php
http://witcher3.neoseeker.com/w/api.php
http://worldoftanks.neoseeker.com/w/api.php
http://wow.neoseeker.com/w/api.php http://wow.neoseeker.com/w/api.php
http://xenoblade.neoseeker.com/w/api.php http://xenoblade.neoseeker.com/w/api.php
http://yugioh.neoseeker.com/w/api.php http://yugioh.neoseeker.com/w/api.php

@ -1,5 +1,5 @@
Wikifarm: http://neowiki.neoseeker.com/wiki/Main_Page Wikifarm: http://neowiki.neoseeker.com/wiki/Main_Page
Last update: 2015-10-07 Last update: 2017-06-30
Details: Details:

File diff suppressed because it is too large

@ -23,7 +23,7 @@ import subprocess
import re import re
from wikitools import wiki, api from wikitools import wiki, api
def getlist(wikia, wkfrom = 1, wkto = 1000): def getlist(wikia, wkfrom = 1, wkto = 100):
params = {'action': 'query', 'list': 'wkdomains', 'wkactive': '1', 'wkfrom': wkfrom, 'wkto': wkto,} params = {'action': 'query', 'list': 'wkdomains', 'wkactive': '1', 'wkfrom': wkfrom, 'wkto': wkto,}
request = api.APIRequest(wikia, params) request = api.APIRequest(wikia, params)
return request.query()['query']['wkdomains'] return request.query()['query']['wkdomains']
@ -31,8 +31,9 @@ def getlist(wikia, wkfrom = 1, wkto = 1000):
def getall(): def getall():
wikia = wiki.Wiki('http://community.wikia.com/api.php') wikia = wiki.Wiki('http://community.wikia.com/api.php')
offset = 0 offset = 0
limit = 1000 limit = 100
domains = {} domains = {}
empty = 0
# This API module has no query continuation facility # This API module has no query continuation facility
print 'Getting list of active domains...' print 'Getting list of active domains...'
while True: while True:
@ -40,13 +41,21 @@ def getall():
if list: if list:
print offset print offset
domains = dict(domains.items() + list.items() ) domains = dict(domains.items() + list.items() )
offset += 1000 empty = 0
else: else:
empty += 1
offset += limit
if empty > 100:
# Hopefully we don't have more than 10k wikis deleted in a row
break break
return domains return domains
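Reading the interleaved hunk above: because the wkdomains API module has no query continuation, the loop now counts consecutive empty batches, keeps stepping the offset by limit, and only gives up after more than 100 empty responses in a row (about 10,000 missing wikis). A minimal sketch of the resulting loop shape, with a hypothetical fetch_batch(offset, limit) callable standing in for the actual wkdomains request:

def paginate(fetch_batch, limit=100, max_empty=100):
    # fetch_batch(offset, limit) returns a dict of domains (possibly empty).
    # Keep advancing the offset; stop only after max_empty empty batches in a row,
    # since the API offers no continuation token.
    domains = {}
    offset = 0
    empty = 0
    while True:
        batch = fetch_batch(offset, limit)
        if batch:
            domains.update(batch)
            empty = 0
        else:
            empty += 1
        offset += limit
        if empty > max_empty:
            break
    return domains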
def main(): def main():
domains = getall() domains = getall()
with open('wikia.com', 'w') as out:
out.write('\n'.join(str(domains[i]['domain']) for i in domains))
undumped = [] undumped = []
# Or we could iterate over each sublist while we get it? # Or we could iterate over each sublist while we get it?
for i in domains: for i in domains:
@ -55,21 +64,21 @@ def main():
print dbname print dbname
first = dbname[0] first = dbname[0]
# There are one-letter dbnames; the second letter is replaced by an underscore # There are one-letter dbnames; the second letter is replaced by an underscore
# http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.gz # http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.7z
try: try:
second = dbname[1] second = dbname[1]
except: except:
second = '_' second = '_'
base = 'http://s3.amazonaws.com/wikia_xml_dumps/' + first + '/' \ base = 'http://s3.amazonaws.com/wikia_xml_dumps/' + first + '/' \
+ first + second + '/' + dbname + first + second + '/' + dbname
full = base + '_pages_full.xml.gz' full = base + '_pages_full.xml.7z'
print full print full
current = base + '_pages_current.xml.gz' current = base + '_pages_current.xml.7z'
images = base + '_images.tar' images = base + '_images.tar'
try: try:
#subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full]) #subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
# Use this instead, and comment out the next try, to only list. # Use this instead, and comment out the next try, to only list.
subprocess.check_call(['curl', '-I', '--fail', full]) subprocess.call(['curl', '-I', '--fail', full])
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
# We added --fail for this https://superuser.com/a/854102/283120 # We added --fail for this https://superuser.com/a/854102/283120
if e.returncode == 22: if e.returncode == 22:
@ -81,7 +90,9 @@ def main():
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images]) # subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images])
#except: #except:
# pass # pass
print '\n'.join(str(dump) for dump in undumped)
with open('wikia.com-unarchived', 'w+') as out:
out.write('\n'.join(str(domain) for domain in undumped))
if __name__ == '__main__': if __name__ == '__main__':
main() main()

@ -0,0 +1,61 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.1')]
urllib.request.install_opener(opener)
wikis = []
for i in range(1, 100000):
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikidot.com' % (random.randint(100, 5000), random.randint(1000, 9999))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikidot\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikidot-duckduckgo.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https?://www\.', 'http://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()
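This script and the spider in the next file share the same harvesting step: pull candidate hosts out of the fetched HTML with a regex, normalize away a leading www., and de-duplicate before rewriting the list file. A stand-alone sketch of that step (function name and sample input are ours):

import re

def extract_wikidot_hosts(html, known=None):
    # Returns a sorted, de-duplicated list of wikidot.com site URLs found in html.
    wikis = list(known or [])
    for host in re.findall(r'://([^/]+?\.wikidot\.com)', html):
        url = re.sub(r'^https?://www\.', 'http://', 'http://' + host)
        if url not in wikis:
            wikis.append(url)
    return sorted(wikis)

# extract_wikidot_hosts('<a href="https://www.scp-wiki.wikidot.com/scp-173">')
# -> ['http://scp-wiki.wikidot.com']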

@ -0,0 +1,65 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
wikis = []
with open('wikidot-spider.txt', 'r') as f:
wikis = f.read().strip().splitlines()
for i in range(1, 1000000):
url = random.choice(wikis)
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikidot\.com)', html)
for wiki in m:
wiki = 'http://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikidot-spider.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https?://www\.', 'http://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(1,5)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

@ -0,0 +1,871 @@
http://007.wikidot.com
http://025002.wikidot.com
http://05centraal.wikidot.com
http://05command-ja.wikidot.com
http://05command.wikidot.com
http://05zentrale.wikidot.com
http://101.wikidot.com
http://16thfleet.wikidot.com
http://2012hoax.wikidot.com
http://56wrtg1150.wikidot.com
http://5edndwiki.wikidot.com
http://E-H-S.wikidot.com
http://F90in15Minutes.wikidot.com
http://Health-Matters.wikidot.com
http://Herbis.wikidot.com
http://INCL.wikidot.com
http://a4art.wikidot.com
http://abarrelfull.wikidot.com
http://academicwriting.wikidot.com
http://ad3.wikidot.com
http://admindevelopement.wikidot.com
http://advent-ro.wikidot.com
http://adventuresintherealms.wikidot.com
http://aepassociation.wikidot.com
http://aepsandbox.wikidot.com
http://afterthecomet-v2.wikidot.com
http://ageofascension.wikidot.com
http://ageofheroesmux.wikidot.com
http://airchairbuild.wikidot.com
http://albums-template.wikidot.com
http://alfamedia.wikidot.com
http://algadon.wikidot.com
http://alicebot.wikidot.com
http://alveslima-edu.wikidot.com
http://amawal.wikidot.com
http://amen.wikidot.com
http://amiii.wikidot.com
http://analyticscamp.wikidot.com
http://android0.wikidot.com
http://androidalchemy.wikidot.com
http://angarmegia-creadores.wikidot.com
http://angarmegia-publicaciones.wikidot.com
http://angarmegia-valores.wikidot.com
http://angarmegia.wikidot.com
http://angry-mage-games.wikidot.com
http://anime-planet.wikidot.com
http://apmoderneuro.wikidot.com
http://applebyandwyman.wikidot.com
http://aprendiendo.wikidot.com
http://aq-3d.wikidot.com
http://aqw-swf.wikidot.com
http://aqwwiki.wikidot.com
http://arcana.wikidot.com
http://arcblade.wikidot.com
http://artemachia.wikidot.com
http://artniyet.wikidot.com
http://asen.wikidot.com
http://asoh.wikidot.com
http://aspnet.wikidot.com
http://astrobhadauria.wikidot.com
http://astrobhadauria1414.wikidot.com
http://astroveda.wikidot.com
http://astroyogas.wikidot.com
http://asu-csf.wikidot.com
http://audioprodukcja.wikidot.com
http://avendar.wikidot.com
http://aviationknowledge.wikidot.com
http://avoidglow.wikidot.com
http://azentia.wikidot.com
http://babel-template.wikidot.com
http://backpharma.wikidot.com
http://backupstorage.wikidot.com
http://badwebcomics.wikidot.com
http://balchipedia.wikidot.com
http://barakus.wikidot.com
http://battlestargenesis.wikidot.com
http://bcp.wikidot.com
http://beadersresourceguide.wikidot.com
http://beargod.wikidot.com
http://benitachell-bowls-club.wikidot.com
http://bhg.wikidot.com
http://bibles.wikidot.com
http://bilbreyapwh.wikidot.com
http://biol-117.wikidot.com
http://biol252-biol319.wikidot.com
http://bioproject.wikidot.com
http://bisgmit.wikidot.com
http://blackbelt.wikidot.com
http://blackberrystorm.wikidot.com
http://blackmarches.wikidot.com
http://blank-template.wikidot.com
http://bleachitp.wikidot.com
http://blender0.wikidot.com
http://blender1.wikidot.com
http://blmodding.wikidot.com
http://blog-template.wikidot.com
http://blog.wikidot.com
http://blogs-template.wikidot.com
http://bloodborne.wikidot.com
http://bni-ine.wikidot.com
http://book-template.wikidot.com
http://booriley.wikidot.com
http://bootstrap-playground.wikidot.com
http://borderlands.wikidot.com
http://borradores-insurgencia-del-caos.wikidot.com
http://borradores-scp-es.wikidot.com
http://bozic-nation.wikidot.com
http://brmehta12.wikidot.com
http://brtff.wikidot.com
http://brydz.wikidot.com
http://bua581.wikidot.com
http://bua581beerworks.wikidot.com
http://bua581hallelibraryfinalproject.wikidot.com
http://bugs-template.wikidot.com
http://bugs.wikidot.com
http://burntlands.wikidot.com
http://bvs.wikidot.com
http://bx-community.wikidot.com
http://bzhlab.wikidot.com
http://c4fsharp.wikidot.com
http://calu.wikidot.com
http://campusconfidential.wikidot.com
http://cancer-control.wikidot.com
http://caosinsurgente.wikidot.com
http://carpenoctemstaff.wikidot.com
http://castleage.wikidot.com
http://caughtnotsleeping.wikidot.com
http://ccckmit.wikidot.com
http://ccpd.wikidot.com
http://cctest.wikidot.com
http://ccyms.wikidot.com
http://ccymsevangelization.wikidot.com
http://ccymsfoundations.wikidot.com
http://ccymsjustice.wikidot.com
http://ccymslounge.wikidot.com
http://ccymspastoral.wikidot.com
http://ccymspractices.wikidot.com
http://ccymsprayer.wikidot.com
http://ccymsprinciples.wikidot.com
http://ccymsskills.wikidot.com
http://ccymsstudents.wikidot.com
http://cdaworldhistory.wikidot.com
http://cellworld.wikidot.com
http://celtic-heroes.wikidot.com
http://cf-vanguard.wikidot.com
http://cgp.wikidot.com
http://chaoscomplexityineducation.wikidot.com
http://chat-template.wikidot.com
http://chatroom.wikidot.com
http://chavezbraintrust.wikidot.com
http://chcc.wikidot.com
http://chessvariants.wikidot.com
http://chimiex-bicaz.wikidot.com
http://ci-sandbox.wikidot.com
http://ci-visualdocuments.wikidot.com
http://ci-wiki.wikidot.com
http://circservices.wikidot.com
http://ciscotr.wikidot.com
http://cityofangels.wikidot.com
http://cleanias.wikidot.com
http://cmbeta.wikidot.com
http://coffeetime.wikidot.com
http://coffeetimex.wikidot.com
http://colbycriminaljustice.wikidot.com
http://columbiacity.wikidot.com
http://comando05.wikidot.com
http://comando05ptbr.wikidot.com
http://commandement-alpha.wikidot.com
http://commandemento5.wikidot.com
http://communicity.wikidot.com
http://communicity2010.wikidot.com
http://community-playground.wikidot.com
http://community.wikidot.com
http://computer0.wikidot.com
http://comux.wikidot.com
http://connorscampaigns.wikidot.com
http://connorscentral.wikidot.com
http://connorsgmnotes.wikidot.com
http://connorssettings.wikidot.com
http://consumerpsych2009.wikidot.com
http://convert.wikidot.com
http://copernicon.wikidot.com
http://corvidcollege.wikidot.com
http://corwyn.wikidot.com
http://cpp-wiki.wikidot.com
http://cquniversity.wikidot.com
http://crashfeverwikitw.wikidot.com
http://crimjobs2010-2011.wikidot.com
http://crm-iseg.wikidot.com
http://crm-template.wikidot.com
http://crosswindsgarou.wikidot.com
http://crypsis-net.wikidot.com
http://cs0.wikidot.com
http://cs1.wikidot.com
http://cs101c.wikidot.com
http://cs124project-2009.wikidot.com
http://csc180.wikidot.com
http://csi.wikidot.com
http://css-competition.wikidot.com
http://css-sandbox.wikidot.com
http://css.wikidot.com
http://css3.wikidot.com
http://css3themes.wikidot.com
http://cst133a.wikidot.com
http://ctwiki.wikidot.com
http://cuarteldelo5.wikidot.com
http://cubesat.wikidot.com
http://cuiltheory.wikidot.com
http://cunefa2.wikidot.com
http://cunefb2.wikidot.com
http://cunefc2.wikidot.com
http://cunefe2.wikidot.com
http://cyclods.wikidot.com
http://daeren.wikidot.com
http://darksouls.wikidot.com
http://darksouls2.wikidot.com
http://darksouls3.wikidot.com
http://dawnofanewage.wikidot.com
http://dcernst-teaching.wikidot.com
http://dcernst.wikidot.com
http://ddscat.wikidot.com
http://defa.wikidot.com
http://default-template.wikidot.com
http://defunct-elitequestworlds.wikidot.com
http://demonssouls.wikidot.com
http://denver.wikidot.com
http://desenvolvimentodejogos.wikidot.com
http://design-illustration.wikidot.com
http://destiny.wikidot.com
http://detailed-customer-management.wikidot.com
http://dndis.wikidot.com
http://docpl.wikidot.com
http://dokument-uz.wikidot.com
http://dotflow.wikidot.com
http://downsfolk.wikidot.com
http://dowodztwo.wikidot.com
http://dragon-trees.wikidot.com
http://dreamprogram.wikidot.com
http://dreamteam.wikidot.com
http://dresdenfiles.wikidot.com
http://ds09.wikidot.com
http://ds10.wikidot.com
http://ds2009a.wikidot.com
http://ds2010a.wikidot.com
http://dwd.wikidot.com
http://e-h-s.wikidot.com
http://earlychildhood.wikidot.com
http://eberronunlimited.wikidot.com
http://ecadmin.wikidot.com
http://ecctimeline.wikidot.com
http://echobazaar.wikidot.com
http://ecomind.wikidot.com
http://editor.wikidot.com
http://editora.wikidot.com
http://edmw.wikidot.com
http://educ400-401.wikidot.com
http://education-template.wikidot.com
http://efepereth.wikidot.com
http://eime.wikidot.com
http://eitriggcrafting.wikidot.com
http://ejs-in-india.wikidot.com
http://eldritch00.wikidot.com
http://elishapeterson.wikidot.com
http://elsirvale.wikidot.com
http://elunesjustice.wikidot.com
http://emchina2010.wikidot.com
http://enchantedbros.wikidot.com
http://encyclowiki.wikidot.com
http://energyclub.wikidot.com
http://energyclub4samvedna.wikidot.com
http://energyfuture.wikidot.com
http://eng1d1.wikidot.com
http://eng270.wikidot.com
http://epimreth.wikidot.com
http://epitome.wikidot.com
http://esperanto.wikidot.com
http://estudianteseconomiauned.wikidot.com
http://eventidemush.wikidot.com
http://everydaymagicalgirls.wikidot.com
http://evilhat.wikidot.com
http://execs.wikidot.com
http://exploringsciencewiki.wikidot.com
http://extrabees.wikidot.com
http://f650cs.wikidot.com
http://fairfieldproject.wikidot.com
http://falchionvalley.wikidot.com
http://fallout2online.wikidot.com
http://faq.wikidot.com
http://fearschemistry.wikidot.com
http://fed20.wikidot.com
http://feedback-template.wikidot.com
http://feedback.wikidot.com
http://fifa360.wikidot.com
http://fifabeapro360.wikidot.com
http://fightcorruption.wikidot.com
http://figmentregistry.wikidot.com
http://fillionempire.wikidot.com
http://finalfantasy14fr.wikidot.com
http://first-steps.wikidot.com
http://flyclear.wikidot.com
http://fmi.wikidot.com
http://fmiseria3.wikidot.com
http://fondationscp.wikidot.com
http://fondationscpsandbox.wikidot.com
http://fondazionescp.wikidot.com
http://fortean.wikidot.com
http://forum-template.wikidot.com
http://forum.wikidot.com
http://fourthwallgames.wikidot.com
http://fpt.wikidot.com
http://freevoddler.wikidot.com
http://fretsonfire.wikidot.com
http://futaba8fg.wikidot.com
http://gagetowngaming.wikidot.com
http://galacticunity.wikidot.com
http://game-maker.wikidot.com
http://gamedesign.wikidot.com
http://gamemaker.wikidot.com
http://gasbags.wikidot.com
http://gd28.wikidot.com
http://gdnd.wikidot.com
http://gdt2009.wikidot.com
http://gear-sandbox.wikidot.com
http://geararc.wikidot.com
http://genderbinary.wikidot.com
http://generals.wikidot.com
http://ginnungagap.wikidot.com
http://globalseminarhealth.wikidot.com
http://goddardtech.wikidot.com
http://gorszy.wikidot.com
http://greatestfilipino.wikidot.com
http://green-house.wikidot.com
http://guitarzero.wikidot.com
http://gurpswiki.wikidot.com
http://h205.wikidot.com
http://hackersderede.wikidot.com
http://halfmoonbay.wikidot.com
http://hammer-template.wikidot.com
http://handbook.wikidot.com
http://harvey-capital-lectures.wikidot.com
http://health-matters.wikidot.com
http://herbis.wikidot.com
http://heroes.wikidot.com
http://heroesmush.wikidot.com
http://heroesofalvena.wikidot.com
http://heroessincity.wikidot.com
http://hestia.wikidot.com
http://hfwiki.wikidot.com
http://hiddenprojectwiki.wikidot.com
http://himetop.wikidot.com
http://historynewmedia.wikidot.com
http://hkcentral.wikidot.com
http://hogwarts2092.wikidot.com
http://hopkinswhpg.wikidot.com
http://housegames.wikidot.com
http://hp-intothefire.wikidot.com
http://hrpg.wikidot.com
http://hscwizards.wikidot.com
http://hswiki.wikidot.com
http://html50.wikidot.com
http://iaac-readings.wikidot.com
http://iatkos.wikidot.com
http://ibhistory.wikidot.com
http://ibi-apedia.wikidot.com
http://ibiz.wikidot.com
http://ibmathstuff.wikidot.com
http://ibphysicsstuff.wikidot.com
http://ibstuffqa.wikidot.com
http://iceal.wikidot.com
http://idrumaaps.wikidot.com
http://ifs.wikidot.com
http://igen.wikidot.com
http://igor.wikidot.com
http://imocamp.wikidot.com
http://incl.wikidot.com
http://inctr-news.wikidot.com
http://inctr-palliative-care-handbook.wikidot.com
http://inctr.wikidot.com
http://indexhibit.wikidot.com
http://insomniacramblings.wikidot.com
http://installer.wikidot.com
http://insurrection-du-chaos-sandbox.wikidot.com
http://insurrection-du-chaos.wikidot.com
http://inter-irc.wikidot.com
http://internationalbatesoninstitute.wikidot.com
http://internetior.wikidot.com
http://involo.wikidot.com
http://ipr10.wikidot.com
http://ipr11.wikidot.com
http://ipr12.wikidot.com
http://iracing.wikidot.com
http://irc.wikidot.com
http://irongiant.wikidot.com
http://irunath.wikidot.com
http://is2216.wikidot.com
http://ischool.wikidot.com
http://isocentre.wikidot.com
http://issuetracker-template.wikidot.com
http://istar.wikidot.com
http://istb-winter2010.wikidot.com
http://istep-sandbox.wikidot.com
http://itb322uap.wikidot.com
http://ivm.wikidot.com
http://jakilinux.wikidot.com
http://java.wikidot.com
http://jayashree.wikidot.com
http://jccict.wikidot.com
http://johnmerritt.wikidot.com
http://join.wikidot.com
http://jquery-easyui.wikidot.com
http://jslibrary.wikidot.com
http://jsukfpsd.wikidot.com
http://kalgati.wikidot.com
http://kannadanudi.wikidot.com
http://karma-lab.wikidot.com
http://kdiprivateequity.wikidot.com
http://keramik.wikidot.com
http://kf59.wikidot.com
http://kfmapdb.wikidot.com
http://khaidoan.wikidot.com
http://kharon.wikidot.com
http://kindiy.wikidot.com
http://kingsway.wikidot.com
http://kingswayeap.wikidot.com
http://kingswayelem.wikidot.com
http://kingswayielts.wikidot.com
http://kingswayint.wikidot.com
http://kingswaypreint.wikidot.com
http://kingswayupper.wikidot.com
http://klps.wikidot.com
http://kmhouse.wikidot.com
http://kmk.wikidot.com
http://knightswrite.wikidot.com
http://kodo.wikidot.com
http://koty.wikidot.com
http://ksemoudania.wikidot.com
http://ladyhood66.wikidot.com
http://lafundacionscp.wikidot.com
http://languagearts8.wikidot.com
http://lapidaria.wikidot.com
http://lasthaiku.wikidot.com
http://latindictionary.wikidot.com
http://latmari.wikidot.com
http://leplouc.wikidot.com
http://lepszy.wikidot.com
http://level1wiki.wikidot.com
http://libevents.wikidot.com
http://liblivadia.wikidot.com
http://librarylab.wikidot.com
http://lightworks.wikidot.com
http://linux0.wikidot.com
http://livesupport.wikidot.com
http://lmtoelf.wikidot.com
http://loosepages.wikidot.com
http://ltt.wikidot.com
http://lulu.wikidot.com
http://m5snapoli.wikidot.com
http://ma4140.wikidot.com
http://machines-history.wikidot.com
http://machinima138.wikidot.com
http://mactutorial.wikidot.com
http://maegica.wikidot.com
http://magiamesterei.wikidot.com
http://mainframes.wikidot.com
http://majjhima.wikidot.com
http://makeyourbot.wikidot.com
http://malkavian.wikidot.com
http://managerzonemexico.wikidot.com
http://maratona.wikidot.com
http://marblehornets.wikidot.com
http://margopedia.wikidot.com
http://marketplace-template.wikidot.com
http://marvelreborn.wikidot.com
http://marvelrevolution.wikidot.com
http://masonic.wikidot.com
http://math453fall2008.wikidot.com
http://mathaerobics4samvedna.wikidot.com
http://mathonline.wikidot.com
http://mathroughguides.wikidot.com
http://mbitcoin.wikidot.com
http://mc-21.wikidot.com
http://mcdt25e.wikidot.com
http://me1065.wikidot.com
http://measurementcamp.wikidot.com
http://media.wikidot.com
http://miedzymorze.wikidot.com
http://minahaplo.wikidot.com
http://mis213-2.wikidot.com
http://mk2k.wikidot.com
http://mkworld.wikidot.com
http://mnprek-3.wikidot.com
http://monacobayweyr.wikidot.com
http://monobook-template.wikidot.com
http://monobook.wikidot.com
http://monodot-template.wikidot.com
http://morningside-genetics.wikidot.com
http://morningsidemicro.wikidot.com
http://morphopedics.wikidot.com
http://mpm.wikidot.com
http://mukesh381.wikidot.com
http://multiverse-crisis.wikidot.com
http://musicgames.wikidot.com
http://my-pride.wikidot.com
http://mybookworld.wikidot.com
http://myslimchatroom.wikidot.com
http://myvineyard.wikidot.com
http://nanorodsa.wikidot.com
http://nanorodthermo.wikidot.com
http://narutoitp.wikidot.com
http://narutomushrivalry.wikidot.com
http://nauticoamager.wikidot.com
http://neo-dimension.wikidot.com
http://neosteam.wikidot.com
http://neozone.wikidot.com
http://newapprequirements.wikidot.com
http://news.wikidot.com
http://nightskysymbology.wikidot.com
http://nimin.wikidot.com
http://ninjaproxy.wikidot.com
http://nirn.wikidot.com
http://nnhs-science-restrictedaccess.wikidot.com
http://nnhs-science.wikidot.com
http://noblebeastwars.wikidot.com
http://nomyslamps.wikidot.com
http://norron.wikidot.com
http://notebook-template.wikidot.com
http://notebooks.wikidot.com
http://nre509.wikidot.com
http://nsb.wikidot.com
http://ntumed96.wikidot.com
http://nucularelectronics.wikidot.com
http://o5command-int.wikidot.com
http://o5command-th.wikidot.com
http://oblivionshard.wikidot.com
http://offtopicarium.wikidot.com
http://old-template.wikidot.com
http://oneeleventwentyten.wikidot.com
http://opend6.wikidot.com
http://opensource-template.wikidot.com
http://opensuse.wikidot.com
http://oppt-sa.wikidot.com
http://oregonamhi.wikidot.com
http://osx86.wikidot.com
http://oversoulgame.wikidot.com
http://ozradonc.wikidot.com
http://packages.wikidot.com
http://pagi.wikidot.com
http://pandora-saga.wikidot.com
http://papercraft.wikidot.com
http://paperworks.wikidot.com
http://paradiserpg.wikidot.com
http://paradoxhaze.wikidot.com
http://paralelo.wikidot.com
http://parented.wikidot.com
http://passatb5.wikidot.com
http://pathtogolarion.wikidot.com
http://patriot-box-office.wikidot.com
http://patterns.wikidot.com
http://pbbg.wikidot.com
http://pcg.wikidot.com
http://pcif.wikidot.com
http://pedhemoncreview.wikidot.com
http://perchelinux.wikidot.com
http://pernworld.wikidot.com
http://personal-template.wikidot.com
http://petition-template.wikidot.com
http://pfcuq.wikidot.com
http://pfseconddarkness.wikidot.com
http://phikappatau.wikidot.com
http://philosophia.wikidot.com
http://philosophiesoflife.wikidot.com
http://photo-gallery-template.wikidot.com
http://phylo.wikidot.com
http://pl.wikidot.com
http://playstation3hacksandmods.wikidot.com
http://pofomultiquiz.wikidot.com
http://pogon.wikidot.com
http://polls.wikidot.com
http://porphyrarpg.wikidot.com
http://porsche.wikidot.com
http://pottersarmy.wikidot.com
http://predev.wikidot.com
http://private-template.wikidot.com
http://processexcel.wikidot.com
http://professorallred.wikidot.com
http://profiles.wikidot.com
http://project-template.wikidot.com
http://projects.wikidot.com
http://ps3indexhelp.wikidot.com
http://psi-ppwg.wikidot.com
http://psms.wikidot.com
http://psrboregon.wikidot.com
http://psyc101.wikidot.com
http://psychjobsearch.wikidot.com
http://psychotronicsdivision.wikidot.com
http://pt851.wikidot.com
http://puddincupcss.wikidot.com
http://puppet.wikidot.com
http://pw7890o.wikidot.com
http://pylint-messages.wikidot.com
http://qttabbar.wikidot.com
http://quiat.wikidot.com
http://r.wikidot.com
http://radonc.wikidot.com
http://railgunitp.wikidot.com
http://ravenmarches.wikidot.com
http://realestate-template.wikidot.com
http://redirect-template.wikidot.com
http://redsite.wikidot.com
http://renegadesofpw.wikidot.com
http://reshme.wikidot.com
http://reskitchen.wikidot.com
http://retrolegends.wikidot.com
http://retrowiki.wikidot.com
http://reykjavikmanifesto.wikidot.com
http://rhetoricalgoddess.wikidot.com
http://rmitvnim2007b.wikidot.com
http://roadmap.wikidot.com
http://roboticsclubucla.wikidot.com
http://roboticspedia.wikidot.com
http://rock-xproject.wikidot.com
http://rtd1261.wikidot.com
http://rxwiki.wikidot.com
http://s7s.wikidot.com
http://sacwwiki.wikidot.com
http://salamander724.wikidot.com
http://saludintegral.wikidot.com
http://samvedna.wikidot.com
http://sandboxscpfr.wikidot.com
http://sasana.wikidot.com
http://sasi555.wikidot.com
http://savagetidewithfiretrolls.wikidot.com
http://scala.wikidot.com
http://schoolsteachersparents.wikidot.com
http://schrijven.wikidot.com
http://scienceonlinelondon.wikidot.com
http://scion-mmp.wikidot.com
http://scp-et.wikidot.com
http://scp-field-work.wikidot.com
http://scp-foundation-origins.wikidot.com
http://scp-he.wikidot.com
http://scp-hu.wikidot.com
http://scp-int-sandbox.wikidot.com
http://scp-int.wikidot.com
http://scp-international.wikidot.com
http://scp-jp-admin.wikidot.com
http://scp-jp-archive.wikidot.com
http://scp-jp-sandbox2.wikidot.com
http://scp-jp-sandbox3.wikidot.com
http://scp-jp.wikidot.com
http://scp-ko-15c.wikidot.com
http://scp-kr.wikidot.com
http://scp-la.wikidot.com
http://scp-nd.wikidot.com
http://scp-nl.wikidot.com
http://scp-pl-sandbox.wikidot.com
http://scp-pl.wikidot.com
http://scp-pt-br.wikidot.com
http://scp-pt.wikidot.com
http://scp-ru.wikidot.com
http://scp-sandbox-3.wikidot.com
http://scp-sandbox-la.wikidot.com
http://scp-spqr.wikidot.com
http://scp-template.wikidot.com
http://scp-th-sandbox.wikidot.com
http://scp-th.wikidot.com
http://scp-tw.wikidot.com
http://scp-ukrainian.wikidot.com
http://scp-un.wikidot.com
http://scp-vn.wikidot.com
http://scp-wiki-cn.wikidot.com
http://scp-wiki-de.wikidot.com
http://scp-wiki.wikidot.com
http://scpalex-fh.wikidot.com
http://scpclassic.wikidot.com
http://scpexplained.wikidot.com
http://scpjp-fansite.wikidot.com
http://scpkoreahq.wikidot.com
http://scpminecraft.wikidot.com
http://scpsandbox-jp.wikidot.com
http://scpsandbox-pl.wikidot.com
http://scpsandbox-ua.wikidot.com
http://scpsandbox2.wikidot.com
http://scpsandboxbr.wikidot.com
http://scpsandboxcn.wikidot.com
http://scpsandboxde.wikidot.com
http://scpsandboxit.wikidot.com
http://scpsandboxnl.wikidot.com
http://scpvakfi.wikidot.com
http://scpvakfisandbox.wikidot.com
http://scpvnsandbox.wikidot.com
http://scratch4samvedna.wikidot.com
http://serpents-hand.wikidot.com
http://sfi.wikidot.com
http://sfugamedev.wikidot.com
http://shadow4e.wikidot.com
http://sharecokecodes.wikidot.com
http://shop.wikidot.com
http://sicurezzapubblica.wikidot.com
http://sidowegraty.wikidot.com
http://signaturbogen.wikidot.com
http://siluria.wikidot.com
http://simtrackipedia.wikidot.com
http://sistdig.wikidot.com
http://siteclone.wikidot.com
http://sky852751.wikidot.com
http://skyangel.wikidot.com
http://slaythespire.wikidot.com
http://sliscomps.wikidot.com
http://slownik-geologiczny.wikidot.com
http://small-steps.wikidot.com
http://smofficer.wikidot.com
http://smsalgebra.wikidot.com
http://sniktbub.wikidot.com
http://snippets.wikidot.com
http://snow-template.wikidot.com
http://snowleopard.wikidot.com
http://sociatecture.wikidot.com
http://sociatectureblog.wikidot.com
http://socjobs.wikidot.com
http://socjobs2011.wikidot.com
http://soctech.wikidot.com
http://softwarecraftsmanship.wikidot.com
http://solariapedia.wikidot.com
http://solodarydar.wikidot.com
http://solpadeinehelp.wikidot.com
http://sortibrige.wikidot.com
http://soulslore.wikidot.com
http://soymilkls.wikidot.com
http://sp1.wikidot.com
http://spambotdeathwall.wikidot.com
http://sparks.wikidot.com
http://sped.wikidot.com
http://splinterverse.wikidot.com
http://spolecznosc.wikidot.com
http://srm.wikidot.com
http://st-phelpers.wikidot.com
http://stallmanism.wikidot.com
http://standard-template.wikidot.com
http://starwarsmadness.wikidot.com
http://static.wikidot.com
http://steelandstone.wikidot.com
http://storychip.wikidot.com
http://string-theory.wikidot.com
http://studiocomments.wikidot.com
http://studiolynn.wikidot.com
http://suffadv.wikidot.com
http://summer350.wikidot.com
http://summerisle.wikidot.com
http://sunnybrook-academy.wikidot.com
http://superjet.wikidot.com
http://surreal64ce.wikidot.com
http://sw-gis.wikidot.com
http://swietomuzyki.wikidot.com
http://swwotc.wikidot.com
http://talesofhonor.wikidot.com
http://talkingpadproject.wikidot.com
http://task-management.wikidot.com
http://tasker.wikidot.com
http://tauren.wikidot.com
http://tech-racingcars.wikidot.com
http://techblog-template.wikidot.com
http://techcomm.wikidot.com
http://ten-sb.wikidot.com
http://terrasdeportugal.wikidot.com
http://tex.wikidot.com
http://textanalytics.wikidot.com
http://the-nexus.wikidot.com
http://theanarchstate.wikidot.com
http://theblightedworld.wikidot.com
http://thecollaboratory.wikidot.com
http://thegamerdome.wikidot.com
http://thekingkillerchronicle.wikidot.com
http://thelaststory.wikidot.com
http://themes.wikidot.com
http://thep-serc.wikidot.com
http://therafim.wikidot.com
http://therafimrpg.wikidot.com
http://thesimsonline.wikidot.com
http://theskyremains.wikidot.com
http://theunforgotten.wikidot.com
http://thewake.wikidot.com
http://theweird.wikidot.com
http://theweirdwest.wikidot.com
http://ti-iseg-t12.wikidot.com
http://ti-iseg-t19.wikidot.com
http://tibasicdev.wikidot.com
http://timidgirls.wikidot.com
http://tlug.wikidot.com
http://tlumaczenia.wikidot.com
http://tmduc.wikidot.com
http://tradewithsaint.wikidot.com
http://translate.wikidot.com
http://translators-forum.wikidot.com
http://trb-mux.wikidot.com
http://triathematician.wikidot.com
http://trueblood-dallas.wikidot.com
http://try.wikidot.com
http://ttu-dom.wikidot.com
http://tyf.wikidot.com
http://typesets.wikidot.com
http://ubmedicinefaqs.wikidot.com
http://ucsdgrads.wikidot.com
http://ukcw.wikidot.com
http://ultimatemutantsofgagetown.wikidot.com
http://umassenglishgrad.wikidot.com
http://uml.wikidot.com
http://underworldlarp.wikidot.com
http://uniofbeds.wikidot.com
http://urbanmobile.wikidot.com
http://uscta.wikidot.com
http://user-gemeinschaft.wikidot.com
http://usma387.wikidot.com
http://valeofcallus.wikidot.com
http://veritasbatheo.wikidot.com
http://videoart.wikidot.com
http://viotikoskosmos.wikidot.com
http://virtualwargamer.wikidot.com
http://viscomclass.wikidot.com
http://visual-records.wikidot.com
http://vitalusers.wikidot.com
http://vocaro.wikidot.com
http://vs-tcg.wikidot.com
http://vtls-vital.wikidot.com
http://vusb.wikidot.com
http://vwinterop.wikidot.com
http://vyprmedia.wikidot.com
http://w24.wikidot.com
http://wanderers-library-ko.wikidot.com
http://wanderers-library.wikidot.com
http://wanderers-sandbox.wikidot.com
http://warsztatywww.wikidot.com
http://web0.wikidot.com
http://webcomicauthority.wikidot.com
http://wfh.wikidot.com
http://whanethewhip.wikidot.com
http://whatever.wikidot.com
http://wherearethejoneses.wikidot.com
http://wikidot.com
http://wikiedresearch.wikidot.com
http://wikiethica.wikidot.com
http://wikim5s.wikidot.com
http://wikinorm.wikidot.com
http://wikiofscience.wikidot.com
http://wikirhye.wikidot.com
http://wikirmaphil.wikidot.com
http://wikistoriaenciclopedia.wikidot.com
http://wikitipsgr.wikidot.com
http://windycity.wikidot.com
http://wiwimush.wikidot.com
http://world.wikidot.com
http://wow-arrakis.wikidot.com
http://wpts.wikidot.com
http://wqa.wikidot.com
http://writ-111-office-hour-sign-up.wikidot.com
http://writingoneeleven.wikidot.com
http://wrtg1150.wikidot.com
http://wtg.wikidot.com
http://www-old.wikidot.com
http://wychwood.wikidot.com
http://xanadu.wikidot.com
http://y31.wikidot.com
http://ye-olde-music-industrapedia.wikidot.com
http://yo801106.wikidot.com
http://yyp.wikidot.com
http://zeroshell.wikidot.com
http://zmk.wikidot.com
http://zodiac-ffrpg.wikidot.com
http://zodiac-monster-manual.wikidot.com
http://zombiecafe.wikidot.com
http://zorya.wikidot.com

@ -0,0 +1,214 @@
arte
cine
lengua
literatura
matematicas
ingles
frances
aleman
ruso
idiomas
geografia
historia
secundaria
bachillerato
examen
examenes
profesor
educacion
profesores
historias
extremadura
andalucia
iberia
oceano
cultura
periodico
television
radio
italiano
polaco
chino
japones
coreano
musica
mozart
beethoven
asimov
newton
kilogramo
teoria
fisica
deporte
cancion
futbol
astronomia
telescopio
cuaderno
libro
texto
pizarra
descartes
galileo
fosiles
paisaje
fosil
paisajes
mar
oceano
espacio
meteorologia
nubes
religion
bandera
lengua
politica
biologia
quimica
medicina
tecnologia
diagrama
mapa
mapas
dibujos
pronunciacion
arquitectura
compositor
pintor
pintura
escultura
museo
biblioteca
museos
bibliotecas
enciclopedia
diccionario
filosofia
filosofos
feminismo
sociologia
leyes
coche
barco
avion
transporte
teatro
europa
america
africa
asia
oceania
australia
atlantico
mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -0,0 +1,86 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
words = []
with open('words.txt', 'r') as f:
words = f.read().strip().splitlines()
random.shuffle(words)
print('Loaded %d words from file' % (len(words)))
#words = words + ['%d' % (i) for i in range(1900, 1980, 10)]
wikis = []
with open('wikispaces-duckduckgo.txt', 'r') as f:
wikis = f.read().strip().splitlines()
wikis.sort()
print('Loaded %d wikis from file' % (len(wikis)))
for i in range(1, 100):
random.shuffle(words)
for word in words:
print('Word', word)
word_ = re.sub(' ', '+', word)
url = ''
r = random.randint(0, 10)
if r == 0:
url = 'https://duckduckgo.com/html/?q=%s%%20site:wikispaces.com' % (word_)
elif r == 1:
url = 'https://duckduckgo.com/html/?q=%s%%20wikispaces.com' % (word_)
elif r == 2:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
elif r == 3:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (random.randint(100, 3000), word_)
else:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
sys.exit()
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikispaces-duckduckgo.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https://www\.', 'https://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

File diff suppressed because it is too large

@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv import csv
import random
import re import re
import time import time
import urllib2 import urllib2
@ -88,6 +89,8 @@ def getWikis(user):
return {} return {}
def main(): def main():
sleep = 0.1
rand = 10
users = loadUsers() users = loadUsers()
wikis = loadWikis() wikis = loadWikis()
@ -112,11 +115,16 @@ def main():
c += 1 c += 1
print 'Found %s new users' % (c) print 'Found %s new users' % (c)
if c > 0: if c > 0:
saveUsers(users) if random.randint(0,rand) == 0:
users = loadUsers() saveUsers(users)
saveWikis(wikis) users = loadUsers()
time.sleep(1) if random.randint(0,rand) == 0:
saveWikis(wikis)
time.sleep(sleep)
saveWikis(wikis)
wikis = loadWikis() wikis = loadWikis()
saveUsers(users)
users = loadUsers()
# find more wikis # find more wikis
print 'Scanning users for more wikis' print 'Scanning users for more wikis'
@ -133,10 +141,15 @@ def main():
c += 1 c += 1
print 'Found %s new wikis' % (c) print 'Found %s new wikis' % (c)
if c > 0: if c > 0:
saveWikis(wikis) if random.randint(0,rand) == 0:
wikis = loadWikis() saveWikis(wikis)
saveUsers(users) wikis = loadWikis()
time.sleep(1) if random.randint(0,rand) == 0:
saveUsers(users)
time.sleep(sleep)
saveWikis(wikis)
wikis = loadWikis()
saveUsers(users)
users = loadUsers() users = loadUsers()
print '\nSummary:' print '\nSummary:'
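The hunks above swap unconditional saves for randomized checkpointing: with rand = 10, each batch persists the users/wikis lists only about one time in eleven, and a final unconditional save runs after the loop so nothing is lost. A generic sketch of the pattern (the save callback is hypothetical):

import random

def maybe_checkpoint(save, rand=10):
    # Persist roughly once every rand + 1 calls; callers should still invoke
    # save() unconditionally once their loop has finished.
    if random.randint(0, rand) == 0:
        save()
        return True
    return False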

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -0,0 +1,214 @@
arte
cine
lengua
literatura
matematicas
ingles
frances
aleman
ruso
idiomas
geografia
historia
secundaria
bachillerato
examen
examenes
profesor
educacion
profesores
historias
extremadura
andalucia
iberia
oceano
cultura
periodico
television
radio
italiano
polaco
chino
japones
coreano
musica
mozart
beethoven
asimov
newton
kilogramo
teoria
fisica
deporte
cancion
futbol
astronomia
telescopio
cuaderno
libro
texto
pizarra
descartes
galileo
fosiles
paisaje
fosil
paisajes
mar
oceano
espacio
meteorologia
nubes
religion
bandera
lengua
politica
biologia
quimica
medicina
tecnologia
diagrama
mapa
mapas
dibujos
pronunciacion
arquitectura
compositor
pintor
pintura
escultura
museo
biblioteca
museos
bibliotecas
enciclopedia
diccionario
filosofia
filosofos
feminismo
sociologia
leyes
coche
barco
avion
transporte
teatro
europa
america
africa
asia
oceania
australia
atlantico
mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria

@ -62,14 +62,14 @@ class TestDumpgenerator(unittest.TestCase):
tests = [ tests = [
# Alone wikis # Alone wikis
#['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'], #['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'], ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'], #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],
# Editthis wikifarm # Editthis wikifarm
# It has a page view limit # It has a page view limit
# Gamepedia wikifarm # Gamepedia wikifarm
['http://dawngate.gamepedia.com/index.php', 'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'], #['http://dawngate.gamepedia.com/index.php', 'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'],
# Neoseeker wikifarm # Neoseeker wikifarm
#['http://digimon.neoseeker.com/w/index.php', 'http://digimon.neoseeker.com/w/api.php', u'Ogremon card.png'], #['http://digimon.neoseeker.com/w/index.php', 'http://digimon.neoseeker.com/w/api.php', u'Ogremon card.png'],
@ -78,13 +78,13 @@ class TestDumpgenerator(unittest.TestCase):
#['http://mc.orain.org/w/index.php', 'http://mc.orain.org/w/api.php', u'Mojang logo.svg'], #['http://mc.orain.org/w/index.php', 'http://mc.orain.org/w/api.php', u'Mojang logo.svg'],
# Referata wikifarm # Referata wikifarm
['http://wikipapers.referata.com/w/index.php', 'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'], #['http://wikipapers.referata.com/w/index.php', 'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'],
# ShoutWiki wikifarm # ShoutWiki wikifarm
['http://commandos.shoutwiki.com/w/index.php', 'http://commandos.shoutwiki.com/w/api.php', u'Night of the Wolves loading.png'], #['http://commandos.shoutwiki.com/w/index.php', 'http://commandos.shoutwiki.com/w/api.php', u'Night of the Wolves loading.png'],
# Wiki-site wikifarm # Wiki-site wikifarm
['http://minlingo.wiki-site.com/index.php', 'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'], #['http://minlingo.wiki-site.com/index.php', 'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'],
# Wikkii wikifarm # Wikkii wikifarm
# It seems offline # It seems offline
@ -146,8 +146,8 @@ class TestDumpgenerator(unittest.TestCase):
print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73 print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
tests = [ tests = [
# Alone wikis # Alone wikis
['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'], ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'April Fools\' Day'],
['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'], #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],
# Test old allpages API behaviour # Test old allpages API behaviour
#['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'], #['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'],
@ -261,7 +261,11 @@ class TestDumpgenerator(unittest.TestCase):
] ]
for wiki, engine in tests: for wiki, engine in tests:
print 'Testing', wiki print 'Testing', wiki
guess_engine = getWikiEngine(wiki) try:
guess_engine = getWikiEngine(wiki)
except ConnectionError:
print "%s failed to load, skipping..." % (wiki)
continue
print 'Got: %s, expected: %s' % (guess_engine, engine) print 'Got: %s, expected: %s' % (guess_engine, engine)
self.assertEqual(guess_engine, engine) self.assertEqual(guess_engine, engine)
@ -269,14 +273,14 @@ class TestDumpgenerator(unittest.TestCase):
print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73 print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73
tests = [ tests = [
# Alone wikis # Alone wikis
['http://archiveteam.org', 'http://archiveteam.org/api.php', 'http://archiveteam.org/index.php'], ['https://www.archiveteam.org', 'https://www.archiveteam.org/api.php', 'https://www.archiveteam.org/index.php'],
['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'], #['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],
# Editthis wikifarm # Editthis wikifarm
# It has a page view limit # It has a page view limit
# Gamepedia wikifarm # Gamepedia wikifarm
['http://dawngate.gamepedia.com', 'http://dawngate.gamepedia.com/api.php', 'http://dawngate.gamepedia.com/index.php'], #['http://dawngate.gamepedia.com', 'http://dawngate.gamepedia.com/api.php', 'http://dawngate.gamepedia.com/index.php'],
# Neoseeker wikifarm # Neoseeker wikifarm
#['http://digimon.neoseeker.com', 'http://digimon.neoseeker.com/w/api.php', 'http://digimon.neoseeker.com/w/index.php'], #['http://digimon.neoseeker.com', 'http://digimon.neoseeker.com/w/api.php', 'http://digimon.neoseeker.com/w/index.php'],
@ -288,7 +292,7 @@ class TestDumpgenerator(unittest.TestCase):
# ['http://wikipapers.referata.com', 'http://wikipapers.referata.com/w/api.php', 'http://wikipapers.referata.com/w/index.php'], # ['http://wikipapers.referata.com', 'http://wikipapers.referata.com/w/api.php', 'http://wikipapers.referata.com/w/index.php'],
# ShoutWiki wikifarm # ShoutWiki wikifarm
['http://commandos.shoutwiki.com', 'http://commandos.shoutwiki.com/w/api.php', 'http://commandos.shoutwiki.com/w/index.php'], #['http://commandos.shoutwiki.com', 'http://commandos.shoutwiki.com/w/api.php', 'http://commandos.shoutwiki.com/w/index.php'],
# Wiki-site wikifarm # Wiki-site wikifarm
#['http://minlingo.wiki-site.com', 'http://minlingo.wiki-site.com/api.php', 'http://minlingo.wiki-site.com/index.php'], #['http://minlingo.wiki-site.com', 'http://minlingo.wiki-site.com/api.php', 'http://minlingo.wiki-site.com/index.php'],

@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import getopt import getopt
import argparse
import os import os
import re import re
import subprocess import subprocess
@ -30,89 +31,41 @@ from internetarchive import get_item
import dumpgenerator import dumpgenerator
# Configuration goes here
# You need a file named keys.txt with access and secret keys, in two different lines # You need a file named keys.txt with access and secret keys, in two different lines
accesskey = open('keys.txt', 'r').readlines()[0].strip() accesskey = open('keys.txt', 'r').readlines()[0].strip()
secretkey = open('keys.txt', 'r').readlines()[1].strip() secretkey = open('keys.txt', 'r').readlines()[1].strip()
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = 'opensource'
# Nothing to change below # Nothing to change below
convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'} convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
listfile = sys.argv[1]
uploadeddumps = []
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
def getParameters(params=[]):
if not params:
params = sys.argv[2:]
config = {
'prune-directories': False,
'prune-wikidump': False,
'collection': collection,
'update': False,
}
#console params
try:
opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin", "update"])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
for o, a in opts:
if o in ("-h","--help"):
usage()
sys.exit()
elif o in ("--prune-directories"):
config['prune-directories'] = True
elif o in ("--prune-wikidump"):
config['prune-wikidump'] = True
elif o in ("--admin"):
config['collection'] = "wikiteam"
elif o in ("--update"):
config['update'] = True
return config
def usage():
""" """
print """uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help."""
def log(wiki, dump, msg):
f = open('uploader-%s.log' % (listfile), 'a')
def log(wiki, dump, msg, config={}):
f = open('uploader-%s.log' % (config.listfile), 'a')
f.write('\n%s;%s;%s' % (wiki, dump, msg))
f.close()
def upload(wikis, config={}):
def upload(wikis, config={}, uploadeddumps=[]):
headers = {'User-Agent': dumpgenerator.getUserAgent()}
dumpdir = config.wikidump_dir
filelist = os.listdir(dumpdir)
for wiki in wikis:
print "#"*73
print "# Uploading", wiki
print "#"*73
wiki = wiki.lower()
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
configtemp = config
try:
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
except KeyError:
print "ERROR: could not produce the prefix for %s" % wiki
config = configtemp
wikiname = prefix.split('-')[0]
dumps = []
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for f in filenames:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
dumps.append(f)
for f in filelist:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
print "%s found" % f
dumps.append(f)
break
c = 0
@ -120,30 +73,33 @@ def upload(wikis, config={}):
wikidate = dump.split('-')[1]
item = get_item('wiki-' + wikiname)
if dump in uploadeddumps:
if config['prune-directories']:
if config.prune_directories:
rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
# With -f the deletion might have happened before and we won't know
if not os.system(rmline):
print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
if config.prune_wikidump and dump.endswith('wikidump.7z'):
# Simplistic quick&dirty check for the presence of this file in the item
stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
print "Checking content in previously uploaded files"
stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
dumphash = re.sub(' +.+\n?', '', stdout)
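# Illustrative sketch, not part of this patch: `md5sum file.7z` prints a line such as
# "d41d8cd98f00b204e9800998ecf8427e  file.7z", so the re.sub above strips everything
# after the hash. A hypothetical pure-Python equivalent would be:
#   import hashlib
#   dumphash = hashlib.md5(open(dumpdir + '/' + dump, 'rb').read()).hexdigest()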
if dumphash in map(lambda x: x['md5'], item.files):
log(wiki, dump, 'verified')
log(wiki, dump, 'verified', config)
rmline='rm -rf %s' % dump
rmline='rm -rf %s' % dumpdir + '/' + dump
if not os.system(rmline):
print 'DELETED ' + dump
print 'DELETED ' + dumpdir + '/' + dump
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print 'ERROR: The online item misses ' + dump
log(wiki, dump, 'missing')
log(wiki, dump, 'missing', config)
# We'll exit this if and go upload the dump
else:
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print '%s was not uploaded before' % dump
time.sleep(0.1)
wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
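# For example (illustrative, hypothetical filename): a dump named
# wikiexampleorg_w-20180115-history.xml.7z gives wikidate = '20180115', which the
# slicing above turns into wikidate_text = '2018-01-15'.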
@ -155,7 +111,7 @@ def upload(wikis, config={}):
# Logo path
logourl = ''
if ismissingitem or config['update']:
if ismissingitem or config.update:
#get metadata from api.php
#first sitename and base url
params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
@ -163,7 +119,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -198,7 +154,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -214,7 +170,7 @@ def upload(wikis, config={}):
raw = ''
try:
f = urllib.urlopen(baseurl)
f = urllib.urlopen(baseurl, timeout=10)
raw = f.read()
f.close()
except:
@ -238,7 +194,6 @@ def upload(wikis, config={}):
logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
except:
pass
print logourl
#retrieve some info from the wiki
wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
@ -264,7 +219,7 @@ def upload(wikis, config={}):
# Item metadata
md = {
'mediatype': 'web',
'collection': config['collection'],
'collection': config.collection,
'title': wikititle,
'description': wikidesc,
'language': lang,
@ -277,25 +232,54 @@ def upload(wikis, config={}):
#Upload files and update metadata
try:
item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
item.modify_metadata(md) # update
print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
uploadeddumps.append(dump)
log(wiki, dump, 'ok', config)
if logourl:
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl), timeout=10).read())
logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
uploadeddumps.append(dump)
log(wiki, dump, 'ok')
except:
print wiki, dump, 'error when uploading?'
except Exception as e:
print wiki, dump, 'Error when uploading?'
print e.message
c += 1
def main(params=[]):
config = getParameters(params=params)
parser = argparse.ArgumentParser("""uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help.""")
parser.add_argument('-pd', '--prune_directories', action='store_true')
parser.add_argument('-pw', '--prune_wikidump', action='store_true')
parser.add_argument('-a', '--admin', action='store_true')
parser.add_argument('-c', '--collection', default='opensource')
parser.add_argument('-wd', '--wikidump_dir', default='.')
parser.add_argument('-u', '--update', action='store_true')
parser.add_argument('listfile')
config = parser.parse_args()
if config.admin:
config.collection = 'wikiteam'
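# Illustrative usage, not part of this patch (paths and filenames are hypothetical):
#   python uploader.py mywikis.txt --prune_wikidump --wikidump_dir /data/dumps
#   python uploader.py mywikis.txt --admin   # upload into the wikiteam collection
# The short options defined above (-pw, -wd, -a, ...) work the same way.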
uploadeddumps = []
listfile = config.listfile
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
wikis = open(listfile, 'r').read().strip().splitlines()
upload(wikis, config)
upload(wikis, config, uploadeddumps)
if __name__ == "__main__":
main()

@ -24,7 +24,7 @@ def main():
site = pywikibot.Site('wikiapiary', 'wikiapiary')
catname = 'Category:Website'
cat = pywikibot.Category(site, catname)
gen = pagegenerators.CategorizedPageGenerator(cat, start='Spyropedia')
gen = pagegenerators.CategorizedPageGenerator(cat, start='!')
pre = pagegenerators.PreloadingGenerator(gen)
for page in pre:
@ -52,7 +52,8 @@ def main():
print('No API found in WikiApiary, skipping')
continue
urliasearch = 'https://archive.org/search.php?query=originalurl:"%s"' % (apiurl)
indexurl = 'index.php'.join(apiurl.rsplit('api.php', 1))
urliasearch = 'https://archive.org/search.php?query=originalurl:"%s" OR originalurl:"%s"' % (apiurl, indexurl)
f = urllib.request.urlopen(urliasearch)
raw = f.read().decode('utf-8')
if re.search(r'(?i)Your search did not match any items', raw):

@ -0,0 +1,458 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com
import csv
import datetime
import os
import random
import re
import subprocess
import sys
import time
import urllib.request
#from internetarchive import get_item
# Requirements:
# zip command (apt-get install zip)
# ia command (pip install internetarchive, and configured properly)
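# Illustrative setup, not part of this script (commands assume a Debian-like system):
#   sudo apt-get install zip
#   pip install internetarchive
#   ia configure   # stores your archive.org credentials for the `ia` command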
"""
# You need a file with access and secret keys, in two different lines
iakeysfilename = '%s/.iakeys' % (os.path.expanduser('~'))
if os.path.exists(iakeysfilename):
accesskey = open(iakeysfilename, 'r').readlines()[0].strip()
secretkey = open(iakeysfilename, 'r').readlines()[1].strip()
else:
print('Error, no %s file with S3 keys for Internet Archive account' % (iakeysfilename))
sys.exit()
"""
def saveURL(wikidomain='', url='', filename='', path='', overwrite=False, iteration=1):
filename2 = '%s/%s' % (wikidomain, filename)
if path:
filename2 = '%s/%s/%s' % (wikidomain, path, filename)
if os.path.exists(filename2):
if not overwrite:
print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
return
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
try:
urllib.request.urlretrieve(url, filename2)
except:
sleep = 10 # seconds
maxsleep = 30
while sleep <= maxsleep:
try:
print('Error while retrieving: %s' % (url))
print('Retry in %s seconds...' % (sleep))
time.sleep(sleep)
urllib.request.urlretrieve(url, filename2)
return
except:
sleep = sleep * 2
print('Download failed')
# Sometimes Wikispaces returns invalid data; re-download in those cases.
# Only for 'pages'; 'files' binaries are a pain to open and check.
if (os.path.exists(filename2) and 'pages' in path) or \
(os.path.exists(filename2) and path == '' and filename2.split('.')[-1] in ['xml', 'html', 'csv']):
sleep2 = 60 * iteration
raw = ''
try:
with open(filename2, 'r', encoding='utf-8') as f:
raw = f.read()
except:
with open(filename2, 'r', encoding='latin-1') as f:
raw = f.read()
if re.findall(r'(?im)<title>TES and THE Status</title>', raw):
print('Warning: invalid content. Waiting %d seconds and re-downloading' % (sleep2))
time.sleep(sleep2)
saveURL(wikidomain=wikidomain, url=url, filename=filename, path=path, overwrite=overwrite, iteration=iteration+1)
def undoHTMLEntities(text=''):
""" Undo some HTML codes """
# I guess only < > & " ' need conversion
# http://www.w3schools.com/html/html_entities.asp
text = re.sub('&lt;', '<', text)
text = re.sub('&gt;', '>', text)
text = re.sub('&amp;', '&', text)
text = re.sub('&quot;', '"', text)
text = re.sub('&#039;', '\'', text)
return text
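# Example (illustrative): undoHTMLEntities(text='&quot;R&amp;D&quot;') returns '"R&D"';
# only the five entities handled above are decoded.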
def convertHTML2Wikitext(wikidomain='', filename='', path=''):
wikitext = ''
wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
if not os.path.exists(wikitextfile):
print('Error retrieving wikitext; the page is probably a redirect')
return
with open(wikitextfile, 'r') as f:
wikitext = f.read()
with open(wikitextfile, 'w') as f:
m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
if m:
try:
wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
wikitext = undoHTMLEntities(text=wikitext)
except:
pass
f.write(wikitext)
def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
pagenameplus = re.sub(' ', '+', pagename)
pagename_ = urllib.parse.quote(pagename)
#page current revision (html & wikitext)
pageurl = '%s/%s' % (wikiurl, pagename_)
filename = '%s.html' % (pagenameplus)
print('Downloading page: %s' % (filename))
saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
filename2 = '%s.wikitext' % (pagenameplus)
print('Downloading page: %s' % (filename2))
saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')
#csv with page history
csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
csvfilename = '%s.history.csv' % (pagenameplus)
print('Downloading page: %s' % (csvfilename))
saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)
def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
filenameplus = re.sub(' ', '+', filename)
filename_ = urllib.parse.quote(filename)
#file full resolution
fileurl = '%s/file/view/%s' % (wikiurl, filename_)
filename = filenameplus
print('Downloading file: %s' % (filename))
saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)
#csv with file history
csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
csvfilename = '%s.history.csv' % (filenameplus)
print('Downloading file: %s' % (csvfilename))
saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)
def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
print('Downloading Pages and Files from %s' % (wikiurl))
#csv all pages and files
csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
#download every page and file
totallines = 0
with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
totallines = len(f.read().splitlines()) - 1
with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
filesc = 0
pagesc = 0
print('This wiki has %d pages and files' % (totallines))
rows = csv.reader(csvfile, delimiter=',', quotechar='"')
for row in rows:
if row[0] == 'file':
filesc += 1
filename = row[1]
downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
elif row[0] == 'page':
pagesc += 1
pagename = row[1]
downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
if (filesc + pagesc) % 10 == 0:
print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
print('Downloaded %d pages' % (pagesc))
print('Downloaded %d files' % (filesc))
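# Illustrative note: pages-and-files.csv as returned by Wikispaces is assumed to hold one
# row per item, with the type in the first column and the name in the second, e.g.
#   page,Home
#   file,logo.png
# (hypothetical rows), which is what the row[0]/row[1] checks above rely on.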
def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
print('Downloading sitemap.xml')
saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)
def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
print('Downloading index.html')
saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)
def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
index = '%s/index.html' % (wikidomain)
if os.path.exists(index):
raw = ''
try:
with open(index, 'r', encoding='utf-8') as f:
raw = f.read()
except:
with open(index, 'r', encoding='latin-1') as f:
raw = f.read()
m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', raw)
if m:
logourl = m[0]
logofilename = logourl.split('/')[-1]
print('Downloading logo')
saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
return logofilename
return ''
def printhelp():
helptext = """This script downloads (and uploads) WikiSpaces wikis.
Parameters available:
--upload: upload compressed file with downloaded wiki
--admin: add item to WikiTeam collection (if you are an admin in that collection)
--overwrite: download again even if files exist locally
--overwrite-ia: upload again to Internet Archive even if item exists there
--help: prints this help text
Examples:
python3 wikispaces.py https://mywiki.wikispaces.com
It downloads that wiki
python3 wikispaces.py wikis.txt
It downloads a list of wikis (file format is a URL per line)
python3 wikispaces.py https://mywiki.wikispaces.com --upload
It downloads that wiki, compresses it and uploads it to the Internet Archive
"""
print(helptext)
sys.exit()
def duckduckgo():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
wikis = []
ignorewikis = [
'https://wikispaces.com',
'https://www.wikispaces.com',
'https://wikispaces.net',
'https://www.wikispaces.net',
]
for i in range(1, 100000):
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikispaces.com' % (random.randint(100, 5000), random.randint(1000, 9999))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
wiki = re.sub(r'https://www\.', 'https://', wiki)
if not wiki in wikis and not wiki in ignorewikis:
wikis.append(wiki)
yield wiki
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
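# Illustrative usage, not part of this script: the generator above can be consumed lazily,
# discovering wikis one by one from the search results:
#   for wiki in duckduckgo():
#       print(wiki)   # e.g. https://somewiki.wikispaces.com (hypothetical)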
def main():
upload = False
isadmin = False
overwrite = False
overwriteia = False
if len(sys.argv) < 2:
printhelp()
param = sys.argv[1]
if not param:
printhelp()
if len(sys.argv) > 2:
if '--upload' in sys.argv:
upload = True
if '--admin' in sys.argv:
isadmin = True
if '--overwrite' in sys.argv:
overwrite = True
if '--overwrite-ia' in sys.argv:
overwriteia = True
if '--help' in sys.argv:
printhelp()
wikilist = []
if '://' in param:
wikilist.append(param.rstrip('/'))
elif param.lower() == 'duckduckgo':
wikilist = duckduckgo()
#for wiki in wikilist:
# print(wiki)
else:
with open(param, 'r') as f:
wikilist = f.read().strip().splitlines()
wikilist2 = []
for wiki in wikilist:
wikilist2.append(wiki.rstrip('/'))
wikilist = wikilist2
for wikiurl in wikilist:
wikidomain = wikiurl.split('://')[1].split('/')[0]
print('\n')
print('#'*40,'\n Downloading:', wikiurl)
print('#'*40,'\n')
if upload and not overwriteia:
itemid = 'wiki-%s' % (wikidomain)
try:
iahtml = ''
try:
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
except:
time.sleep(10)
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
if iahtml and not re.findall(r'(?im)Item cannot be found', iahtml):
if not overwriteia:
print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
print('You can find it in https://archive.org/details/%s' % (itemid))
time.sleep(1)
continue
except:
pass
dirfiles = '%s/files' % (wikidomain)
if not os.path.exists(dirfiles):
print('Creating directory %s' % (dirfiles))
os.makedirs(dirfiles)
dirpages = '%s/pages' % (wikidomain)
if not os.path.exists(dirpages):
print('Creating directory %s' % (dirpages))
os.makedirs(dirpages)
sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
if not os.path.exists('%s/sitemap.xml' % (wikidomain)):
print('Error, wiki was probably deleted. Skipping wiki...')
continue
else:
sitemapraw = ''
try:
with open('%s/sitemap.xml' % (wikidomain), encoding='utf-8') as g:
sitemapraw = g.read()
except:
with open('%s/sitemap.xml' % (wikidomain), encoding='latin-1') as g:
sitemapraw = g.read()
if re.search(r'(?im)<h1>This wiki has been deactivated</h1>', sitemapraw):
print('Error, wiki was deactivated. Skipping wiki...')
continue
downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
if not os.path.exists('%s/index.html' % (wikidomain)):
print('Error, wiki was probably deleted or expired. Skipping wiki...')
continue
else:
indexraw = ''
try:
with open('%s/index.html' % (wikidomain), encoding='utf-8') as g:
indexraw = g.read()
except:
with open('%s/index.html' % (wikidomain), encoding='latin-1') as g:
indexraw = g.read()
if re.search(r'(?im)<h1>Subscription Expired</h1>', indexraw):
print('Error, wiki subscription expired. Skipping wiki...')
continue
downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
if upload:
itemid = 'wiki-%s' % (wikidomain)
print('\nCompressing dump...')
wikidir = wikidomain
os.chdir(wikidir)
print('Changed directory to', os.getcwd())
wikizip = '%s.zip' % (wikidomain)
subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
os.chdir('..')
print('Changed directory to', os.getcwd())
print('\nUploading to Internet Archive...')
indexfilename = '%s/index.html' % (wikidir)
if not os.path.exists(indexfilename):
print('\nError: dump incomplete, skipping upload\n')
continue
indexhtml = ''
try:
with open(indexfilename, 'r', encoding='utf-8') as f:
indexhtml = f.read()
except:
with open(indexfilename, 'r', encoding='latin-1') as f:
indexhtml = f.read()
wikititle = ''
try:
wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
except:
wikititle = wikidomain
if not wikititle:
wikititle = wikidomain
wikititle = wikititle.replace("\\'", " ")
wikititle = wikititle.replace('\\"', " ")
itemtitle = 'Wiki - %s' % wikititle
itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
itemoriginalurl = wikiurl
itemlicenseurl = ''
m = ''
try:
m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
except:
m = ''
if m:
itemlicenseurl = m[0]
if not itemlicenseurl:
itemtags.append('unknowncopyright')
itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
itemcollection = isadmin and 'wikiteam' or 'opensource'
itemlang = 'Unknown'
itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''
callplain = "ia upload %s %s %s --metadata='mediatype:web' --metadata='collection:%s' --metadata='title:%s' --metadata='description:%s' --metadata='language:%s' --metadata='last-updated-date:%s' --metadata='originalurl:%s' %s %s" % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemoriginalurl, itemlicenseurl and "--metadata='licenseurl:%s'" % (itemlicenseurl) or '', itemtags_)
print(callplain)
subprocess.call(callplain, shell=True)
"""
md = {
'mediatype': 'web',
'collection': itemcollection,
'title': itemtitle,
'description': itemdesc,
'language': itemlang,
'last-updated-date': itemdate,
'subject': '; '.join(itemtags),
'licenseurl': itemlicenseurl,
'originalurl': itemoriginalurl,
}
item = get_item(itemid)
item.upload(wikizip, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
item.modify_metadata(md)
if itemlogo:
item.upload(itemlogo, access_key=accesskey, secret_key=secretkey, verbose=True)
"""
print('You can find it in https://archive.org/details/%s' % (itemid))
os.remove(wikizip)
if __name__ == "__main__":
main()

@ -228,7 +228,11 @@ def mwGetImageNamesAPI(config={}):
url = mwCurateImageURL(config=config, url=url)
# encoding to ascii is needed to work around this horrible bug:
# http://bugs.python.org/issue8136
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
if 'mwapi' in config and '.wikia.com' in config['mwapi']:
#to avoid latest?cb=20120816112532 in filenames
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')
else:
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
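# Illustrative example (hypothetical URL): for a Wikia image URL like
#   http://img3.wikia.nocookie.net/__cb20120816112532/somewiki/images/a/ab/Example_image.png/revision/latest?cb=20120816112532
# url.split('/')[-1] is 'latest?cb=20120816112532' while url.split('/')[-3] recovers
# 'Example_image.png', hence the special case above.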
uploader = re.sub('_', ' ', image['user'])
imagenames.append([filename, url, uploader])
else:
