pull/346/head
emijrp 5 years ago
commit aecee2dc53

.gitattributes

@ -0,0 +1,2 @@
*.com linguist-vendored
*.org linguist-vendored

@ -4,3 +4,5 @@ install:
- pip install tox
script:
- tox
notifications:
email: false

@ -1,7 +1,7 @@
# WikiTeam
### We archive wikis, from Wikipedia to tiniest wikis
**WikiTeam software is a set of tools for archiving wikis.** They work on MediaWiki wikis, but we want to expand to other wiki engines. As of January 2016, WikiTeam has preserved more than [27,000 stand-alone wikis](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups), several wikifarms, regular Wikipedia dumps and [34 TB of Wikimedia Commons images](https://archive.org/details/wikimediacommons).
**WikiTeam software is a set of tools for archiving wikis.** They work on MediaWiki wikis, but we want to expand to other wiki engines. As of 2019, WikiTeam has preserved more than [250,000 wikis](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups), several wikifarms, regular Wikipedia dumps and [34 TB of Wikimedia Commons images](https://archive.org/details/wikimediacommons).
There are [thousands](http://wikiindex.org) of [wikis](https://wikiapiary.com) in the Internet. Every day some of them are no longer publicly available and, due to lack of backups, lost forever. Millions of people download tons of media files (movies, music, books, etc) from the Internet, serving as a kind of distributed backup. Wikis, most of them under free licenses, disappear from time to time because nobody grabbed a copy of them. That is a shame that we would like to solve.
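For context, dumpgenerator.py is the usual entry point for these tools. A minimal sketch of how it can be driven from Python, in the style of launcher.py further down in this changeset, follows; the wiki URL is a placeholder and only flags that already exist in this repository are used.

# Minimal sketch: drive dumpgenerator.py the way launcher.py does.
# The API URL below is a placeholder, not a real target.
import subprocess

wiki = 'https://wiki.example.org/w/api.php'
subprocess.call('./dumpgenerator.py --api=%s --xml --images --delay=1' % wiki, shell=True)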

File diff suppressed because it is too large

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# dumpgenerator.py A generator of dumps for wikis
- # Copyright (C) 2011-2016 WikiTeam developers
+ # Copyright (C) 2011-2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -20,7 +20,7 @@
# https://github.com/WikiTeam/wikiteam/wiki
try:
- from kitchen.text.converters import getwriter
+ from kitchen.text.converters import getwriter, to_unicode
except ImportError:
print "Please install the kitchen module."
import cookielib
@ -39,17 +39,31 @@ except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import os
import re
+ import subprocess
try:
import requests
except ImportError:
print "Please install or update the Requests module."
sys.exit(1)
try:
import wikitools
except ImportError:
print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
try:
from lxml import etree
from lxml.builder import E
except ImportError:
print "Please install the lxml module if you want to use --xmlrevisions."
import time
import urllib
+ try:
+ from urlparse import urlparse, urlunparse
+ except ImportError:
+ from urllib.parse import urlparse, urlunparse
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
- __VERSION__ = '0.3.0-alpha' # major, minor, micro: semver.org
+ __VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
class PageMissingError(Exception):
def __init__(self, title, xml):
@ -150,7 +164,7 @@ def getNamespacesScraper(config={}, session=None):
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
r = session.post(
- url=config['index'], data={'title': 'Special:Allpages'})
+ url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
raw = r.text
delay(config=config, session=session)
@ -187,33 +201,41 @@ def getNamespacesAPI(config={}, session=None):
if namespaces:
r = session.post(
url=config['api'],
- data={
+ params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
- 'format': 'json'}
+ 'format': 'json'},
+ timeout=30
)
result = getJSON(r)
delay(config=config, session=session)
try:
nsquery = result['query']['namespaces']
except KeyError:
print "Error: could not get namespaces from the API request"
print "HTTP %d" % r.status_code
print r.text
return None
if 'all' in namespaces:
namespaces = []
- for i in result['query']['namespaces'].keys():
+ for i in nsquery.keys():
if int(i) < 0: # -1: Special, -2: Media, excluding
continue
namespaces.append(int(i))
- namespacenames[int(i)] = result['query']['namespaces'][i]['*']
+ namespacenames[int(i)] = nsquery[i]['*']
else:
# check if those namespaces really exist in this wiki
namespaces2 = []
- for i in result['query']['namespaces'].keys():
+ for i in nsquery.keys():
bi = i
i = int(i)
if i < 0: # -1: Special, -2: Media, excluding
continue
if i in namespaces:
namespaces2.append(i)
- namespacenames[i] = result['query']['namespaces'][bi]['*']
+ namespacenames[i] = nsquery[bi]['*']
namespaces = namespaces2
else:
namespaces = [0]
@ -249,7 +271,7 @@ def getPageTitlesAPI(config={}, session=None):
retryCount = 0
while retryCount < config["retries"]:
try:
- r = session.post(url=config['api'], data=params)
+ r = session.post(url=config['api'], data=params, timeout=30)
break
except ConnectionError as err:
print "Connection error: %s" % (str(err),)
@ -271,21 +293,27 @@ def getPageTitlesAPI(config={}, session=None):
apfrom = jsontitles['continue']['apcontinue']
elif 'apfrom' in jsontitles['continue']:
apfrom = jsontitles['continue']['apfrom']
# print apfrom
# print jsontitles
- allpages = jsontitles['query']['allpages']
+ try:
+ allpages = jsontitles['query']['allpages']
+ except KeyError:
+ print "The allpages API returned nothing. Exit."
+ sys.exit(1)
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
for page in allpages:
- yield page['title']
+ title = page['title']
+ titles.append(title)
+ yield title
c += len(allpages)
if len(titles) != len(set(titles)):
- # probably we are in a loop, server returning dupe titles, stop
- # it
- print 'Probably a loop, finishing'
+ print 'Probably a loop, switching to next namespace. Duplicate title:'
+ print title
titles = list(set(titles))
apfrom = ''
@ -301,7 +329,7 @@ def getPageTitlesScraper(config={}, session=None):
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (
config['index'], namespace)
- r = session.get(url=url)
+ r = session.get(url=url, timeout=30)
raw = r.text
raw = cleanHTML(raw)
@ -353,7 +381,7 @@ def getPageTitlesScraper(config={}, session=None):
# to avoid reload dupe subpages links
checked_suballpages.append(name)
delay(config=config, session=session)
- r2 = session.get(url=url)
+ r2 = session.get(url=url, timeout=10)
raw2 = r2.text
raw2 = cleanHTML(raw2)
rawacum += raw2 # merge it after removed junk
@ -386,13 +414,11 @@ def getPageTitles(config={}, session=None):
titles = []
if 'api' in config and config['api']:
- r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
- test = getJSON(r)
- if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
- and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
- titles = getPageTitlesScraper(config=config, session=session)
- else:
+ try:
titles = getPageTitlesAPI(config=config, session=session)
+ except:
+ print "Error: could not get page titles from the API"
+ titles = getPageTitlesScraper(config=config, session=session)
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
@ -412,7 +438,7 @@ def getPageTitles(config={}, session=None):
print '%d page titles loaded' % (c)
return titlesfilename
def getImageNames(config={}, session=None):
""" Get list of image names """
@ -436,39 +462,60 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
- try:
- xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
- except PageMissingError as pme:
- # The <page> does not exist. Not a problem, if we get the <siteinfo>.
- xml = pme.xml
- # Issue 26: Account for missing "Special" namespace.
- # Hope the canonical special name has not been removed.
- # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
- except ExportAbortedError:
- try:
- if config['api']:
- print "Trying the local name for the Special namespace instead"
- r = session.post(
- url=config['api'],
- data={
- 'action': 'query',
- 'meta': 'siteinfo',
- 'siprop': 'namespaces',
- 'format': 'json'}
- )
- config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
- + ':Export'
- xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
- except PageMissingError as pme:
- xml = pme.xml
- except ExportAbortedError:
- pass
+ print config['api']
+ if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+ xml = None
+ try:
+ print 'Getting the XML header from the API'
+ r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
+ xml = r.json()['query']['export']['*']
+ if not xml:
+ r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
+ xml = r.text
+ except requests.exceptions.RetryError:
+ pass
+ else:
+ try:
+ xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+ except PageMissingError as pme:
+ # The <page> does not exist. Not a problem, if we get the <siteinfo>.
+ xml = pme.xml
+ # Issue 26: Account for missing "Special" namespace.
+ # Hope the canonical special name has not been removed.
+ # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
+ except ExportAbortedError:
+ try:
+ if config['api']:
+ print "Trying the local name for the Special namespace instead"
+ r = session.post(
+ url=config['api'],
+ params={
+ 'action': 'query',
+ 'meta': 'siteinfo',
+ 'siprop': 'namespaces',
+ 'format': 'json'},
+ timeout=120
+ )
+ config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ + ':Export'
+ xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+ except PageMissingError as pme:
+ xml = pme.xml
+ except ExportAbortedError:
+ pass
header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml):
- print 'XML export on this wiki is broken, quitting.'
- logerror(u'XML export on this wiki is broken, quitting.')
- sys.exit()
+ if config['xmlrevisions']:
+ # Try again the old way
+ print 'Export test via the API failed. Wiki too old? Trying without xmlrevisions.'
+ config['xmlrevisions'] = False
+ header, config = getXMLHeader(config=config, session=session)
+ else:
+ print 'XML export on this wiki is broken, quitting.'
+ logerror(u'XML export on this wiki is broken, quitting.')
+ sys.exit()
return header, config
@ -512,7 +559,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
- print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['pages'], wait)
+ print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# limit = 1 from mother function)
@ -521,6 +568,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c >= maxretries:
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
+ if config['failfast']:
+ print "Exit, it will be for another time"
+ sys.exit()
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
@ -550,7 +600,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
- r = session.post(url=config['index'], data=params, headers=headers)
+ r = session.post(url=config['index'], params=params, headers=headers, timeout=10)
handleStatusCode(r)
xml = fixBOM(r)
except requests.exceptions.ConnectionError as e:
@ -675,10 +725,9 @@ def cleanXML(xml=''):
def generateXMLDump(config={}, titles=[], start=None, session=None):
- """ Generates a XML dump for a list of titles """
+ """ Generates a XML dump for a list of titles or from revision IDs """
# TODO: titles is now unused.
- print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header, config = getXMLHeader(config=config, session=session)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
@ -686,48 +735,189 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
config['curonly'] and 'current' or 'history')
xmlfile = ''
lock = True
- if start:
- print "Removing the last chunk of past XML dump: it is probably incomplete."
- for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
- pass
- else:
- # requested complete xml dump
- lock = False
- xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
- xmlfile.write(header.encode('utf-8'))
- xmlfile.close()
- xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
- c = 1
- for title in readTitles(config, start):
- if not title.strip():
- continue
- if title == start: # start downloading from start, included
- lock = False
- if lock:
- continue
- delay(config=config, session=session)
- if c % 10 == 0:
- print 'Downloaded %d pages' % (c)
- try:
- for xml in getXMLPage(config=config, title=title, session=session):
- xml = cleanXML(xml=xml)
- xmlfile.write(xml.encode('utf-8'))
- except PageMissingError:
- logerror(
- config=config,
- text=u'The page "%s" was missing in the wiki (probably deleted)' %
- (title.decode('utf-8'))
- )
- # here, XML is a correct <page> </page> chunk or
- # an empty string due to a deleted page (logged in errors log) or
- # an empty string due to an error while retrieving the page from server
- # (logged in errors log)
- c += 1
+ if config['xmlrevisions']:
+ print 'Retrieving the XML for every page from the beginning'
+ xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+ xmlfile.write(header.encode('utf-8'))
+ try:
+ r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+ for xml in getXMLRevisions(config=config, session=session):
+ numrevs = len(re.findall(r_timestamp, xml))
+ # Due to how generators work, it's expected this may be less
+ print "%d more revisions exported" % numrevs
+ xml = cleanXML(xml=xml)
+ xmlfile.write(xml.encode('utf-8'))
+ except AttributeError:
+ print "This wikitools module version is not working"
+ sys.exit()
+ else:
+ print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
+ if start:
+ print "Removing the last chunk of past XML dump: it is probably incomplete."
+ for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
+ pass
+ else:
+ # requested complete xml dump
+ lock = False
+ xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+ xmlfile.write(header.encode('utf-8'))
+ xmlfile.close()
+ xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
+ c = 1
+ for title in readTitles(config, start):
+ if not title.strip():
+ continue
+ if title == start: # start downloading from start, included
+ lock = False
+ if lock:
+ continue
+ delay(config=config, session=session)
+ if c % 10 == 0:
+ print 'Downloaded %d pages' % (c)
+ try:
+ for xml in getXMLPage(config=config, title=title, session=session):
+ xml = cleanXML(xml=xml)
+ xmlfile.write(xml.encode('utf-8'))
+ except PageMissingError:
+ logerror(
+ config=config,
+ text=u'The page "%s" was missing in the wiki (probably deleted)' %
+ (title.decode('utf-8'))
+ )
+ # here, XML is a correct <page> </page> chunk or
+ # an empty string due to a deleted page (logged in errors log) or
+ # an empty string due to an error while retrieving the page from server
+ # (logged in errors log)
+ c += 1
xmlfile.write(footer)
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def getXMLRevisions(config={}, session=None, allpages=False):
site = wikitools.wiki.Wiki(config['api'])
if not 'all' in config['namespaces']:
namespaces = config['namespaces']
else:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
try:
for namespace in namespaces:
print "Trying to export all revisions from namespace %s" % namespace
arvparams = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 500,
'arvnamespace': namespace
}
if not config['curonly']:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
arvrequest = wikitools.api.APIRequest(site, arvparams)
results = arvrequest.queryGen()
for result in results:
for page in result['query']['allrevisions']:
yield makeXmlFromPage(page)
else:
# Just cycle through revision IDs and use the XML as is
arvparams['arvprop'] = 'ids'
arvrequest = wikitools.api.APIRequest(site, arvparams)
arvresults = arvrequest.queryGen()
for result in arvresults:
revids = []
for page in result['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
print "%d more revisions listed, until %s" % (len(revids), revids[-1])
exportparams = {
'action': 'query',
'revids': '|'.join(revids),
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
except KeyError:
print "Warning. Could not use allrevisions, wiki too old."
if config['curonly']:
for title in readTitles(config):
exportparams = {
'action': 'query',
'titles': title,
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
else:
for title in readTitles(config):
pparams = {
'action': 'query',
'titles': title,
'prop': 'revisions',
'rvlimit': 'max',
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
'rawcontinue': 'yes'
}
prequest = wikitools.api.APIRequest(site, pparams)
try:
results = prequest.query()
pages = results['query']['pages']
except KeyError:
raise PageMissingError(title, xml='')
for page in pages:
try:
xml = makeXmlFromPage(pages[page])
except PageMissingError:
logerror(
config=config,
text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
)
continue
yield xml
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
sys.exit()
def makeXmlFromPage(page):
""" Output an XML document as a string from a page as in the API JSON """
try:
p = E.page(
E.title(page['title']),
E.ns(to_unicode(page['ns'])),
E.id(to_unicode(page['pageid'])),
)
for rev in page['revisions']:
revision = E.revision(
E.id(to_unicode(rev['revid'])),
E.parentid(to_unicode(rev['parentid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(to_unicode(rev['userid'])),
E.username(to_unicode(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
)
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
# The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
if 'sha1' in rev:
revision.append(E.sha1(rev['sha1']))
p.append(revision)
except KeyError:
raise PageMissingError(page['title'], '')
return etree.tostring(p, pretty_print=True)
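For reference, the API traffic that getXMLRevisions drives through wikitools can be sketched with plain requests: list revision IDs with list=allrevisions, then ask the API to export them. The endpoint below is a placeholder and the sketch only reuses parameters already visible above (arvlimit, arvprop, export, exportnowrap); it is illustrative, not part of this commit.

# Hedged sketch of the --xmlrevisions round trip using requests only.
import requests

api = 'https://wiki.example.org/w/api.php'  # placeholder endpoint
session = requests.Session()

# 1) Enumerate some revision IDs in the main namespace.
arv = session.get(api, params={
    'action': 'query',
    'list': 'allrevisions',
    'arvlimit': 50,
    'arvprop': 'ids',
    'arvnamespace': 0,
    'format': 'json'}, timeout=30).json()
revids = [str(rev['revid'])
          for page in arv['query']['allrevisions']
          for rev in page['revisions']]

# 2) Ask the API to export those revisions as <mediawiki> XML.
export = session.get(api, params={
    'action': 'query',
    'revids': '|'.join(revids),
    'export': '1',
    'exportnowrap': '1'}, timeout=30)
print(export.text[:200])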
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
@ -863,10 +1053,11 @@ def getImageNamesScraper(config={}, session=None):
# http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
r = session.post(
url=config['index'],
- data={
+ params={
'title': 'Special:Imagelist',
'limit': limit,
- 'offset': offset})
+ 'offset': offset},
+ timeout=30)
raw = r.text
delay(config=config, session=session)
# delicate wiki
@ -967,7 +1158,7 @@ def getImageNamesAPI(config={}, session=None):
'format': 'json',
'ailimit': 500}
# FIXME Handle HTTP Errors HERE
- r = session.post(url=config['api'], data=params)
+ r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1025,7 +1216,7 @@ def getImageNamesAPI(config={}, session=None):
'iiprop': 'user|url',
'format': 'json'}
# FIXME Handle HTTP Errors HERE
- r = session.post(url=config['api'], data=params)
+ r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1112,10 +1303,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
# saving description if any
try:
title = u'Image:%s' % (filename)
- xmlfiledesc = getXMLFileDesc(
- config=config,
- title=title,
- session=session) # use Image: for backwards compatibility
+ if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
+ r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
+ xmlfiledesc = r.text
+ else:
+ xmlfiledesc = getXMLFileDesc(
+ config=config,
+ title=title,
+ session=session) # use Image: for backwards compatibility
except PageMissingError:
xmlfiledesc = ''
logerror(
@ -1170,7 +1365,7 @@ def domain2prefix(config={}, session=None):
domain = config['index']
domain = domain.lower()
- domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
+ domain = re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '', domain)
domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
@ -1211,8 +1406,9 @@ def welcome():
message += ''
message += "\n"
message += "#" * 73
+ message += "\n"
+ message += "# Copyright (C) 2011-%d WikiTeam developers #\n" % (datetime.datetime.now().year)
message += """
- # Copyright (C) 2011-2014 WikiTeam #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
@ -1299,7 +1495,9 @@ def getParameters(params=[]):
action='store_true',
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages')
+ groupDownload.add_argument('--xmlrevisions', action='store_true',
+ help='download all revisions from an API generator. MediaWiki 1.27+ only.')
groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump")
groupDownload.add_argument(
@ -1319,6 +1517,10 @@ def getParameters(params=[]):
'--get-wiki-engine',
action='store_true',
help="returns the wiki engine")
+ groupMeta.add_argument(
+ '--failfast',
+ action='store_true',
+ help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.")
args = parser.parse_args()
# print args
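The two switches added above combine with the existing flags; a hedged example of such an invocation, launcher-style, with a placeholder API URL:

# Hypothetical run exercising the new --xmlrevisions and --failfast switches.
import subprocess

subprocess.call(
    './dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --xmlrevisions --failfast',
    shell=True)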
@ -1350,11 +1552,22 @@ def getParameters(params=[]):
print 'Using cookies from %s' % args.cookies
session = requests.Session()
try:
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
# Courtesy datashaman https://stackoverflow.com/a/35504626
__retries__ = Retry(total=5,
backoff_factor=2,
status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=__retries__))
session.mount('http://', HTTPAdapter(max_retries=__retries__))
except:
# Our urllib3/requests is too old
pass
session.cookies = cj
session.headers.update({'User-Agent': getUserAgent()})
if args.user and args.password:
session.auth = (args.user, args.password)
- # session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
# check URLs
for url in [args.api, args.index, args.wiki]:
@ -1392,6 +1605,7 @@ def getParameters(params=[]):
retry = 0
maxretries = args.retries
retrydelay = 20
+ check = None
while retry < maxretries:
try:
check = checkAPI(api=api, session=session)
@ -1427,15 +1641,20 @@ def getParameters(params=[]):
session=session):
print 'index.php is OK'
else:
- index = '/'.join(index.split('/')[:-1])
+ try:
+ index = '/'.join(index.split('/')[:-1])
+ except AttributeError:
+ index = None
if index and checkIndex(
index=index,
cookies=args.cookies,
session=session):
print 'index.php is OK'
else:
- print 'Error in index.php, please, provide a correct path to index.php'
- sys.exit(1)
+ print 'Error in index.php.'
+ if not args.xmlrevisions:
+ print 'Please, provide a correct path to index.php or use --xmlrevisions. Terminating.'
+ sys.exit(1)
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
@ -1483,10 +1702,12 @@ def getParameters(params=[]):
'curonly': args.curonly,
'date': datetime.datetime.now().strftime('%Y%m%d'),
'api': api,
+ 'failfast': args.failfast,
'index': index,
'images': args.images,
'logs': False,
'xml': args.xml,
+ 'xmlrevisions': args.xmlrevisions,
'namespaces': namespaces,
'exnamespaces': exnamespaces,
'path': args.path and os.path.normpath(args.path) or '',
@ -1520,18 +1741,23 @@ def checkAPI(api=None, session=None):
data={
'action': 'query',
'meta': 'siteinfo',
- 'format': 'json'}
+ 'format': 'json'},
+ timeout=30
)
- if r.url == api:
+ if r.status_code == 200:
break
- else:
- api = r.url
+ elif r.status_code < 400:
+ p = r.url
+ api = urlunparse([p.scheme, p.netloc, p.path, '', '', ''])
+ elif r.status_code > 400:
+ print "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
+ return False
if "MediaWiki API is not enabled for this site." in r.text:
return False
try:
result = getJSON(r)
index = None
- if result['query']:
+ if result:
try:
index = result['query']['general']['server'] + \
result['query']['general']['script']
@ -1548,7 +1774,7 @@ def checkAPI(api=None, session=None):
def checkIndex(index=None, cookies=None, session=None):
""" Checking index.php availability """
- r = session.post(url=index, data={'title': 'Special:Version'})
+ r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
raw = r.text
print 'Checking index.php...', index
# Workaround for issue 71
@ -1587,7 +1813,11 @@ def getJSON(request):
"""Strip Unicode BOM""" """Strip Unicode BOM"""
if request.text.startswith(u'\ufeff'): if request.text.startswith(u'\ufeff'):
request.encoding = 'utf-8-sig' request.encoding = 'utf-8-sig'
return request.json() try:
return request.json()
except:
# Maybe an older API version which did not return correct JSON
return {}
def fixBOM(request):
@ -1633,6 +1863,8 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
else:
print 'XML dump seems to be corrupted.'
reply = ''
+ if config['failfast']:
+ reply = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
if reply.lower() in ['yes', 'y']:
@ -1679,7 +1911,7 @@ def resumePreviousDump(config={}, other={}):
if lasttitle == '':
lasttitle=lasttitles.next()
except:
- pass # probably file does not exists
+ lasttitle = '' # probably file does not exists
if lasttitle == '--END--':
# titles list is complete
print 'Title list was completed in the previous session'
@ -1810,7 +2042,7 @@ def saveSpecialVersion(config={}, session=None):
else:
print 'Downloading Special:Version with extensions and other related info'
r = session.post(
- url=config['index'], data={'title': 'Special:Version'})
+ url=config['index'], params={'title': 'Special:Version'}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@ -1825,14 +2057,13 @@ def saveIndexPHP(config={}, session=None):
print 'index.html exists, do not overwrite'
else:
print 'Downloading index.php (Main Page) as index.html'
- r = session.post(url=config['index'], data={})
+ r = session.post(url=config['index'], params={}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
with open('%s/index.html' % (config['path']), 'w') as outfile:
outfile.write(raw.encode('utf-8'))
def saveSiteInfo(config={}, session=None):
""" Save a file with site info """
@ -1845,30 +2076,33 @@ def saveSiteInfo(config={}, session=None):
# MediaWiki 1.13+
r = session.post(
url=config['api'],
- data={
+ params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
'sinumberingroup': 1,
- 'format': 'json'})
+ 'format': 'json'},
+ timeout=10)
# MediaWiki 1.11-1.12
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
- data={
+ params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
- 'format': 'json'})
+ 'format': 'json'},
+ timeout=10)
# MediaWiki 1.8-1.10
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
- data={
+ params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces',
- 'format': 'json'})
+ 'format': 'json'},
+ timeout=10)
result = getJSON(r)
delay(config=config, session=session)
with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
@ -1879,10 +2113,14 @@ def avoidWikimediaProjects(config={}, other={}):
""" Skip Wikimedia projects and redirect to the dumps website """ """ Skip Wikimedia projects and redirect to the dumps website """
# notice about wikipedia dumps # notice about wikipedia dumps
url = ''
if config['api']:
url = url + config['api']
if config['index']:
url = url + config['index']
if re.findall( if re.findall(
r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
config['api'] + url):
config['index']):
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!' print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
print 'Download the dumps from http://dumps.wikimedia.org' print 'Download the dumps from http://dumps.wikimedia.org'
if not other['force']: if not other['force']:
@ -1895,9 +2133,9 @@ def getWikiEngine(url=''):
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
- r = session.post(url=url)
+ r = session.post(url=url, timeout=30)
if r.status_code == 405 or r.text == '':
- r = session.get(url=url)
+ r = session.get(url=url, timeout=120)
result = r.text
wikiengine = 'Unknown'
@ -1980,7 +2218,7 @@ def mwGetAPIAndIndex(url=''):
index = ''
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
- r = session.post(url=url)
+ r = session.post(url=url, timeout=120)
result = r.text
# API
@ -2042,6 +2280,8 @@ def main(params=[]):
while not other['resume'] and os.path.isdir(config['path']):
print '\nWarning!: "%s" path exists' % (config['path'])
reply = ''
+ if config['failfast']:
+ retry = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input(
'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %

@ -6,12 +6,12 @@
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
@ -30,11 +30,11 @@ def main():
if len(sys.argv) < 2:
print 'python script.py file-with-apis.txt'
sys.exit()
print 'Reading list of APIs from', sys.argv[1]
wikis = open(sys.argv[1], 'r').read().splitlines()
print '%d APIs found' % (len(wikis))
for wiki in wikis:
print "#"*73
print "# Downloading", wiki
@ -42,17 +42,15 @@ def main():
wiki = wiki.lower()
# Make the prefix in standard way; api and index must be defined, not important which is which
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
#check if compressed, in that case dump was finished previously
compressed = False
- for dirname, dirnames, filenames in os.walk('.'):
- if dirname == '.':
- for f in filenames:
- if f.startswith(prefix) and f.endswith('.7z'):
- compressed = True
- zipfilename = f
+ for f in os.listdir('.'):
+ if f.startswith(prefix) and f.endswith('.7z'):
+ compressed = True
+ zipfilename = f
break #stop searching, dot not explore subdirectories
if compressed:
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
# Get the archive's file list.
@ -67,18 +65,17 @@ def main():
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+." print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
# TODO: Find a way like grep -q below without doing a 7z l multiple times? # TODO: Find a way like grep -q below without doing a 7z l multiple times?
continue continue
#download #download
started = False #was this wiki download started before? then resume started = False #was this wiki download started before? then resume
wikidir = '' wikidir = ''
for dirname, dirnames, filenames in os.walk('.'): for f in os.listdir('.'):
if dirname == '.': # Does not find numbered wikidumps not verify directories
for d in dirnames: if f.startswith(prefix) and f.endswith('wikidump'):
if d.startswith(prefix): wikidir = f
wikidir = d started = True
started = True
break #stop searching, dot not explore subdirectories break #stop searching, dot not explore subdirectories
# time.sleep(60) # time.sleep(60)
# Uncomment what above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms # Uncomment what above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed; # such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
@ -90,15 +87,14 @@ def main():
subprocess.call('./dumpgenerator.py --api=%s --xml --images --delay=1' % wiki, shell=True)
started = True
#save wikidir now
- for dirname, dirnames, filenames in os.walk('.'):
- if dirname == '.':
- for d in dirnames:
- if d.startswith(prefix):
- wikidir = d
+ for f in os.listdir('.'):
+ # Does not find numbered wikidumps not verify directories
+ if f.startswith(prefix) and f.endswith('wikidump'):
+ wikidir = f
break #stop searching, dot not explore subdirectories
prefix = wikidir.split('-wikidump')[0]
finished = False
if started and wikidir and prefix:
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
@ -107,7 +103,7 @@ def main():
finished = True
# You can also issue this on your working directory to find all incomplete dumps:
# tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"
#compress
if finished:
time.sleep(1)

@ -3048,7 +3048,7 @@ http://vai.uibk.ac.at/dadp/doku.php
http://vak.ru/doku.php
http://val.bmstu.ru/dokuwiki/doku.php
http://valk.mave.jp/doku.php
- http://vancouver.hackspace.ca/doku.php
+ http://vanhack.ca/doku.php
http://vanets.vuse.vanderbilt.edu/dokuwiki/doku.php
http://vaslor.net/doku.php
http://vbraun.name/cms/doku.php
@ -4957,7 +4957,6 @@ http://www.minkhollow.ca/becker/doku.php
http://www.minkhollow.ca/mhf/doku.php
http://www.minkhollow.ca/MHF/doku.php
http://www.minkhollow.ca/Thesis07/doku.php
- http://www.mirkosertic.de/doku.php
http://www.mirmer.su/wiki/doku.php
http://www.mixshare.com/wiki/doku.php
http://www.mixxx.org/wiki/doku.php

File diff suppressed because it is too large

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
- # Copyright (C) 2014 WikiTeam developers
+ # Copyright (C) 2014-2017 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -26,9 +26,10 @@ def main():
url = 'https://meta.miraheze.org/wiki/Special:SiteMatrix'
r = requests.get(url, headers=headers)
raw = r.text
- m = re.findall(ur'<tr><td><a href="https://([^>]+?)/">[^<]+</a></td></tr>', raw)
+ m = re.findall(ur'<tr><td>(<del>)?<a href="https://([^>]+?)/">[^<]+</a>', raw)
+ m.sort()
for i in m:
- print 'https://' + i + '/w/api.php'
+ print 'https://' + i[1] + '/w/api.php'
if __name__ == '__main__':
main()

File diff suppressed because it is too large

@ -1,5 +1,5 @@
Wikifarm: https://meta.miraheze.org/wiki/Miraheze
- Last update: 2015-09-29
+ Last update: 2017-06-30
Details:

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
- # Copyright (C) 2014 WikiTeam developers
+ # Copyright (C) 2014-2017 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -27,6 +27,7 @@ def main():
r = requests.get(url, headers=headers)
raw = r.text
m = re.findall(ur'<li><a href=\'([^>]+?)/wiki/\'>', raw)
+ m.sort()
for i in m:
print i + '/w/api.php'

@ -2,8 +2,6 @@ http://24.neoseeker.com/w/api.php
http://aceattorney.neoseeker.com/w/api.php
http://advancewars.neoseeker.com/w/api.php
http://adventuretime.neoseeker.com/w/api.php
- http://alanwake.neoseeker.com/w/api.php
- http://alienbreed.neoseeker.com/w/api.php
http://animalcrossing.neoseeker.com/w/api.php
http://attackontitan.neoseeker.com/w/api.php
http://avatar.neoseeker.com/w/api.php
@ -17,9 +15,9 @@ http://boktai.neoseeker.com/w/api.php
http://bond.neoseeker.com/w/api.php
http://borderlands.neoseeker.com/w/api.php
http://boundbyflame.neoseeker.com/w/api.php
- http://bravely.neoseeker.com/w/api.php
http://breathoffire.neoseeker.com/w/api.php
http://brink.neoseeker.com/w/api.php
- http://bulletstorm.neoseeker.com/w/api.php
http://callofduty.neoseeker.com/w/api.php
http://castlecrashers.neoseeker.com/w/api.php
http://castlevania.neoseeker.com/w/api.php
@ -35,13 +33,10 @@ http://danganronpa.neoseeker.com/w/api.php
http://darksouls.neoseeker.com/w/api.php
http://deadisland.neoseeker.com/w/api.php
http://deadoralive.neoseeker.com/w/api.php
- http://deadspace.neoseeker.com/w/api.php
http://deathnote.neoseeker.com/w/api.php
http://demonssouls.neoseeker.com/w/api.php
http://destiny.neoseeker.com/w/api.php
- http://deusex.neoseeker.com/w/api.php
http://devilmaycry.neoseeker.com/w/api.php
- http://diablo3.neoseeker.com/w/api.php
http://digimon.neoseeker.com/w/api.php
http://disgaea.neoseeker.com/w/api.php
http://doctorwho.neoseeker.com/w/api.php
@ -57,21 +52,17 @@ http://dynastywarriors.neoseeker.com/w/api.php
http://elderscrolls.neoseeker.com/w/api.php
http://endlessocean.neoseeker.com/w/api.php
http://evangelion.neoseeker.com/w/api.php
- http://eveonline.neoseeker.com/w/api.php
http://fable.neoseeker.com/w/api.php
http://fairytail.neoseeker.com/w/api.php
- http://fallout4.neoseeker.com/w/api.php
http://fallout.neoseeker.com/w/api.php
+ http://fallout4.neoseeker.com/w/api.php
http://familyguy.neoseeker.com/w/api.php
- http://farcry.neoseeker.com/w/api.php
http://fatalfury.neoseeker.com/w/api.php
http://fifa.neoseeker.com/w/api.php
http://finalfantasy.neoseeker.com/w/api.php
http://fireemblem.neoseeker.com/w/api.php
http://footballmanager.neoseeker.com/w/api.php
http://formula1.neoseeker.com/w/api.php
- http://forza.neoseeker.com/w/api.php
- http://friends.neoseeker.com/w/api.php
http://fullmetalalchemist.neoseeker.com/w/api.php
http://futurama.neoseeker.com/w/api.php
http://fzero.neoseeker.com/w/api.php
@ -81,11 +72,9 @@ http://glee.neoseeker.com/w/api.php
http://godofwar.neoseeker.com/w/api.php
http://goldensun.neoseeker.com/w/api.php
http://granturismo.neoseeker.com/w/api.php
- http://greysanatomy.neoseeker.com/w/api.php
http://growlanser.neoseeker.com/w/api.php
- http://gta5.neoseeker.com/w/api.php
http://gta.neoseeker.com/w/api.php
- http://guildwars2.neoseeker.com/w/api.php
+ http://gta5.neoseeker.com/w/api.php
http://guildwars.neoseeker.com/w/api.php
http://guitarhero.neoseeker.com/w/api.php
http://gundam.neoseeker.com/w/api.php
@ -106,7 +95,6 @@ http://inuyasha.neoseeker.com/w/api.php
http://jakdaxter.neoseeker.com/w/api.php
http://kairosoft.neoseeker.com/w/api.php
http://kidicarus.neoseeker.com/w/api.php
- http://kingdomcome.neoseeker.com/w/api.php
http://kingdomhearts.neoseeker.com/w/api.php
http://kirby.neoseeker.com/w/api.php
http://knack.neoseeker.com/w/api.php
@ -115,8 +103,6 @@ http://layton.neoseeker.com/w/api.php
http://leagueoflegends.neoseeker.com/w/api.php
http://legendofdragoon.neoseeker.com/w/api.php
http://littlebigplanet.neoseeker.com/w/api.php
- http://lmamanager.neoseeker.com/w/api.php
- http://lordsofthefallen.neoseeker.com/w/api.php
http://lotr.neoseeker.com/w/api.php
http://mafia.neoseeker.com/w/api.php
http://magicalstarsign.neoseeker.com/w/api.php
@ -128,7 +114,6 @@ http://megaman.neoseeker.com/w/api.php
http://megamitensei.neoseeker.com/w/api.php
http://metalgear.neoseeker.com/w/api.php
http://metroid.neoseeker.com/w/api.php
- http://mightandmagic.neoseeker.com/w/api.php
http://minecraft.neoseeker.com/w/api.php
http://monsterhunter.neoseeker.com/w/api.php
http://mortalkombat.neoseeker.com/w/api.php
@ -140,7 +125,6 @@ http://ncis.neoseeker.com/w/api.php
http://needforspeed.neoseeker.com/w/api.php http://needforspeed.neoseeker.com/w/api.php
http://ninjagaiden.neoseeker.com/w/api.php http://ninjagaiden.neoseeker.com/w/api.php
http://ninokuni.neoseeker.com/w/api.php http://ninokuni.neoseeker.com/w/api.php
http://nintendogs.neoseeker.com/w/api.php
http://okami.neoseeker.com/w/api.php http://okami.neoseeker.com/w/api.php
http://onepiece.neoseeker.com/w/api.php http://onepiece.neoseeker.com/w/api.php
http://persona.neoseeker.com/w/api.php http://persona.neoseeker.com/w/api.php
@ -160,14 +144,12 @@ http://rockband.neoseeker.com/w/api.php
http://rpgmaker.neoseeker.com/w/api.php http://rpgmaker.neoseeker.com/w/api.php
http://runefactory.neoseeker.com/w/api.php http://runefactory.neoseeker.com/w/api.php
http://runescape.neoseeker.com/w/api.php http://runescape.neoseeker.com/w/api.php
http://runesofmagic.neoseeker.com/w/api.php
http://sandbox.neoseeker.com/w/api.php http://sandbox.neoseeker.com/w/api.php
http://scottpilgrim.neoseeker.com/w/api.php http://scottpilgrim.neoseeker.com/w/api.php
http://scrapmetal.neoseeker.com/w/api.php http://scrapmetal.neoseeker.com/w/api.php
http://scribblenauts.neoseeker.com/w/api.php http://scribblenauts.neoseeker.com/w/api.php
http://shadowofthecolossus.neoseeker.com/w/api.php http://shadowofthecolossus.neoseeker.com/w/api.php
http://shadowrunreturns.neoseeker.com/w/api.php http://shadowrunreturns.neoseeker.com/w/api.php
http://shank.neoseeker.com/w/api.php
http://shenmue.neoseeker.com/w/api.php http://shenmue.neoseeker.com/w/api.php
http://simpsons.neoseeker.com/w/api.php http://simpsons.neoseeker.com/w/api.php
http://skate.neoseeker.com/w/api.php http://skate.neoseeker.com/w/api.php
@ -183,7 +165,6 @@ http://southpark.neoseeker.com/w/api.php
http://spiderman.neoseeker.com/w/api.php http://spiderman.neoseeker.com/w/api.php
http://spongebob.neoseeker.com/w/api.php http://spongebob.neoseeker.com/w/api.php
http://spyro.neoseeker.com/w/api.php http://spyro.neoseeker.com/w/api.php
http://starbound.neoseeker.com/w/api.php
http://starcraft.neoseeker.com/w/api.php http://starcraft.neoseeker.com/w/api.php
http://starfox.neoseeker.com/w/api.php http://starfox.neoseeker.com/w/api.php
http://stargate.neoseeker.com/w/api.php http://stargate.neoseeker.com/w/api.php
@ -196,9 +177,7 @@ http://tales.neoseeker.com/w/api.php
http://tekken.neoseeker.com/w/api.php http://tekken.neoseeker.com/w/api.php
http://terraria.neoseeker.com/w/api.php http://terraria.neoseeker.com/w/api.php
http://thedarkness.neoseeker.com/w/api.php http://thedarkness.neoseeker.com/w/api.php
http://thedivision.neoseeker.com/w/api.php
http://thelastofus.neoseeker.com/w/api.php http://thelastofus.neoseeker.com/w/api.php
http://theorder.neoseeker.com/w/api.php
http://thesecretworld.neoseeker.com/w/api.php http://thesecretworld.neoseeker.com/w/api.php
http://thesims.neoseeker.com/w/api.php http://thesims.neoseeker.com/w/api.php
http://thewarriors.neoseeker.com/w/api.php http://thewarriors.neoseeker.com/w/api.php
@ -206,9 +185,7 @@ http://theworldendswithyou.neoseeker.com/w/api.php
http://thief.neoseeker.com/w/api.php http://thief.neoseeker.com/w/api.php
http://timesplitters.neoseeker.com/w/api.php http://timesplitters.neoseeker.com/w/api.php
http://tonyhawk.neoseeker.com/w/api.php http://tonyhawk.neoseeker.com/w/api.php
http://torchlight2.neoseeker.com/w/api.php
http://toriko.neoseeker.com/w/api.php http://toriko.neoseeker.com/w/api.php
http://transformers.neoseeker.com/w/api.php
http://twilight.neoseeker.com/w/api.php http://twilight.neoseeker.com/w/api.php
http://twistedmetal.neoseeker.com/w/api.php http://twistedmetal.neoseeker.com/w/api.php
http://uncharted.neoseeker.com/w/api.php http://uncharted.neoseeker.com/w/api.php
@ -217,12 +194,9 @@ http://vivapinata.neoseeker.com/w/api.php
http://wakfu.neoseeker.com/w/api.php http://wakfu.neoseeker.com/w/api.php
http://warcraft.neoseeker.com/w/api.php http://warcraft.neoseeker.com/w/api.php
http://warhammer.neoseeker.com/w/api.php http://warhammer.neoseeker.com/w/api.php
http://wasteland2.neoseeker.com/w/api.php
http://watchdogs.neoseeker.com/w/api.php http://watchdogs.neoseeker.com/w/api.php
http://whiteknightchronicles.neoseeker.com/w/api.php http://whiteknightchronicles.neoseeker.com/w/api.php
http://wikiguides.neoseeker.com/w/api.php http://wikiguides.neoseeker.com/w/api.php
http://witcher3.neoseeker.com/w/api.php
http://worldoftanks.neoseeker.com/w/api.php
http://wow.neoseeker.com/w/api.php http://wow.neoseeker.com/w/api.php
http://xenoblade.neoseeker.com/w/api.php http://xenoblade.neoseeker.com/w/api.php
http://yugioh.neoseeker.com/w/api.php http://yugioh.neoseeker.com/w/api.php

@ -1,5 +1,5 @@
Wikifarm: http://neowiki.neoseeker.com/wiki/Main_Page Wikifarm: http://neowiki.neoseeker.com/wiki/Main_Page
Last update: 2015-10-07 Last update: 2017-06-30
Details: Details:

File diff suppressed because it is too large

@ -23,7 +23,7 @@ import subprocess
import re import re
from wikitools import wiki, api from wikitools import wiki, api
def getlist(wikia, wkfrom = 1, wkto = 1000): def getlist(wikia, wkfrom = 1, wkto = 100):
params = {'action': 'query', 'list': 'wkdomains', 'wkactive': '1', 'wkfrom': wkfrom, 'wkto': wkto,} params = {'action': 'query', 'list': 'wkdomains', 'wkactive': '1', 'wkfrom': wkfrom, 'wkto': wkto,}
request = api.APIRequest(wikia, params) request = api.APIRequest(wikia, params)
return request.query()['query']['wkdomains'] return request.query()['query']['wkdomains']
@ -31,8 +31,9 @@ def getlist(wikia, wkfrom = 1, wkto = 1000):
def getall(): def getall():
wikia = wiki.Wiki('http://community.wikia.com/api.php') wikia = wiki.Wiki('http://community.wikia.com/api.php')
offset = 0 offset = 0
limit = 1000 limit = 100
domains = {} domains = {}
empty = 0
# This API module has no query continuation facility # This API module has no query continuation facility
print 'Getting list of active domains...' print 'Getting list of active domains...'
while True: while True:
@ -40,13 +41,21 @@ def getall():
if list: if list:
print offset print offset
domains = dict(domains.items() + list.items() ) domains = dict(domains.items() + list.items() )
offset += 1000 empty = 0
else: else:
empty += 1
offset += limit
if empty > 100:
# Hopefully we don't have more than 10k wikis deleted in a row
break break
return domains return domains
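Reading the interleaved hunk above: because the wkdomains API module has no query continuation, the loop now counts consecutive empty batches, keeps stepping the offset by limit, and only gives up after more than 100 empty responses in a row (about 10,000 missing wikis). A minimal sketch of the resulting loop shape, with a hypothetical fetch_batch(offset, limit) callable standing in for the actual wkdomains request:

def paginate(fetch_batch, limit=100, max_empty=100):
    # fetch_batch(offset, limit) returns a dict of domains (possibly empty).
    # Keep advancing the offset; stop only after max_empty empty batches in a row,
    # since the API offers no continuation token.
    domains = {}
    offset = 0
    empty = 0
    while True:
        batch = fetch_batch(offset, limit)
        if batch:
            domains.update(batch)
            empty = 0
        else:
            empty += 1
        offset += limit
        if empty > max_empty:
            break
    return domains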
def main(): def main():
domains = getall() domains = getall()
with open('wikia.com', 'w') as out:
out.write('\n'.join(str(domains[i]['domain']) for i in domains))
undumped = [] undumped = []
# Or we could iterate over each sublist while we get it? # Or we could iterate over each sublist while we get it?
for i in domains: for i in domains:
@ -55,21 +64,21 @@ def main():
print dbname print dbname
first = dbname[0] first = dbname[0]
# There are one-letter dbnames; the second letter is replaced by an underscore # There are one-letter dbnames; the second letter is replaced by an underscore
# http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.gz # http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.7z
try: try:
second = dbname[1] second = dbname[1]
except: except:
second = '_' second = '_'
base = 'http://s3.amazonaws.com/wikia_xml_dumps/' + first + '/' \ base = 'http://s3.amazonaws.com/wikia_xml_dumps/' + first + '/' \
+ first + second + '/' + dbname + first + second + '/' + dbname
full = base + '_pages_full.xml.gz' full = base + '_pages_full.xml.7z'
print full print full
current = base + '_pages_current.xml.gz' current = base + '_pages_current.xml.7z'
images = base + '_images.tar' images = base + '_images.tar'
try: try:
#subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full]) #subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
# Use this instead, and comment out the next try, to only list. # Use this instead, and comment out the next try, to only list.
subprocess.check_call(['curl', '-I', '--fail', full]) subprocess.call(['curl', '-I', '--fail', full])
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
# We added --fail for this https://superuser.com/a/854102/283120 # We added --fail for this https://superuser.com/a/854102/283120
if e.returncode == 22: if e.returncode == 22:
@ -81,7 +90,9 @@ def main():
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images]) # subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images])
#except: #except:
# pass # pass
print '\n'.join(str(dump) for dump in undumped)
with open('wikia.com-unarchived', 'w+') as out:
out.write('\n'.join(str(domain) for domain in undumped))
if __name__ == '__main__': if __name__ == '__main__':
main() main()

@ -0,0 +1,61 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.1')]
urllib.request.install_opener(opener)
wikis = []
for i in range(1, 100000):
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikidot.com' % (random.randint(100, 5000), random.randint(1000, 9999))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikidot\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikidot-duckduckgo.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https?://www\.', 'http://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()
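This script and the spider in the next file share the same harvesting step: pull candidate hosts out of the fetched HTML with a regex, normalize away a leading www., and de-duplicate before rewriting the list file. A stand-alone sketch of that step (function name and sample input are ours):

import re

def extract_wikidot_hosts(html, known=None):
    # Returns a sorted, de-duplicated list of wikidot.com site URLs found in html.
    wikis = list(known or [])
    for host in re.findall(r'://([^/]+?\.wikidot\.com)', html):
        url = re.sub(r'^https?://www\.', 'http://', 'http://' + host)
        if url not in wikis:
            wikis.append(url)
    return sorted(wikis)

# extract_wikidot_hosts('<a href="https://www.scp-wiki.wikidot.com/scp-173">')
# -> ['http://scp-wiki.wikidot.com']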

@ -0,0 +1,65 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
wikis = []
with open('wikidot-spider.txt', 'r') as f:
wikis = f.read().strip().splitlines()
for i in range(1, 1000000):
url = random.choice(wikis)
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikidot\.com)', html)
for wiki in m:
wiki = 'http://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikidot-spider.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https?://www\.', 'http://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(1,5)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

@ -0,0 +1,871 @@
http://007.wikidot.com
http://025002.wikidot.com
http://05centraal.wikidot.com
http://05command-ja.wikidot.com
http://05command.wikidot.com
http://05zentrale.wikidot.com
http://101.wikidot.com
http://16thfleet.wikidot.com
http://2012hoax.wikidot.com
http://56wrtg1150.wikidot.com
http://5edndwiki.wikidot.com
http://E-H-S.wikidot.com
http://F90in15Minutes.wikidot.com
http://Health-Matters.wikidot.com
http://Herbis.wikidot.com
http://INCL.wikidot.com
http://a4art.wikidot.com
http://abarrelfull.wikidot.com
http://academicwriting.wikidot.com
http://ad3.wikidot.com
http://admindevelopement.wikidot.com
http://advent-ro.wikidot.com
http://adventuresintherealms.wikidot.com
http://aepassociation.wikidot.com
http://aepsandbox.wikidot.com
http://afterthecomet-v2.wikidot.com
http://ageofascension.wikidot.com
http://ageofheroesmux.wikidot.com
http://airchairbuild.wikidot.com
http://albums-template.wikidot.com
http://alfamedia.wikidot.com
http://algadon.wikidot.com
http://alicebot.wikidot.com
http://alveslima-edu.wikidot.com
http://amawal.wikidot.com
http://amen.wikidot.com
http://amiii.wikidot.com
http://analyticscamp.wikidot.com
http://android0.wikidot.com
http://androidalchemy.wikidot.com
http://angarmegia-creadores.wikidot.com
http://angarmegia-publicaciones.wikidot.com
http://angarmegia-valores.wikidot.com
http://angarmegia.wikidot.com
http://angry-mage-games.wikidot.com
http://anime-planet.wikidot.com
http://apmoderneuro.wikidot.com
http://applebyandwyman.wikidot.com
http://aprendiendo.wikidot.com
http://aq-3d.wikidot.com
http://aqw-swf.wikidot.com
http://aqwwiki.wikidot.com
http://arcana.wikidot.com
http://arcblade.wikidot.com
http://artemachia.wikidot.com
http://artniyet.wikidot.com
http://asen.wikidot.com
http://asoh.wikidot.com
http://aspnet.wikidot.com
http://astrobhadauria.wikidot.com
http://astrobhadauria1414.wikidot.com
http://astroveda.wikidot.com
http://astroyogas.wikidot.com
http://asu-csf.wikidot.com
http://audioprodukcja.wikidot.com
http://avendar.wikidot.com
http://aviationknowledge.wikidot.com
http://avoidglow.wikidot.com
http://azentia.wikidot.com
http://babel-template.wikidot.com
http://backpharma.wikidot.com
http://backupstorage.wikidot.com
http://badwebcomics.wikidot.com
http://balchipedia.wikidot.com
http://barakus.wikidot.com
http://battlestargenesis.wikidot.com
http://bcp.wikidot.com
http://beadersresourceguide.wikidot.com
http://beargod.wikidot.com
http://benitachell-bowls-club.wikidot.com
http://bhg.wikidot.com
http://bibles.wikidot.com
http://bilbreyapwh.wikidot.com
http://biol-117.wikidot.com
http://biol252-biol319.wikidot.com
http://bioproject.wikidot.com
http://bisgmit.wikidot.com
http://blackbelt.wikidot.com
http://blackberrystorm.wikidot.com
http://blackmarches.wikidot.com
http://blank-template.wikidot.com
http://bleachitp.wikidot.com
http://blender0.wikidot.com
http://blender1.wikidot.com
http://blmodding.wikidot.com
http://blog-template.wikidot.com
http://blog.wikidot.com
http://blogs-template.wikidot.com
http://bloodborne.wikidot.com
http://bni-ine.wikidot.com
http://book-template.wikidot.com
http://booriley.wikidot.com
http://bootstrap-playground.wikidot.com
http://borderlands.wikidot.com
http://borradores-insurgencia-del-caos.wikidot.com
http://borradores-scp-es.wikidot.com
http://bozic-nation.wikidot.com
http://brmehta12.wikidot.com
http://brtff.wikidot.com
http://brydz.wikidot.com
http://bua581.wikidot.com
http://bua581beerworks.wikidot.com
http://bua581hallelibraryfinalproject.wikidot.com
http://bugs-template.wikidot.com
http://bugs.wikidot.com
http://burntlands.wikidot.com
http://bvs.wikidot.com
http://bx-community.wikidot.com
http://bzhlab.wikidot.com
http://c4fsharp.wikidot.com
http://calu.wikidot.com
http://campusconfidential.wikidot.com
http://cancer-control.wikidot.com
http://caosinsurgente.wikidot.com
http://carpenoctemstaff.wikidot.com
http://castleage.wikidot.com
http://caughtnotsleeping.wikidot.com
http://ccckmit.wikidot.com
http://ccpd.wikidot.com
http://cctest.wikidot.com
http://ccyms.wikidot.com
http://ccymsevangelization.wikidot.com
http://ccymsfoundations.wikidot.com
http://ccymsjustice.wikidot.com
http://ccymslounge.wikidot.com
http://ccymspastoral.wikidot.com
http://ccymspractices.wikidot.com
http://ccymsprayer.wikidot.com
http://ccymsprinciples.wikidot.com
http://ccymsskills.wikidot.com
http://ccymsstudents.wikidot.com
http://cdaworldhistory.wikidot.com
http://cellworld.wikidot.com
http://celtic-heroes.wikidot.com
http://cf-vanguard.wikidot.com
http://cgp.wikidot.com
http://chaoscomplexityineducation.wikidot.com
http://chat-template.wikidot.com
http://chatroom.wikidot.com
http://chavezbraintrust.wikidot.com
http://chcc.wikidot.com
http://chessvariants.wikidot.com
http://chimiex-bicaz.wikidot.com
http://ci-sandbox.wikidot.com
http://ci-visualdocuments.wikidot.com
http://ci-wiki.wikidot.com
http://circservices.wikidot.com
http://ciscotr.wikidot.com
http://cityofangels.wikidot.com
http://cleanias.wikidot.com
http://cmbeta.wikidot.com
http://coffeetime.wikidot.com
http://coffeetimex.wikidot.com
http://colbycriminaljustice.wikidot.com
http://columbiacity.wikidot.com
http://comando05.wikidot.com
http://comando05ptbr.wikidot.com
http://commandement-alpha.wikidot.com
http://commandemento5.wikidot.com
http://communicity.wikidot.com
http://communicity2010.wikidot.com
http://community-playground.wikidot.com
http://community.wikidot.com
http://computer0.wikidot.com
http://comux.wikidot.com
http://connorscampaigns.wikidot.com
http://connorscentral.wikidot.com
http://connorsgmnotes.wikidot.com
http://connorssettings.wikidot.com
http://consumerpsych2009.wikidot.com
http://convert.wikidot.com
http://copernicon.wikidot.com
http://corvidcollege.wikidot.com
http://corwyn.wikidot.com
http://cpp-wiki.wikidot.com
http://cquniversity.wikidot.com
http://crashfeverwikitw.wikidot.com
http://crimjobs2010-2011.wikidot.com
http://crm-iseg.wikidot.com
http://crm-template.wikidot.com
http://crosswindsgarou.wikidot.com
http://crypsis-net.wikidot.com
http://cs0.wikidot.com
http://cs1.wikidot.com
http://cs101c.wikidot.com
http://cs124project-2009.wikidot.com
http://csc180.wikidot.com
http://csi.wikidot.com
http://css-competition.wikidot.com
http://css-sandbox.wikidot.com
http://css.wikidot.com
http://css3.wikidot.com
http://css3themes.wikidot.com
http://cst133a.wikidot.com
http://ctwiki.wikidot.com
http://cuarteldelo5.wikidot.com
http://cubesat.wikidot.com
http://cuiltheory.wikidot.com
http://cunefa2.wikidot.com
http://cunefb2.wikidot.com
http://cunefc2.wikidot.com
http://cunefe2.wikidot.com
http://cyclods.wikidot.com
http://daeren.wikidot.com
http://darksouls.wikidot.com
http://darksouls2.wikidot.com
http://darksouls3.wikidot.com
http://dawnofanewage.wikidot.com
http://dcernst-teaching.wikidot.com
http://dcernst.wikidot.com
http://ddscat.wikidot.com
http://defa.wikidot.com
http://default-template.wikidot.com
http://defunct-elitequestworlds.wikidot.com
http://demonssouls.wikidot.com
http://denver.wikidot.com
http://desenvolvimentodejogos.wikidot.com
http://design-illustration.wikidot.com
http://destiny.wikidot.com
http://detailed-customer-management.wikidot.com
http://dndis.wikidot.com
http://docpl.wikidot.com
http://dokument-uz.wikidot.com
http://dotflow.wikidot.com
http://downsfolk.wikidot.com
http://dowodztwo.wikidot.com
http://dragon-trees.wikidot.com
http://dreamprogram.wikidot.com
http://dreamteam.wikidot.com
http://dresdenfiles.wikidot.com
http://ds09.wikidot.com
http://ds10.wikidot.com
http://ds2009a.wikidot.com
http://ds2010a.wikidot.com
http://dwd.wikidot.com
http://e-h-s.wikidot.com
http://earlychildhood.wikidot.com
http://eberronunlimited.wikidot.com
http://ecadmin.wikidot.com
http://ecctimeline.wikidot.com
http://echobazaar.wikidot.com
http://ecomind.wikidot.com
http://editor.wikidot.com
http://editora.wikidot.com
http://edmw.wikidot.com
http://educ400-401.wikidot.com
http://education-template.wikidot.com
http://efepereth.wikidot.com
http://eime.wikidot.com
http://eitriggcrafting.wikidot.com
http://ejs-in-india.wikidot.com
http://eldritch00.wikidot.com
http://elishapeterson.wikidot.com
http://elsirvale.wikidot.com
http://elunesjustice.wikidot.com
http://emchina2010.wikidot.com
http://enchantedbros.wikidot.com
http://encyclowiki.wikidot.com
http://energyclub.wikidot.com
http://energyclub4samvedna.wikidot.com
http://energyfuture.wikidot.com
http://eng1d1.wikidot.com
http://eng270.wikidot.com
http://epimreth.wikidot.com
http://epitome.wikidot.com
http://esperanto.wikidot.com
http://estudianteseconomiauned.wikidot.com
http://eventidemush.wikidot.com
http://everydaymagicalgirls.wikidot.com
http://evilhat.wikidot.com
http://execs.wikidot.com
http://exploringsciencewiki.wikidot.com
http://extrabees.wikidot.com
http://f650cs.wikidot.com
http://fairfieldproject.wikidot.com
http://falchionvalley.wikidot.com
http://fallout2online.wikidot.com
http://faq.wikidot.com
http://fearschemistry.wikidot.com
http://fed20.wikidot.com
http://feedback-template.wikidot.com
http://feedback.wikidot.com
http://fifa360.wikidot.com
http://fifabeapro360.wikidot.com
http://fightcorruption.wikidot.com
http://figmentregistry.wikidot.com
http://fillionempire.wikidot.com
http://finalfantasy14fr.wikidot.com
http://first-steps.wikidot.com
http://flyclear.wikidot.com
http://fmi.wikidot.com
http://fmiseria3.wikidot.com
http://fondationscp.wikidot.com
http://fondationscpsandbox.wikidot.com
http://fondazionescp.wikidot.com
http://fortean.wikidot.com
http://forum-template.wikidot.com
http://forum.wikidot.com
http://fourthwallgames.wikidot.com
http://fpt.wikidot.com
http://freevoddler.wikidot.com
http://fretsonfire.wikidot.com
http://futaba8fg.wikidot.com
http://gagetowngaming.wikidot.com
http://galacticunity.wikidot.com
http://game-maker.wikidot.com
http://gamedesign.wikidot.com
http://gamemaker.wikidot.com
http://gasbags.wikidot.com
http://gd28.wikidot.com
http://gdnd.wikidot.com
http://gdt2009.wikidot.com
http://gear-sandbox.wikidot.com
http://geararc.wikidot.com
http://genderbinary.wikidot.com
http://generals.wikidot.com
http://ginnungagap.wikidot.com
http://globalseminarhealth.wikidot.com
http://goddardtech.wikidot.com
http://gorszy.wikidot.com
http://greatestfilipino.wikidot.com
http://green-house.wikidot.com
http://guitarzero.wikidot.com
http://gurpswiki.wikidot.com
http://h205.wikidot.com
http://hackersderede.wikidot.com
http://halfmoonbay.wikidot.com
http://hammer-template.wikidot.com
http://handbook.wikidot.com
http://harvey-capital-lectures.wikidot.com
http://health-matters.wikidot.com
http://herbis.wikidot.com
http://heroes.wikidot.com
http://heroesmush.wikidot.com
http://heroesofalvena.wikidot.com
http://heroessincity.wikidot.com
http://hestia.wikidot.com
http://hfwiki.wikidot.com
http://hiddenprojectwiki.wikidot.com
http://himetop.wikidot.com
http://historynewmedia.wikidot.com
http://hkcentral.wikidot.com
http://hogwarts2092.wikidot.com
http://hopkinswhpg.wikidot.com
http://housegames.wikidot.com
http://hp-intothefire.wikidot.com
http://hrpg.wikidot.com
http://hscwizards.wikidot.com
http://hswiki.wikidot.com
http://html50.wikidot.com
http://iaac-readings.wikidot.com
http://iatkos.wikidot.com
http://ibhistory.wikidot.com
http://ibi-apedia.wikidot.com
http://ibiz.wikidot.com
http://ibmathstuff.wikidot.com
http://ibphysicsstuff.wikidot.com
http://ibstuffqa.wikidot.com
http://iceal.wikidot.com
http://idrumaaps.wikidot.com
http://ifs.wikidot.com
http://igen.wikidot.com
http://igor.wikidot.com
http://imocamp.wikidot.com
http://incl.wikidot.com
http://inctr-news.wikidot.com
http://inctr-palliative-care-handbook.wikidot.com
http://inctr.wikidot.com
http://indexhibit.wikidot.com
http://insomniacramblings.wikidot.com
http://installer.wikidot.com
http://insurrection-du-chaos-sandbox.wikidot.com
http://insurrection-du-chaos.wikidot.com
http://inter-irc.wikidot.com
http://internationalbatesoninstitute.wikidot.com
http://internetior.wikidot.com
http://involo.wikidot.com
http://ipr10.wikidot.com
http://ipr11.wikidot.com
http://ipr12.wikidot.com
http://iracing.wikidot.com
http://irc.wikidot.com
http://irongiant.wikidot.com
http://irunath.wikidot.com
http://is2216.wikidot.com
http://ischool.wikidot.com
http://isocentre.wikidot.com
http://issuetracker-template.wikidot.com
http://istar.wikidot.com
http://istb-winter2010.wikidot.com
http://istep-sandbox.wikidot.com
http://itb322uap.wikidot.com
http://ivm.wikidot.com
http://jakilinux.wikidot.com
http://java.wikidot.com
http://jayashree.wikidot.com
http://jccict.wikidot.com
http://johnmerritt.wikidot.com
http://join.wikidot.com
http://jquery-easyui.wikidot.com
http://jslibrary.wikidot.com
http://jsukfpsd.wikidot.com
http://kalgati.wikidot.com
http://kannadanudi.wikidot.com
http://karma-lab.wikidot.com
http://kdiprivateequity.wikidot.com
http://keramik.wikidot.com
http://kf59.wikidot.com
http://kfmapdb.wikidot.com
http://khaidoan.wikidot.com
http://kharon.wikidot.com
http://kindiy.wikidot.com
http://kingsway.wikidot.com
http://kingswayeap.wikidot.com
http://kingswayelem.wikidot.com
http://kingswayielts.wikidot.com
http://kingswayint.wikidot.com
http://kingswaypreint.wikidot.com
http://kingswayupper.wikidot.com
http://klps.wikidot.com
http://kmhouse.wikidot.com
http://kmk.wikidot.com
http://knightswrite.wikidot.com
http://kodo.wikidot.com
http://koty.wikidot.com
http://ksemoudania.wikidot.com
http://ladyhood66.wikidot.com
http://lafundacionscp.wikidot.com
http://languagearts8.wikidot.com
http://lapidaria.wikidot.com
http://lasthaiku.wikidot.com
http://latindictionary.wikidot.com
http://latmari.wikidot.com
http://leplouc.wikidot.com
http://lepszy.wikidot.com
http://level1wiki.wikidot.com
http://libevents.wikidot.com
http://liblivadia.wikidot.com
http://librarylab.wikidot.com
http://lightworks.wikidot.com
http://linux0.wikidot.com
http://livesupport.wikidot.com
http://lmtoelf.wikidot.com
http://loosepages.wikidot.com
http://ltt.wikidot.com
http://lulu.wikidot.com
http://m5snapoli.wikidot.com
http://ma4140.wikidot.com
http://machines-history.wikidot.com
http://machinima138.wikidot.com
http://mactutorial.wikidot.com
http://maegica.wikidot.com
http://magiamesterei.wikidot.com
http://mainframes.wikidot.com
http://majjhima.wikidot.com
http://makeyourbot.wikidot.com
http://malkavian.wikidot.com
http://managerzonemexico.wikidot.com
http://maratona.wikidot.com
http://marblehornets.wikidot.com
http://margopedia.wikidot.com
http://marketplace-template.wikidot.com
http://marvelreborn.wikidot.com
http://marvelrevolution.wikidot.com
http://masonic.wikidot.com
http://math453fall2008.wikidot.com
http://mathaerobics4samvedna.wikidot.com
http://mathonline.wikidot.com
http://mathroughguides.wikidot.com
http://mbitcoin.wikidot.com
http://mc-21.wikidot.com
http://mcdt25e.wikidot.com
http://me1065.wikidot.com
http://measurementcamp.wikidot.com
http://media.wikidot.com
http://miedzymorze.wikidot.com
http://minahaplo.wikidot.com
http://mis213-2.wikidot.com
http://mk2k.wikidot.com
http://mkworld.wikidot.com
http://mnprek-3.wikidot.com
http://monacobayweyr.wikidot.com
http://monobook-template.wikidot.com
http://monobook.wikidot.com
http://monodot-template.wikidot.com
http://morningside-genetics.wikidot.com
http://morningsidemicro.wikidot.com
http://morphopedics.wikidot.com
http://mpm.wikidot.com
http://mukesh381.wikidot.com
http://multiverse-crisis.wikidot.com
http://musicgames.wikidot.com
http://my-pride.wikidot.com
http://mybookworld.wikidot.com
http://myslimchatroom.wikidot.com
http://myvineyard.wikidot.com
http://nanorodsa.wikidot.com
http://nanorodthermo.wikidot.com
http://narutoitp.wikidot.com
http://narutomushrivalry.wikidot.com
http://nauticoamager.wikidot.com
http://neo-dimension.wikidot.com
http://neosteam.wikidot.com
http://neozone.wikidot.com
http://newapprequirements.wikidot.com
http://news.wikidot.com
http://nightskysymbology.wikidot.com
http://nimin.wikidot.com
http://ninjaproxy.wikidot.com
http://nirn.wikidot.com
http://nnhs-science-restrictedaccess.wikidot.com
http://nnhs-science.wikidot.com
http://noblebeastwars.wikidot.com
http://nomyslamps.wikidot.com
http://norron.wikidot.com
http://notebook-template.wikidot.com
http://notebooks.wikidot.com
http://nre509.wikidot.com
http://nsb.wikidot.com
http://ntumed96.wikidot.com
http://nucularelectronics.wikidot.com
http://o5command-int.wikidot.com
http://o5command-th.wikidot.com
http://oblivionshard.wikidot.com
http://offtopicarium.wikidot.com
http://old-template.wikidot.com
http://oneeleventwentyten.wikidot.com
http://opend6.wikidot.com
http://opensource-template.wikidot.com
http://opensuse.wikidot.com
http://oppt-sa.wikidot.com
http://oregonamhi.wikidot.com
http://osx86.wikidot.com
http://oversoulgame.wikidot.com
http://ozradonc.wikidot.com
http://packages.wikidot.com
http://pagi.wikidot.com
http://pandora-saga.wikidot.com
http://papercraft.wikidot.com
http://paperworks.wikidot.com
http://paradiserpg.wikidot.com
http://paradoxhaze.wikidot.com
http://paralelo.wikidot.com
http://parented.wikidot.com
http://passatb5.wikidot.com
http://pathtogolarion.wikidot.com
http://patriot-box-office.wikidot.com
http://patterns.wikidot.com
http://pbbg.wikidot.com
http://pcg.wikidot.com
http://pcif.wikidot.com
http://pedhemoncreview.wikidot.com
http://perchelinux.wikidot.com
http://pernworld.wikidot.com
http://personal-template.wikidot.com
http://petition-template.wikidot.com
http://pfcuq.wikidot.com
http://pfseconddarkness.wikidot.com
http://phikappatau.wikidot.com
http://philosophia.wikidot.com
http://philosophiesoflife.wikidot.com
http://photo-gallery-template.wikidot.com
http://phylo.wikidot.com
http://pl.wikidot.com
http://playstation3hacksandmods.wikidot.com
http://pofomultiquiz.wikidot.com
http://pogon.wikidot.com
http://polls.wikidot.com
http://porphyrarpg.wikidot.com
http://porsche.wikidot.com
http://pottersarmy.wikidot.com
http://predev.wikidot.com
http://private-template.wikidot.com
http://processexcel.wikidot.com
http://professorallred.wikidot.com
http://profiles.wikidot.com
http://project-template.wikidot.com
http://projects.wikidot.com
http://ps3indexhelp.wikidot.com
http://psi-ppwg.wikidot.com
http://psms.wikidot.com
http://psrboregon.wikidot.com
http://psyc101.wikidot.com
http://psychjobsearch.wikidot.com
http://psychotronicsdivision.wikidot.com
http://pt851.wikidot.com
http://puddincupcss.wikidot.com
http://puppet.wikidot.com
http://pw7890o.wikidot.com
http://pylint-messages.wikidot.com
http://qttabbar.wikidot.com
http://quiat.wikidot.com
http://r.wikidot.com
http://radonc.wikidot.com
http://railgunitp.wikidot.com
http://ravenmarches.wikidot.com
http://realestate-template.wikidot.com
http://redirect-template.wikidot.com
http://redsite.wikidot.com
http://renegadesofpw.wikidot.com
http://reshme.wikidot.com
http://reskitchen.wikidot.com
http://retrolegends.wikidot.com
http://retrowiki.wikidot.com
http://reykjavikmanifesto.wikidot.com
http://rhetoricalgoddess.wikidot.com
http://rmitvnim2007b.wikidot.com
http://roadmap.wikidot.com
http://roboticsclubucla.wikidot.com
http://roboticspedia.wikidot.com
http://rock-xproject.wikidot.com
http://rtd1261.wikidot.com
http://rxwiki.wikidot.com
http://s7s.wikidot.com
http://sacwwiki.wikidot.com
http://salamander724.wikidot.com
http://saludintegral.wikidot.com
http://samvedna.wikidot.com
http://sandboxscpfr.wikidot.com
http://sasana.wikidot.com
http://sasi555.wikidot.com
http://savagetidewithfiretrolls.wikidot.com
http://scala.wikidot.com
http://schoolsteachersparents.wikidot.com
http://schrijven.wikidot.com
http://scienceonlinelondon.wikidot.com
http://scion-mmp.wikidot.com
http://scp-et.wikidot.com
http://scp-field-work.wikidot.com
http://scp-foundation-origins.wikidot.com
http://scp-he.wikidot.com
http://scp-hu.wikidot.com
http://scp-int-sandbox.wikidot.com
http://scp-int.wikidot.com
http://scp-international.wikidot.com
http://scp-jp-admin.wikidot.com
http://scp-jp-archive.wikidot.com
http://scp-jp-sandbox2.wikidot.com
http://scp-jp-sandbox3.wikidot.com
http://scp-jp.wikidot.com
http://scp-ko-15c.wikidot.com
http://scp-kr.wikidot.com
http://scp-la.wikidot.com
http://scp-nd.wikidot.com
http://scp-nl.wikidot.com
http://scp-pl-sandbox.wikidot.com
http://scp-pl.wikidot.com
http://scp-pt-br.wikidot.com
http://scp-pt.wikidot.com
http://scp-ru.wikidot.com
http://scp-sandbox-3.wikidot.com
http://scp-sandbox-la.wikidot.com
http://scp-spqr.wikidot.com
http://scp-template.wikidot.com
http://scp-th-sandbox.wikidot.com
http://scp-th.wikidot.com
http://scp-tw.wikidot.com
http://scp-ukrainian.wikidot.com
http://scp-un.wikidot.com
http://scp-vn.wikidot.com
http://scp-wiki-cn.wikidot.com
http://scp-wiki-de.wikidot.com
http://scp-wiki.wikidot.com
http://scpalex-fh.wikidot.com
http://scpclassic.wikidot.com
http://scpexplained.wikidot.com
http://scpjp-fansite.wikidot.com
http://scpkoreahq.wikidot.com
http://scpminecraft.wikidot.com
http://scpsandbox-jp.wikidot.com
http://scpsandbox-pl.wikidot.com
http://scpsandbox-ua.wikidot.com
http://scpsandbox2.wikidot.com
http://scpsandboxbr.wikidot.com
http://scpsandboxcn.wikidot.com
http://scpsandboxde.wikidot.com
http://scpsandboxit.wikidot.com
http://scpsandboxnl.wikidot.com
http://scpvakfi.wikidot.com
http://scpvakfisandbox.wikidot.com
http://scpvnsandbox.wikidot.com
http://scratch4samvedna.wikidot.com
http://serpents-hand.wikidot.com
http://sfi.wikidot.com
http://sfugamedev.wikidot.com
http://shadow4e.wikidot.com
http://sharecokecodes.wikidot.com
http://shop.wikidot.com
http://sicurezzapubblica.wikidot.com
http://sidowegraty.wikidot.com
http://signaturbogen.wikidot.com
http://siluria.wikidot.com
http://simtrackipedia.wikidot.com
http://sistdig.wikidot.com
http://siteclone.wikidot.com
http://sky852751.wikidot.com
http://skyangel.wikidot.com
http://slaythespire.wikidot.com
http://sliscomps.wikidot.com
http://slownik-geologiczny.wikidot.com
http://small-steps.wikidot.com
http://smofficer.wikidot.com
http://smsalgebra.wikidot.com
http://sniktbub.wikidot.com
http://snippets.wikidot.com
http://snow-template.wikidot.com
http://snowleopard.wikidot.com
http://sociatecture.wikidot.com
http://sociatectureblog.wikidot.com
http://socjobs.wikidot.com
http://socjobs2011.wikidot.com
http://soctech.wikidot.com
http://softwarecraftsmanship.wikidot.com
http://solariapedia.wikidot.com
http://solodarydar.wikidot.com
http://solpadeinehelp.wikidot.com
http://sortibrige.wikidot.com
http://soulslore.wikidot.com
http://soymilkls.wikidot.com
http://sp1.wikidot.com
http://spambotdeathwall.wikidot.com
http://sparks.wikidot.com
http://sped.wikidot.com
http://splinterverse.wikidot.com
http://spolecznosc.wikidot.com
http://srm.wikidot.com
http://st-phelpers.wikidot.com
http://stallmanism.wikidot.com
http://standard-template.wikidot.com
http://starwarsmadness.wikidot.com
http://static.wikidot.com
http://steelandstone.wikidot.com
http://storychip.wikidot.com
http://string-theory.wikidot.com
http://studiocomments.wikidot.com
http://studiolynn.wikidot.com
http://suffadv.wikidot.com
http://summer350.wikidot.com
http://summerisle.wikidot.com
http://sunnybrook-academy.wikidot.com
http://superjet.wikidot.com
http://surreal64ce.wikidot.com
http://sw-gis.wikidot.com
http://swietomuzyki.wikidot.com
http://swwotc.wikidot.com
http://talesofhonor.wikidot.com
http://talkingpadproject.wikidot.com
http://task-management.wikidot.com
http://tasker.wikidot.com
http://tauren.wikidot.com
http://tech-racingcars.wikidot.com
http://techblog-template.wikidot.com
http://techcomm.wikidot.com
http://ten-sb.wikidot.com
http://terrasdeportugal.wikidot.com
http://tex.wikidot.com
http://textanalytics.wikidot.com
http://the-nexus.wikidot.com
http://theanarchstate.wikidot.com
http://theblightedworld.wikidot.com
http://thecollaboratory.wikidot.com
http://thegamerdome.wikidot.com
http://thekingkillerchronicle.wikidot.com
http://thelaststory.wikidot.com
http://themes.wikidot.com
http://thep-serc.wikidot.com
http://therafim.wikidot.com
http://therafimrpg.wikidot.com
http://thesimsonline.wikidot.com
http://theskyremains.wikidot.com
http://theunforgotten.wikidot.com
http://thewake.wikidot.com
http://theweird.wikidot.com
http://theweirdwest.wikidot.com
http://ti-iseg-t12.wikidot.com
http://ti-iseg-t19.wikidot.com
http://tibasicdev.wikidot.com
http://timidgirls.wikidot.com
http://tlug.wikidot.com
http://tlumaczenia.wikidot.com
http://tmduc.wikidot.com
http://tradewithsaint.wikidot.com
http://translate.wikidot.com
http://translators-forum.wikidot.com
http://trb-mux.wikidot.com
http://triathematician.wikidot.com
http://trueblood-dallas.wikidot.com
http://try.wikidot.com
http://ttu-dom.wikidot.com
http://tyf.wikidot.com
http://typesets.wikidot.com
http://ubmedicinefaqs.wikidot.com
http://ucsdgrads.wikidot.com
http://ukcw.wikidot.com
http://ultimatemutantsofgagetown.wikidot.com
http://umassenglishgrad.wikidot.com
http://uml.wikidot.com
http://underworldlarp.wikidot.com
http://uniofbeds.wikidot.com
http://urbanmobile.wikidot.com
http://uscta.wikidot.com
http://user-gemeinschaft.wikidot.com
http://usma387.wikidot.com
http://valeofcallus.wikidot.com
http://veritasbatheo.wikidot.com
http://videoart.wikidot.com
http://viotikoskosmos.wikidot.com
http://virtualwargamer.wikidot.com
http://viscomclass.wikidot.com
http://visual-records.wikidot.com
http://vitalusers.wikidot.com
http://vocaro.wikidot.com
http://vs-tcg.wikidot.com
http://vtls-vital.wikidot.com
http://vusb.wikidot.com
http://vwinterop.wikidot.com
http://vyprmedia.wikidot.com
http://w24.wikidot.com
http://wanderers-library-ko.wikidot.com
http://wanderers-library.wikidot.com
http://wanderers-sandbox.wikidot.com
http://warsztatywww.wikidot.com
http://web0.wikidot.com
http://webcomicauthority.wikidot.com
http://wfh.wikidot.com
http://whanethewhip.wikidot.com
http://whatever.wikidot.com
http://wherearethejoneses.wikidot.com
http://wikidot.com
http://wikiedresearch.wikidot.com
http://wikiethica.wikidot.com
http://wikim5s.wikidot.com
http://wikinorm.wikidot.com
http://wikiofscience.wikidot.com
http://wikirhye.wikidot.com
http://wikirmaphil.wikidot.com
http://wikistoriaenciclopedia.wikidot.com
http://wikitipsgr.wikidot.com
http://windycity.wikidot.com
http://wiwimush.wikidot.com
http://world.wikidot.com
http://wow-arrakis.wikidot.com
http://wpts.wikidot.com
http://wqa.wikidot.com
http://writ-111-office-hour-sign-up.wikidot.com
http://writingoneeleven.wikidot.com
http://wrtg1150.wikidot.com
http://wtg.wikidot.com
http://www-old.wikidot.com
http://wychwood.wikidot.com
http://xanadu.wikidot.com
http://y31.wikidot.com
http://ye-olde-music-industrapedia.wikidot.com
http://yo801106.wikidot.com
http://yyp.wikidot.com
http://zeroshell.wikidot.com
http://zmk.wikidot.com
http://zodiac-ffrpg.wikidot.com
http://zodiac-monster-manual.wikidot.com
http://zombiecafe.wikidot.com
http://zorya.wikidot.com

@ -0,0 +1,214 @@
arte
cine
lengua
literatura
matematicas
ingles
frances
aleman
ruso
idiomas
geografia
historia
secundaria
bachillerato
examen
examenes
profesor
educacion
profesores
historias
extremadura
andalucia
iberia
oceano
cultura
periodico
television
radio
italiano
polaco
chino
japones
coreano
musica
mozart
beethoven
asimov
newton
kilogramo
teoria
fisica
deporte
cancion
futbol
astronomia
telescopio
cuaderno
libro
texto
pizarra
descartes
galileo
fosiles
paisaje
fosil
paisajes
mar
oceano
espacio
meteorologia
nubes
religion
bandera
lengua
politica
biologia
quimica
medicina
tecnologia
diagrama
mapa
mapas
dibujos
pronunciacion
arquitectura
compositor
pintor
pintura
escultura
museo
biblioteca
museos
bibliotecas
enciclopedia
diccionario
filosofia
filosofos
feminismo
sociologia
leyes
coche
barco
avion
transporte
teatro
europa
america
africa
asia
oceania
australia
atlantico
mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -0,0 +1,86 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
words = []
with open('words.txt', 'r') as f:
words = f.read().strip().splitlines()
random.shuffle(words)
print('Loaded %d words from file' % (len(words)))
#words = words + ['%d' % (i) for i in range(1900, 1980, 10)]
wikis = []
with open('wikispaces-duckduckgo.txt', 'r') as f:
wikis = f.read().strip().splitlines()
wikis.sort()
print('Loaded %d wikis from file' % (len(wikis)))
for i in range(1, 100):
random.shuffle(words)
for word in words:
print('Word', word)
word_ = re.sub(' ', '+', word)
url = ''
r = random.randint(0, 10)
if r == 0:
url = 'https://duckduckgo.com/html/?q=%s%%20site:wikispaces.com' % (word_)
elif r == 1:
url = 'https://duckduckgo.com/html/?q=%s%%20wikispaces.com' % (word_)
elif r == 2:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
elif r == 3:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (random.randint(100, 3000), word_)
else:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
sys.exit()
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikispaces-duckduckgo.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https://www\.', 'https://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

File diff suppressed because it is too large

@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv import csv
import random
import re import re
import time import time
import urllib2 import urllib2
@ -88,6 +89,8 @@ def getWikis(user):
return {} return {}
def main(): def main():
sleep = 0.1
rand = 10
users = loadUsers() users = loadUsers()
wikis = loadWikis() wikis = loadWikis()
@ -112,11 +115,16 @@ def main():
c += 1 c += 1
print 'Found %s new users' % (c) print 'Found %s new users' % (c)
if c > 0: if c > 0:
saveUsers(users) if random.randint(0,rand) == 0:
users = loadUsers() saveUsers(users)
saveWikis(wikis) users = loadUsers()
time.sleep(1) if random.randint(0,rand) == 0:
saveWikis(wikis)
time.sleep(sleep)
saveWikis(wikis)
wikis = loadWikis() wikis = loadWikis()
saveUsers(users)
users = loadUsers()
# find more wikis # find more wikis
print 'Scanning users for more wikis' print 'Scanning users for more wikis'
@ -133,10 +141,15 @@ def main():
c += 1 c += 1
print 'Found %s new wikis' % (c) print 'Found %s new wikis' % (c)
if c > 0: if c > 0:
saveWikis(wikis) if random.randint(0,rand) == 0:
wikis = loadWikis() saveWikis(wikis)
saveUsers(users) wikis = loadWikis()
time.sleep(1) if random.randint(0,rand) == 0:
saveUsers(users)
time.sleep(sleep)
saveWikis(wikis)
wikis = loadWikis()
saveUsers(users)
users = loadUsers() users = loadUsers()
print '\nSummary:' print '\nSummary:'
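The hunks above swap unconditional saves for randomized checkpointing: with rand = 10, each batch persists the users/wikis lists only about one time in eleven, and a final unconditional save runs after the loop so nothing is lost. A generic sketch of the pattern (the save callback is hypothetical):

import random

def maybe_checkpoint(save, rand=10):
    # Persist roughly once every rand + 1 calls; callers should still invoke
    # save() unconditionally once their loop has finished.
    if random.randint(0, rand) == 0:
        save()
        return True
    return False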

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -0,0 +1,214 @@
arte
cine
lengua
literatura
matematicas
ingles
frances
aleman
ruso
idiomas
geografia
historia
secundaria
bachillerato
examen
examenes
profesor
educacion
profesores
historias
extremadura
andalucia
iberia
oceano
cultura
periodico
television
radio
italiano
polaco
chino
japones
coreano
musica
mozart
beethoven
asimov
newton
kilogramo
teoria
fisica
deporte
cancion
futbol
astronomia
telescopio
cuaderno
libro
texto
pizarra
descartes
galileo
fosiles
paisaje
fosil
paisajes
mar
oceano
espacio
meteorologia
nubes
religion
bandera
lengua
politica
biologia
quimica
medicina
tecnologia
diagrama
mapa
mapas
dibujos
pronunciacion
arquitectura
compositor
pintor
pintura
escultura
museo
biblioteca
museos
bibliotecas
enciclopedia
diccionario
filosofia
filosofos
feminismo
sociologia
leyes
coche
barco
avion
transporte
teatro
europa
america
africa
asia
oceania
australia
atlantico
mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria

@ -62,14 +62,14 @@ class TestDumpgenerator(unittest.TestCase):
tests = [ tests = [
# Alone wikis # Alone wikis
#['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'], #['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'], ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'], #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],
# Editthis wikifarm # Editthis wikifarm
# It has a page view limit # It has a page view limit
# Gamepedia wikifarm # Gamepedia wikifarm
['http://dawngate.gamepedia.com/index.php', 'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'], #['http://dawngate.gamepedia.com/index.php', 'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'],
# Neoseeker wikifarm # Neoseeker wikifarm
#['http://digimon.neoseeker.com/w/index.php', 'http://digimon.neoseeker.com/w/api.php', u'Ogremon card.png'], #['http://digimon.neoseeker.com/w/index.php', 'http://digimon.neoseeker.com/w/api.php', u'Ogremon card.png'],
@ -78,13 +78,13 @@ class TestDumpgenerator(unittest.TestCase):
#['http://mc.orain.org/w/index.php', 'http://mc.orain.org/w/api.php', u'Mojang logo.svg'], #['http://mc.orain.org/w/index.php', 'http://mc.orain.org/w/api.php', u'Mojang logo.svg'],
# Referata wikifarm # Referata wikifarm
['http://wikipapers.referata.com/w/index.php', 'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'], #['http://wikipapers.referata.com/w/index.php', 'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'],
# ShoutWiki wikifarm # ShoutWiki wikifarm
['http://commandos.shoutwiki.com/w/index.php', 'http://commandos.shoutwiki.com/w/api.php', u'Night of the Wolves loading.png'], #['http://commandos.shoutwiki.com/w/index.php', 'http://commandos.shoutwiki.com/w/api.php', u'Night of the Wolves loading.png'],
# Wiki-site wikifarm # Wiki-site wikifarm
['http://minlingo.wiki-site.com/index.php', 'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'], #['http://minlingo.wiki-site.com/index.php', 'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'],
# Wikkii wikifarm # Wikkii wikifarm
# It seems offline # It seems offline
@ -146,8 +146,8 @@ class TestDumpgenerator(unittest.TestCase):
print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73 print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
tests = [ tests = [
# Alone wikis # Alone wikis
['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'], ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'April Fools\' Day'],
['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'], #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],
# Test old allpages API behaviour # Test old allpages API behaviour
#['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'], #['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'],
@ -261,7 +261,11 @@ class TestDumpgenerator(unittest.TestCase):
] ]
for wiki, engine in tests: for wiki, engine in tests:
print 'Testing', wiki print 'Testing', wiki
guess_engine = getWikiEngine(wiki) try:
guess_engine = getWikiEngine(wiki)
except ConnectionError:
print "%s failed to load, skipping..." % (wiki)
continue
print 'Got: %s, expected: %s' % (guess_engine, engine) print 'Got: %s, expected: %s' % (guess_engine, engine)
self.assertEqual(guess_engine, engine) self.assertEqual(guess_engine, engine)
@ -269,14 +273,14 @@ class TestDumpgenerator(unittest.TestCase):
print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73 print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73
tests = [ tests = [
# Alone wikis # Alone wikis
['http://archiveteam.org', 'http://archiveteam.org/api.php', 'http://archiveteam.org/index.php'], ['https://www.archiveteam.org', 'https://www.archiveteam.org/api.php', 'https://www.archiveteam.org/index.php'],
['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'], #['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],
# Editthis wikifarm # Editthis wikifarm
# It has a page view limit # It has a page view limit
# Gamepedia wikifarm # Gamepedia wikifarm
['http://dawngate.gamepedia.com', 'http://dawngate.gamepedia.com/api.php', 'http://dawngate.gamepedia.com/index.php'], #['http://dawngate.gamepedia.com', 'http://dawngate.gamepedia.com/api.php', 'http://dawngate.gamepedia.com/index.php'],
# Neoseeker wikifarm # Neoseeker wikifarm
#['http://digimon.neoseeker.com', 'http://digimon.neoseeker.com/w/api.php', 'http://digimon.neoseeker.com/w/index.php'], #['http://digimon.neoseeker.com', 'http://digimon.neoseeker.com/w/api.php', 'http://digimon.neoseeker.com/w/index.php'],
@ -288,7 +292,7 @@ class TestDumpgenerator(unittest.TestCase):
# ['http://wikipapers.referata.com', 'http://wikipapers.referata.com/w/api.php', 'http://wikipapers.referata.com/w/index.php'], # ['http://wikipapers.referata.com', 'http://wikipapers.referata.com/w/api.php', 'http://wikipapers.referata.com/w/index.php'],
# ShoutWiki wikifarm # ShoutWiki wikifarm
['http://commandos.shoutwiki.com', 'http://commandos.shoutwiki.com/w/api.php', 'http://commandos.shoutwiki.com/w/index.php'], #['http://commandos.shoutwiki.com', 'http://commandos.shoutwiki.com/w/api.php', 'http://commandos.shoutwiki.com/w/index.php'],
# Wiki-site wikifarm # Wiki-site wikifarm
#['http://minlingo.wiki-site.com', 'http://minlingo.wiki-site.com/api.php', 'http://minlingo.wiki-site.com/index.php'], #['http://minlingo.wiki-site.com', 'http://minlingo.wiki-site.com/api.php', 'http://minlingo.wiki-site.com/index.php'],

@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import getopt import getopt
import argparse
import os import os
import re import re
import subprocess import subprocess
@ -30,89 +31,41 @@ from internetarchive import get_item
import dumpgenerator import dumpgenerator
# Configuration goes here
# You need a file named keys.txt with access and secret keys, in two different lines # You need a file named keys.txt with access and secret keys, in two different lines
accesskey = open('keys.txt', 'r').readlines()[0].strip() accesskey = open('keys.txt', 'r').readlines()[0].strip()
secretkey = open('keys.txt', 'r').readlines()[1].strip() secretkey = open('keys.txt', 'r').readlines()[1].strip()
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = 'opensource'
# Nothing to change below # Nothing to change below
convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'} convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
listfile = sys.argv[1]
uploadeddumps = []
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
def getParameters(params=[]):
if not params:
params = sys.argv[2:]
config = {
'prune-directories': False,
'prune-wikidump': False,
'collection': collection,
'update': False,
}
#console params
try:
opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin", "update"])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
for o, a in opts:
if o in ("-h","--help"):
usage()
sys.exit()
elif o in ("--prune-directories"):
config['prune-directories'] = True
elif o in ("--prune-wikidump"):
config['prune-wikidump'] = True
elif o in ("--admin"):
config['collection'] = "wikiteam"
elif o in ("--update"):
config['update'] = True
return config
def usage():
""" """
print """uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help."""
def log(wiki, dump, msg):
f = open('uploader-%s.log' % (listfile), 'a')
def log(wiki, dump, msg, config={}):
f = open('uploader-%s.log' % (config.listfile), 'a')
f.write('\n%s;%s;%s' % (wiki, dump, msg))
f.close()
def upload(wikis, config={}):
def upload(wikis, config={}, uploadeddumps=[]):
headers = {'User-Agent': dumpgenerator.getUserAgent()}
dumpdir = config.wikidump_dir
filelist = os.listdir(dumpdir)
for wiki in wikis:
print "#"*73
print "# Uploading", wiki
print "#"*73
wiki = wiki.lower()
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
configtemp = config
try:
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
except KeyError:
print "ERROR: could not produce the prefix for %s" % wiki
config = configtemp
wikiname = prefix.split('-')[0]
dumps = []
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for f in filenames:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
dumps.append(f)
for f in filelist:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
print "%s found" % f
dumps.append(f)
break
c = 0
@ -120,30 +73,33 @@ def upload(wikis, config={}):
wikidate = dump.split('-')[1]
item = get_item('wiki-' + wikiname)
if dump in uploadeddumps:
if config['prune-directories']:
if config.prune_directories:
rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
# With -f the deletion might have happened before and we won't know
if not os.system(rmline):
print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
if config.prune_wikidump and dump.endswith('wikidump.7z'):
# Simplistic quick&dirty check for the presence of this file in the item
stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
print "Checking content in previously uploaded files"
stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
dumphash = re.sub(' +.+\n?', '', stdout)
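# Illustrative sketch, not part of this patch: `md5sum file.7z` prints a line such as
# "d41d8cd98f00b204e9800998ecf8427e  file.7z", so the re.sub above strips everything
# after the hash. A hypothetical pure-Python equivalent would be:
#   import hashlib
#   dumphash = hashlib.md5(open(dumpdir + '/' + dump, 'rb').read()).hexdigest()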
if dumphash in map(lambda x: x['md5'], item.files):
log(wiki, dump, 'verified')
log(wiki, dump, 'verified', config)
rmline='rm -rf %s' % dump
rmline='rm -rf %s' % dumpdir + '/' + dump
if not os.system(rmline):
print 'DELETED ' + dump
print 'DELETED ' + dumpdir + '/' + dump
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print 'ERROR: The online item misses ' + dump
log(wiki, dump, 'missing')
log(wiki, dump, 'missing', config)
# We'll exit this if and go upload the dump
else:
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print '%s was not uploaded before' % dump
time.sleep(0.1)
wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
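# For example (illustrative, hypothetical filename): a dump named
# wikiexampleorg_w-20180115-history.xml.7z gives wikidate = '20180115', which the
# slicing above turns into wikidate_text = '2018-01-15'.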
@ -155,7 +111,7 @@ def upload(wikis, config={}):
# Logo path
logourl = ''
if ismissingitem or config['update']:
if ismissingitem or config.update:
#get metadata from api.php
#first sitename and base url
params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
@ -163,7 +119,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -198,7 +154,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -214,7 +170,7 @@ def upload(wikis, config={}):
raw = ''
try:
f = urllib.urlopen(baseurl)
f = urllib.urlopen(baseurl, timeout=10)
raw = f.read()
f.close()
except:
@ -238,7 +194,6 @@ def upload(wikis, config={}):
logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
except:
pass
print logourl
#retrieve some info from the wiki
wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
@ -264,7 +219,7 @@ def upload(wikis, config={}):
# Item metadata
md = {
'mediatype': 'web',
'collection': config['collection'],
'collection': config.collection,
'title': wikititle,
'description': wikidesc,
'language': lang,
@ -277,25 +232,54 @@ def upload(wikis, config={}):
#Upload files and update metadata
try:
item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
item.modify_metadata(md) # update
print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
uploadeddumps.append(dump)
log(wiki, dump, 'ok', config)
if logourl:
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl), timeout=10).read())
logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
uploadeddumps.append(dump)
log(wiki, dump, 'ok')
except:
print wiki, dump, 'error when uploading?'
except Exception as e:
print wiki, dump, 'Error when uploading?'
print e.message
c += 1
def main(params=[]):
config = getParameters(params=params)
parser = argparse.ArgumentParser("""uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help.""")
parser.add_argument('-pd', '--prune_directories', action='store_true')
parser.add_argument('-pw', '--prune_wikidump', action='store_true')
parser.add_argument('-a', '--admin', action='store_true')
parser.add_argument('-c', '--collection', default='opensource')
parser.add_argument('-wd', '--wikidump_dir', default='.')
parser.add_argument('-u', '--update', action='store_true')
parser.add_argument('listfile')
config = parser.parse_args()
if config.admin:
config.collection = 'wikiteam'
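# Illustrative usage, not part of this patch (paths and filenames are hypothetical):
#   python uploader.py mywikis.txt --prune_wikidump --wikidump_dir /data/dumps
#   python uploader.py mywikis.txt --admin   # upload into the wikiteam collection
# The short options defined above (-pw, -wd, -a, ...) work the same way.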
uploadeddumps = []
listfile = config.listfile
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
wikis = open(listfile, 'r').read().strip().splitlines()
upload(wikis, config)
upload(wikis, config, uploadeddumps)
if __name__ == "__main__":
main()

@ -24,7 +24,7 @@ def main():
site = pywikibot.Site('wikiapiary', 'wikiapiary')
catname = 'Category:Website'
cat = pywikibot.Category(site, catname)
gen = pagegenerators.CategorizedPageGenerator(cat, start='Spyropedia')
gen = pagegenerators.CategorizedPageGenerator(cat, start='!')
pre = pagegenerators.PreloadingGenerator(gen)
for page in pre:
@ -52,7 +52,8 @@ def main():
print('No API found in WikiApiary, skipping')
continue
urliasearch = 'https://archive.org/search.php?query=originalurl:"%s"' % (apiurl)
indexurl = 'index.php'.join(apiurl.rsplit('api.php', 1))
urliasearch = 'https://archive.org/search.php?query=originalurl:"%s" OR originalurl:"%s"' % (apiurl, indexurl)
f = urllib.request.urlopen(urliasearch)
raw = f.read().decode('utf-8')
if re.search(r'(?i)Your search did not match any items', raw):

@ -0,0 +1,458 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com
import csv
import datetime
import os
import random
import re
import subprocess
import sys
import time
import urllib.request
#from internetarchive import get_item
# Requirements:
# zip command (apt-get install zip)
# ia command (pip install internetarchive, and configured properly)
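# Illustrative setup, not part of this script (commands assume a Debian-like system):
#   sudo apt-get install zip
#   pip install internetarchive
#   ia configure   # stores your archive.org credentials for the `ia` command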
"""
# You need a file with access and secret keys, in two different lines
iakeysfilename = '%s/.iakeys' % (os.path.expanduser('~'))
if os.path.exists(iakeysfilename):
accesskey = open(iakeysfilename, 'r').readlines()[0].strip()
secretkey = open(iakeysfilename, 'r').readlines()[1].strip()
else:
print('Error, no %s file with S3 keys for Internet Archive account' % (iakeysfilename))
sys.exit()
"""
def saveURL(wikidomain='', url='', filename='', path='', overwrite=False, iteration=1):
filename2 = '%s/%s' % (wikidomain, filename)
if path:
filename2 = '%s/%s/%s' % (wikidomain, path, filename)
if os.path.exists(filename2):
if not overwrite:
print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
return
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
try:
urllib.request.urlretrieve(url, filename2)
except:
sleep = 10 # seconds
maxsleep = 30
while sleep <= maxsleep:
try:
print('Error while retrieving: %s' % (url))
print('Retry in %s seconds...' % (sleep))
time.sleep(sleep)
urllib.request.urlretrieve(url, filename2)
return
except:
sleep = sleep * 2
print('Download failed')
# Sometimes Wikispaces returns invalid data; re-download in those cases.
# Only for 'pages'; 'files' binaries are a pain to open and check.
if (os.path.exists(filename2) and 'pages' in path) or \
(os.path.exists(filename2) and path == '' and filename2.split('.')[-1] in ['xml', 'html', 'csv']):
sleep2 = 60 * iteration
raw = ''
try:
with open(filename2, 'r', encoding='utf-8') as f:
raw = f.read()
except:
with open(filename2, 'r', encoding='latin-1') as f:
raw = f.read()
if re.findall(r'(?im)<title>TES and THE Status</title>', raw):
print('Warning: invalid content. Waiting %d seconds and re-downloading' % (sleep2))
time.sleep(sleep2)
saveURL(wikidomain=wikidomain, url=url, filename=filename, path=path, overwrite=overwrite, iteration=iteration+1)
def undoHTMLEntities(text=''):
""" Undo some HTML codes """
# I guess only < > & " ' need conversion
# http://www.w3schools.com/html/html_entities.asp
text = re.sub('&lt;', '<', text)
text = re.sub('&gt;', '>', text)
text = re.sub('&amp;', '&', text)
text = re.sub('&quot;', '"', text)
text = re.sub('&#039;', '\'', text)
return text
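# Example (illustrative): undoHTMLEntities(text='&quot;R&amp;D&quot;') returns '"R&D"';
# only the five entities handled above are decoded.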
def convertHTML2Wikitext(wikidomain='', filename='', path=''):
wikitext = ''
wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
if not os.path.exists(wikitextfile):
print('Error retrieving wikitext; the page is probably a redirect')
return
with open(wikitextfile, 'r') as f:
wikitext = f.read()
with open(wikitextfile, 'w') as f:
m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
if m:
try:
wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
wikitext = undoHTMLEntities(text=wikitext)
except:
pass
f.write(wikitext)
def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
pagenameplus = re.sub(' ', '+', pagename)
pagename_ = urllib.parse.quote(pagename)
#page current revision (html & wikitext)
pageurl = '%s/%s' % (wikiurl, pagename_)
filename = '%s.html' % (pagenameplus)
print('Downloading page: %s' % (filename))
saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
filename2 = '%s.wikitext' % (pagenameplus)
print('Downloading page: %s' % (filename2))
saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')
#csv with page history
csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
csvfilename = '%s.history.csv' % (pagenameplus)
print('Downloading page: %s' % (csvfilename))
saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)
def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
filenameplus = re.sub(' ', '+', filename)
filename_ = urllib.parse.quote(filename)
#file full resolution
fileurl = '%s/file/view/%s' % (wikiurl, filename_)
filename = filenameplus
print('Downloading file: %s' % (filename))
saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)
#csv with file history
csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
csvfilename = '%s.history.csv' % (filenameplus)
print('Downloading file: %s' % (csvfilename))
saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)
def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
print('Downloading Pages and Files from %s' % (wikiurl))
#csv all pages and files
csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
#download every page and file
totallines = 0
with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
totallines = len(f.read().splitlines()) - 1
with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
filesc = 0
pagesc = 0
print('This wiki has %d pages and files' % (totallines))
rows = csv.reader(csvfile, delimiter=',', quotechar='"')
for row in rows:
if row[0] == 'file':
filesc += 1
filename = row[1]
downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
elif row[0] == 'page':
pagesc += 1
pagename = row[1]
downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
if (filesc + pagesc) % 10 == 0:
print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
print('Downloaded %d pages' % (pagesc))
print('Downloaded %d files' % (filesc))
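# Illustrative note: pages-and-files.csv as returned by Wikispaces is assumed to hold one
# row per item, with the type in the first column and the name in the second, e.g.
#   page,Home
#   file,logo.png
# (hypothetical rows), which is what the row[0]/row[1] checks above rely on.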
def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
print('Downloading sitemap.xml')
saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)
def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
print('Downloading index.html')
saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)
def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
index = '%s/index.html' % (wikidomain)
if os.path.exists(index):
raw = ''
try:
with open(index, 'r', encoding='utf-8') as f:
raw = f.read()
except:
with open(index, 'r', encoding='latin-1') as f:
raw = f.read()
m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', raw)
if m:
logourl = m[0]
logofilename = logourl.split('/')[-1]
print('Downloading logo')
saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
return logofilename
return ''
def printhelp():
helptext = """This script downloads (and uploads) WikiSpaces wikis.
Parameters available:
--upload: upload compressed file with downloaded wiki
--admin: add item to WikiTeam collection (if you are an admin in that collection)
--overwrite: download again even if files exist locally
--overwrite-ia: upload again to Internet Archive even if item exists there
--help: prints this help text
Examples:
python3 wikispaces.py https://mywiki.wikispaces.com
It downloads that wiki
python3 wikispaces.py wikis.txt
It downloads a list of wikis (file format is a URL per line)
python3 wikispaces.py https://mywiki.wikispaces.com --upload
It downloads that wiki, compresses it and uploads it to the Internet Archive
"""
print(helptext)
sys.exit()
def duckduckgo():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
wikis = []
ignorewikis = [
'https://wikispaces.com',
'https://www.wikispaces.com',
'https://wikispaces.net',
'https://www.wikispaces.net',
]
for i in range(1, 100000):
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikispaces.com' % (random.randint(100, 5000), random.randint(1000, 9999))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
wiki = re.sub(r'https://www\.', 'https://', wiki)
if not wiki in wikis and not wiki in ignorewikis:
wikis.append(wiki)
yield wiki
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
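# Illustrative usage, not part of this script: the generator above can be consumed lazily,
# discovering wikis one by one from the search results:
#   for wiki in duckduckgo():
#       print(wiki)   # e.g. https://somewiki.wikispaces.com (hypothetical)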
def main():
upload = False
isadmin = False
overwrite = False
overwriteia = False
if len(sys.argv) < 2:
printhelp()
param = sys.argv[1]
if not param:
printhelp()
if len(sys.argv) > 2:
if '--upload' in sys.argv:
upload = True
if '--admin' in sys.argv:
isadmin = True
if '--overwrite' in sys.argv:
overwrite = True
if '--overwrite-ia' in sys.argv:
overwriteia = True
if '--help' in sys.argv:
printhelp()
wikilist = []
if '://' in param:
wikilist.append(param.rstrip('/'))
elif param.lower() == 'duckduckgo':
wikilist = duckduckgo()
#for wiki in wikilist:
# print(wiki)
else:
with open(param, 'r') as f:
wikilist = f.read().strip().splitlines()
wikilist2 = []
for wiki in wikilist:
wikilist2.append(wiki.rstrip('/'))
wikilist = wikilist2
for wikiurl in wikilist:
wikidomain = wikiurl.split('://')[1].split('/')[0]
print('\n')
print('#'*40,'\n Downloading:', wikiurl)
print('#'*40,'\n')
if upload and not overwriteia:
itemid = 'wiki-%s' % (wikidomain)
try:
iahtml = ''
try:
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
except:
time.sleep(10)
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
if iahtml and not re.findall(r'(?im)Item cannot be found', iahtml):
if not overwriteia:
print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
print('You can find it in https://archive.org/details/%s' % (itemid))
time.sleep(1)
continue
except:
pass
dirfiles = '%s/files' % (wikidomain)
if not os.path.exists(dirfiles):
print('Creating directory %s' % (dirfiles))
os.makedirs(dirfiles)
dirpages = '%s/pages' % (wikidomain)
if not os.path.exists(dirpages):
print('Creating directory %s' % (dirpages))
os.makedirs(dirpages)
sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
if not os.path.exists('%s/sitemap.xml' % (wikidomain)):
print('Error, wiki was probably deleted. Skipping wiki...')
continue
else:
sitemapraw = ''
try:
with open('%s/sitemap.xml' % (wikidomain), encoding='utf-8') as g:
sitemapraw = g.read()
except:
with open('%s/sitemap.xml' % (wikidomain), encoding='latin-1') as g:
sitemapraw = g.read()
if re.search(r'(?im)<h1>This wiki has been deactivated</h1>', sitemapraw):
print('Error, wiki was deactivated. Skipping wiki...')
continue
downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
if not os.path.exists('%s/index.html' % (wikidomain)):
print('Error, wiki was probably deleted or expired. Skipping wiki...')
continue
else:
indexraw = ''
try:
with open('%s/index.html' % (wikidomain), encoding='utf-8') as g:
indexraw = g.read()
except:
with open('%s/index.html' % (wikidomain), encoding='latin-1') as g:
indexraw = g.read()
if re.search(r'(?im)<h1>Subscription Expired</h1>', indexraw):
print('Error, wiki subscription expired. Skipping wiki...')
continue
downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
if upload:
itemid = 'wiki-%s' % (wikidomain)
print('\nCompressing dump...')
wikidir = wikidomain
os.chdir(wikidir)
print('Changed directory to', os.getcwd())
wikizip = '%s.zip' % (wikidomain)
subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
os.chdir('..')
print('Changed directory to', os.getcwd())
print('\nUploading to Internet Archive...')
indexfilename = '%s/index.html' % (wikidir)
if not os.path.exists(indexfilename):
print('\nError: dump incomplete, skipping upload\n')
continue
indexhtml = ''
try:
with open(indexfilename, 'r', encoding='utf-8') as f:
indexhtml = f.read()
except:
with open(indexfilename, 'r', encoding='latin-1') as f:
indexhtml = f.read()
wikititle = ''
try:
wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
except:
wikititle = wikidomain
if not wikititle:
wikititle = wikidomain
wikititle = wikititle.replace("\\'", " ")
wikititle = wikititle.replace('\\"', " ")
itemtitle = 'Wiki - %s' % wikititle
itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
itemoriginalurl = wikiurl
itemlicenseurl = ''
m = ''
try:
m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
except:
m = ''
if m:
itemlicenseurl = m[0]
if not itemlicenseurl:
itemtags.append('unknowncopyright')
itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
itemcollection = isadmin and 'wikiteam' or 'opensource'
itemlang = 'Unknown'
itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''
callplain = "ia upload %s %s %s --metadata='mediatype:web' --metadata='collection:%s' --metadata='title:%s' --metadata='description:%s' --metadata='language:%s' --metadata='last-updated-date:%s' --metadata='originalurl:%s' %s %s" % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemoriginalurl, itemlicenseurl and "--metadata='licenseurl:%s'" % (itemlicenseurl) or '', itemtags_)
print(callplain)
subprocess.call(callplain, shell=True)
"""
md = {
'mediatype': 'web',
'collection': itemcollection,
'title': itemtitle,
'description': itemdesc,
'language': itemlang,
'last-updated-date': itemdate,
'subject': '; '.join(itemtags),
'licenseurl': itemlicenseurl,
'originalurl': itemoriginalurl,
}
item = get_item(itemid)
item.upload(wikizip, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
item.modify_metadata(md)
if itemlogo:
item.upload(itemlogo, access_key=accesskey, secret_key=secretkey, verbose=True)
"""
print('You can find it in https://archive.org/details/%s' % (itemid))
os.remove(wikizip)
if __name__ == "__main__":
main()

@ -228,7 +228,11 @@ def mwGetImageNamesAPI(config={}):
url = mwCurateImageURL(config=config, url=url)
# encoding to ascii is needed to work around this horrible bug:
# http://bugs.python.org/issue8136
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
if 'mwapi' in config and '.wikia.com' in config['mwapi']:
#to avoid latest?cb=20120816112532 in filenames
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')
else:
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
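# Illustrative example (hypothetical URL): for a Wikia image URL like
#   http://img3.wikia.nocookie.net/__cb20120816112532/somewiki/images/a/ab/Example_image.png/revision/latest?cb=20120816112532
# url.split('/')[-1] is 'latest?cb=20120816112532' while url.split('/')[-3] recovers
# 'Example_image.png', hence the special case above.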
uploader = re.sub('_', ' ', image['user'])
imagenames.append([filename, url, uploader])
else:
