pull/346/head
emijrp 5 years ago
commit aecee2dc53

.gitattributes

@ -0,0 +1,2 @@
*.com linguist-vendored
*.org linguist-vendored

@ -4,3 +4,5 @@ install:
- pip install tox
script:
- tox
notifications:
email: false

@ -1,7 +1,7 @@
# WikiTeam
### We archive wikis, from Wikipedia to the tiniest wikis
**WikiTeam software is a set of tools for archiving wikis.** They work on MediaWiki wikis, but we want to expand to other wiki engines. As of January 2016, WikiTeam has preserved more than [27,000 stand-alone wikis](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups), several wikifarms, regular Wikipedia dumps and [34 TB of Wikimedia Commons images](https://archive.org/details/wikimediacommons).
**WikiTeam software is a set of tools for archiving wikis.** They work on MediaWiki wikis, but we want to expand to other wiki engines. As of 2019, WikiTeam has preserved more than [250,000 wikis](https://github.com/WikiTeam/wikiteam/wiki/Available-Backups), several wikifarms, regular Wikipedia dumps and [34 TB of Wikimedia Commons images](https://archive.org/details/wikimediacommons).
There are [thousands](http://wikiindex.org) of [wikis](https://wikiapiary.com) on the Internet. Every day some of them stop being publicly available and, for lack of backups, are lost forever. Millions of people download tons of media files (movies, music, books, etc.) from the Internet, serving as a kind of distributed backup. Wikis, most of them under free licenses, disappear from time to time because nobody grabbed a copy of them. That is a shame that we would like to fix.
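A typical invocation (a minimal, illustrative example; the wiki URL is hypothetical, and the flags mirror the `dumpgenerator.py` call used by `launcher.py` later in this diff):

`./dumpgenerator.py --api=http://wiki.example.org/w/api.php --xml --images`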

File diff suppressed because it is too large

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# dumpgenerator.py A generator of dumps for wikis
# Copyright (C) 2011-2016 WikiTeam developers
# Copyright (C) 2011-2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -20,7 +20,7 @@
# https://github.com/WikiTeam/wikiteam/wiki
try:
from kitchen.text.converters import getwriter
from kitchen.text.converters import getwriter, to_unicode
except ImportError:
print "Please install the kitchen module."
import cookielib
@ -39,17 +39,31 @@ except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import os
import re
import subprocess
try:
import requests
except ImportError:
print "Please install or update the Requests module."
sys.exit(1)
try:
import wikitools
except ImportError:
print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
try:
from lxml import etree
from lxml.builder import E
except ImportError:
print "Please install the lxml module if you want to use --xmlrevisions."
import time
import urllib
try:
from urlparse import urlparse, urlunparse
except ImportError:
from urllib.parse import urlparse, urlunparse
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
__VERSION__ = '0.3.0-alpha' # major, minor, micro: semver.org
__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
class PageMissingError(Exception):
def __init__(self, title, xml):
@ -150,7 +164,7 @@ def getNamespacesScraper(config={}, session=None):
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
r = session.post(
url=config['index'], data={'title': 'Special:Allpages'})
url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
raw = r.text
delay(config=config, session=session)
@ -187,33 +201,41 @@ def getNamespacesAPI(config={}, session=None):
if namespaces:
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
'format': 'json'},
timeout=30
)
result = getJSON(r)
delay(config=config, session=session)
try:
nsquery = result['query']['namespaces']
except KeyError:
print "Error: could not get namespaces from the API request"
print "HTTP %d" % r.status_code
print r.text
return None
if 'all' in namespaces:
namespaces = []
for i in result['query']['namespaces'].keys():
for i in nsquery.keys():
if int(i) < 0: # -1: Special, -2: Media, excluding
continue
namespaces.append(int(i))
namespacenames[int(i)] = result['query']['namespaces'][i]['*']
namespacenames[int(i)] = nsquery[i]['*']
else:
# check if those namespaces really exist in this wiki
namespaces2 = []
for i in result['query']['namespaces'].keys():
for i in nsquery.keys():
bi = i
i = int(i)
if i < 0: # -1: Special, -2: Media, excluding
continue
if i in namespaces:
namespaces2.append(i)
namespacenames[i] = result['query']['namespaces'][bi]['*']
namespacenames[i] = nsquery[bi]['*']
namespaces = namespaces2
else:
namespaces = [0]
@ -249,7 +271,7 @@ def getPageTitlesAPI(config={}, session=None):
retryCount = 0
while retryCount < config["retries"]:
try:
r = session.post(url=config['api'], data=params)
r = session.post(url=config['api'], data=params, timeout=30)
break
except ConnectionError as err:
print "Connection error: %s" % (str(err),)
@ -271,21 +293,27 @@ def getPageTitlesAPI(config={}, session=None):
apfrom = jsontitles['continue']['apcontinue']
elif 'apfrom' in jsontitles['continue']:
apfrom = jsontitles['continue']['apfrom']
# print apfrom
# print jsontitles
allpages = jsontitles['query']['allpages']
try:
allpages = jsontitles['query']['allpages']
except KeyError:
print "The allpages API returned nothing. Exit."
sys.exit(1)
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
for page in allpages:
yield page['title']
title = page['title']
titles.append(title)
yield title
c += len(allpages)
if len(titles) != len(set(titles)):
# probably we are in a loop, server returning dupe titles, stop
# it
print 'Probably a loop, finishing'
print 'Probably a loop, switching to next namespace. Duplicate title:'
print title
titles = list(set(titles))
apfrom = ''
@ -301,7 +329,7 @@ def getPageTitlesScraper(config={}, session=None):
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (
config['index'], namespace)
r = session.get(url=url)
r = session.get(url=url, timeout=30)
raw = r.text
raw = cleanHTML(raw)
@ -353,7 +381,7 @@ def getPageTitlesScraper(config={}, session=None):
# to avoid reload dupe subpages links
checked_suballpages.append(name)
delay(config=config, session=session)
r2 = session.get(url=url)
r2 = session.get(url=url, timeout=10)
raw2 = r2.text
raw2 = cleanHTML(raw2)
rawacum += raw2 # merge it after removed junk
@ -386,13 +414,11 @@ def getPageTitles(config={}, session=None):
titles = []
if 'api' in config and config['api']:
r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
test = getJSON(r)
if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
titles = getPageTitlesScraper(config=config, session=session)
else:
try:
titles = getPageTitlesAPI(config=config, session=session)
except:
print "Error: could not get page titles from the API"
titles = getPageTitlesScraper(config=config, session=session)
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
@ -412,7 +438,7 @@ def getPageTitles(config={}, session=None):
print '%d page titles loaded' % (c)
return titlesfilename
def getImageNames(config={}, session=None):
""" Get list of image names """
@ -436,39 +462,60 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
print config['api']
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
xml = None
try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
print 'Getting the XML header from the API'
r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
xml = r.json()['query']['export']['*']
if not xml:
r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
xml = r.text
except requests.exceptions.RetryError:
pass
else:
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
pass
try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'},
timeout=120
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml):
print 'XML export on this wiki is broken, quitting.'
logerror(u'XML export on this wiki is broken, quitting.')
sys.exit()
if config['xmlrevisions']:
# Try again the old way
print 'Export test via the API failed. Wiki too old? Trying without xmlrevisions.'
config['xmlrevisions'] = False
header, config = getXMLHeader(config=config, session=session)
else:
print 'XML export on this wiki is broken, quitting.'
logerror(u'XML export on this wiki is broken, quitting.')
sys.exit()
return header, config
@ -512,7 +559,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['pages'], wait)
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# limit = 1 from mother function)
@ -521,6 +568,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c >= maxretries:
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
if config['failfast']:
print "Exit, it will be for another time"
sys.exit()
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
@ -550,7 +600,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
r = session.post(url=config['index'], data=params, headers=headers)
r = session.post(url=config['index'], params=params, headers=headers, timeout=10)
handleStatusCode(r)
xml = fixBOM(r)
except requests.exceptions.ConnectionError as e:
@ -675,10 +725,9 @@ def cleanXML(xml=''):
def generateXMLDump(config={}, titles=[], start=None, session=None):
""" Generates a XML dump for a list of titles """
""" Generates a XML dump for a list of titles or from revision IDs """
# TODO: titles is now unused.
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header, config = getXMLHeader(config=config, session=session)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
@ -686,48 +735,189 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
config['curonly'] and 'current' or 'history')
xmlfile = ''
lock = True
if start:
print "Removing the last chunk of past XML dump: it is probably incomplete."
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
if config['xmlrevisions']:
print 'Retrieving the XML for every page from the beginning'
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
r_timestamp = r'<timestamp>([^<]+)</timestamp>'
for xml in getXMLRevisions(config=config, session=session):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, it's expected this may be less
print "%d more revisions exported" % numrevs
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
except AttributeError:
print "This wikitools module version is not working"
sys.exit()
else:
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
if start:
print "Removing the last chunk of past XML dump: it is probably incomplete."
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
xmlfile.write(footer)
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def getXMLRevisions(config={}, session=None, allpages=False):
site = wikitools.wiki.Wiki(config['api'])
if not 'all' in config['namespaces']:
namespaces = config['namespaces']
else:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
try:
for namespace in namespaces:
print "Trying to export all revisions from namespace %s" % namespace
arvparams = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 500,
'arvnamespace': namespace
}
if not config['curonly']:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
arvrequest = wikitools.api.APIRequest(site, arvparams)
results = arvrequest.queryGen()
for result in results:
for page in result['query']['allrevisions']:
yield makeXmlFromPage(page)
else:
# Just cycle through revision IDs and use the XML as is
arvparams['arvprop'] = 'ids'
arvrequest = wikitools.api.APIRequest(site, arvparams)
arvresults = arvrequest.queryGen()
for result in arvresults:
revids = []
for page in result['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
print "%d more revisions listed, until %s" % (len(revids), revids[-1])
exportparams = {
'action': 'query',
'revids': '|'.join(revids),
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
except KeyError:
print "Warning. Could not use allrevisions, wiki too old."
if config['curonly']:
for title in readTitles(config):
exportparams = {
'action': 'query',
'titles': title,
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
else:
for title in readTitles(config):
pparams = {
'action': 'query',
'titles': title,
'prop': 'revisions',
'rvlimit': 'max',
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
'rawcontinue': 'yes'
}
prequest = wikitools.api.APIRequest(site, pparams)
try:
results = prequest.query()
pages = results['query']['pages']
except KeyError:
raise PageMissingError(title, xml='')
for page in pages:
try:
xml = makeXmlFromPage(pages[page])
except PageMissingError:
logerror(
config=config,
text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
)
continue
yield xml
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
sys.exit()
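# Editor's sketch (not part of this patch): the same allrevisions + export
# flow expressed directly with the requests session imported above, for wikis
# where wikitools is unavailable. It assumes a MediaWiki 1.27+ api.php URL
# (the requirement stated in the --xmlrevisions help text) and the modern
# 'continue' continuation format; parameter names follow the code above.
def getXMLRevisionsSketch(api, session):
    arvparams = {'action': 'query', 'list': 'allrevisions',
                 'arvlimit': 50, 'arvprop': 'ids', 'format': 'json'}
    while True:
        result = session.get(api, params=arvparams, timeout=30).json()
        # Collect this batch's revision IDs, as the wikitools loop above does
        revids = [str(rev['revid'])
                  for page in result['query']['allrevisions']
                  for rev in page['revisions']]
        if revids:
            # action=query&export puts the XML in ['query']['export']['*'],
            # the same path getXMLHeader() reads above
            exported = session.get(api, params={
                'action': 'query', 'export': '1',
                'revids': '|'.join(revids), 'format': 'json'}, timeout=30).json()
            yield exported['query']['export']['*']
        if 'continue' in result:
            arvparams.update(result['continue'])
        else:
            break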
def makeXmlFromPage(page):
""" Output an XML document as a string from a page as in the API JSON """
try:
p = E.page(
E.title(page['title']),
E.ns(to_unicode(page['ns'])),
E.id(to_unicode(page['pageid'])),
)
for rev in page['revisions']:
revision = E.revision(
E.id(to_unicode(rev['revid'])),
E.parentid(to_unicode(rev['parentid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(to_unicode(rev['userid'])),
E.username(to_unicode(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
)
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
# The sha1 may not have been backfilled on older wikis, or may be missing for other reasons (Wikia).
if 'sha1' in rev:
revision.append(E.sha1(rev['sha1']))
p.append(revision)
except KeyError:
raise PageMissingError(page['title'], '')
return etree.tostring(p, pretty_print=True)
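# Editor's note, illustrative only: makeXmlFromPage() expects the page dict
# shape returned by prop=revisions / list=allrevisions above. A hypothetical
# minimal input (all values made up) would be:
#
#   page = {'title': u'Main Page', 'ns': 0, 'pageid': 1,
#           'revisions': [{'revid': 10, 'parentid': 0,
#                          'timestamp': u'2018-01-01T00:00:00Z',
#                          'userid': 1, 'user': u'Admin', 'comment': u'init',
#                          'size': 5, 'contentmodel': u'wikitext',
#                          'sha1': u'0123456789abcdef0123456789abcdef01234567',
#                          '*': u'Hello'}]}
#   makeXmlFromPage(page)  # -> a '<page>...</page>' string serialized by lxml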
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
@ -863,10 +1053,11 @@ def getImageNamesScraper(config={}, session=None):
# http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
r = session.post(
url=config['index'],
data={
params={
'title': 'Special:Imagelist',
'limit': limit,
'offset': offset})
'offset': offset},
timeout=30)
raw = r.text
delay(config=config, session=session)
# delicate wiki
@ -967,7 +1158,7 @@ def getImageNamesAPI(config={}, session=None):
'format': 'json',
'ailimit': 500}
# FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params)
r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1025,7 +1216,7 @@ def getImageNamesAPI(config={}, session=None):
'iiprop': 'user|url',
'format': 'json'}
# FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params)
r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1112,10 +1303,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
# saving description if any
try:
title = u'Image:%s' % (filename)
xmlfiledesc = getXMLFileDesc(
config=config,
title=title,
session=session) # use Image: for backwards compatibility
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
xmlfiledesc = r.text
else:
xmlfiledesc = getXMLFileDesc(
config=config,
title=title,
session=session) # use Image: for backwards compatibility
except PageMissingError:
xmlfiledesc = ''
logerror(
@ -1170,7 +1365,7 @@ def domain2prefix(config={}, session=None):
domain = config['index']
domain = domain.lower()
domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
domain = re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '', domain)
domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
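# Editor's note, illustrative only: the widened pattern also strips anything
# that follows index.php/api.php (query strings, page titles). For a
# hypothetical URL:
#   re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '',
#          'http://wiki.example.org/w/api.php?action=query')  # -> 'wiki.example.org/w'
# whereas the old pattern left '?action=query' behind; the substitutions above
# then reduce it to the prefix 'wikiexampleorg_w'.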
@ -1211,8 +1406,9 @@ def welcome():
message += ''
message += "\n"
message += "#" * 73
message += "\n"
message += "# Copyright (C) 2011-%d WikiTeam developers #\n" % (datetime.datetime.now().year)
message += """
# Copyright (C) 2011-2014 WikiTeam #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
@ -1299,7 +1495,9 @@ def getParameters(params=[]):
action='store_true',
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages')
help='store only the current version of pages')
groupDownload.add_argument('--xmlrevisions', action='store_true',
help='download all revisions from an API generator. MediaWiki 1.27+ only.')
groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump")
groupDownload.add_argument(
@ -1319,6 +1517,10 @@ def getParameters(params=[]):
'--get-wiki-engine',
action='store_true',
help="returns the wiki engine")
groupMeta.add_argument(
'--failfast',
action='store_true',
help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.")
args = parser.parse_args()
# print args
@ -1350,11 +1552,22 @@ def getParameters(params=[]):
print 'Using cookies from %s' % args.cookies
session = requests.Session()
try:
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
# Courtesy datashaman https://stackoverflow.com/a/35504626
__retries__ = Retry(total=5,
backoff_factor=2,
status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=__retries__))
session.mount('http://', HTTPAdapter(max_retries=__retries__))
except:
# Our urllib3/requests is too old
pass
session.cookies = cj
session.headers.update({'User-Agent': getUserAgent()})
if args.user and args.password:
session.auth = (args.user, args.password)
# session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
# check URLs
for url in [args.api, args.index, args.wiki]:
@ -1392,6 +1605,7 @@ def getParameters(params=[]):
retry = 0
maxretries = args.retries
retrydelay = 20
check = None
while retry < maxretries:
try:
check = checkAPI(api=api, session=session)
@ -1427,15 +1641,20 @@ def getParameters(params=[]):
session=session):
print 'index.php is OK'
else:
index = '/'.join(index.split('/')[:-1])
try:
index = '/'.join(index.split('/')[:-1])
except AttributeError:
index = None
if index and checkIndex(
index=index,
cookies=args.cookies,
session=session):
print 'index.php is OK'
else:
print 'Error in index.php, please, provide a correct path to index.php'
sys.exit(1)
print 'Error in index.php.'
if not args.xmlrevisions:
print 'Please, provide a correct path to index.php or use --xmlrevisions. Terminating.'
sys.exit(1)
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
@ -1483,10 +1702,12 @@ def getParameters(params=[]):
'curonly': args.curonly,
'date': datetime.datetime.now().strftime('%Y%m%d'),
'api': api,
'failfast': args.failfast,
'index': index,
'images': args.images,
'logs': False,
'xml': args.xml,
'xmlrevisions': args.xmlrevisions,
'namespaces': namespaces,
'exnamespaces': exnamespaces,
'path': args.path and os.path.normpath(args.path) or '',
@ -1520,18 +1741,23 @@ def checkAPI(api=None, session=None):
data={
'action': 'query',
'meta': 'siteinfo',
'format': 'json'}
'format': 'json'},
timeout=30
)
if r.url == api:
if r.status_code == 200:
break
else:
api = r.url
elif r.status_code < 400:
p = urlparse(r.url)
api = urlunparse([p.scheme, p.netloc, p.path, '', '', ''])
elif r.status_code > 400:
print "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
return False
if "MediaWiki API is not enabled for this site." in r.text:
return False
try:
result = getJSON(r)
index = None
if result['query']:
if result:
try:
index = result['query']['general']['server'] + \
result['query']['general']['script']
@ -1548,7 +1774,7 @@ def checkAPI(api=None, session=None):
def checkIndex(index=None, cookies=None, session=None):
""" Checking index.php availability """
r = session.post(url=index, data={'title': 'Special:Version'})
r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
raw = r.text
print 'Checking index.php...', index
# Workaround for issue 71
@ -1587,7 +1813,11 @@ def getJSON(request):
"""Strip Unicode BOM"""
if request.text.startswith(u'\ufeff'):
request.encoding = 'utf-8-sig'
return request.json()
try:
return request.json()
except:
# Maybe an older API version which did not return correct JSON
return {}
def fixBOM(request):
@ -1633,6 +1863,8 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
else:
print 'XML dump seems to be corrupted.'
reply = ''
if config['failfast']:
reply = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
if reply.lower() in ['yes', 'y']:
@ -1679,7 +1911,7 @@ def resumePreviousDump(config={}, other={}):
if lasttitle == '':
lasttitle=lasttitles.next()
except:
pass # probably file does not exists
lasttitle = '' # probably the file does not exist
if lasttitle == '--END--':
# titles list is complete
print 'Title list was completed in the previous session'
@ -1810,7 +2042,7 @@ def saveSpecialVersion(config={}, session=None):
else:
print 'Downloading Special:Version with extensions and other related info'
r = session.post(
url=config['index'], data={'title': 'Special:Version'})
url=config['index'], params={'title': 'Special:Version'}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@ -1825,14 +2057,13 @@ def saveIndexPHP(config={}, session=None):
print 'index.html exists, do not overwrite'
else:
print 'Downloading index.php (Main Page) as index.html'
r = session.post(url=config['index'], data={})
r = session.post(url=config['index'], params={}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
with open('%s/index.html' % (config['path']), 'w') as outfile:
outfile.write(raw.encode('utf-8'))
def saveSiteInfo(config={}, session=None):
""" Save a file with site info """
@ -1845,30 +2076,33 @@ def saveSiteInfo(config={}, session=None):
# MediaWiki 1.13+
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
'sinumberingroup': 1,
'format': 'json'})
'format': 'json'},
timeout=10)
# MediaWiki 1.11-1.12
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
'format': 'json'})
'format': 'json'},
timeout=10)
# MediaWiki 1.8-1.10
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces',
'format': 'json'})
'format': 'json'},
timeout=10)
result = getJSON(r)
delay(config=config, session=session)
with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
@ -1879,10 +2113,14 @@ def avoidWikimediaProjects(config={}, other={}):
""" Skip Wikimedia projects and redirect to the dumps website """
# notice about wikipedia dumps
url = ''
if config['api']:
url = url + config['api']
if config['index']:
url = url + config['index']
if re.findall(
r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
config['api'] +
config['index']):
url):
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
print 'Download the dumps from http://dumps.wikimedia.org'
if not other['force']:
@ -1895,9 +2133,9 @@ def getWikiEngine(url=''):
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url)
r = session.post(url=url, timeout=30)
if r.status_code == 405 or r.text == '':
r = session.get(url=url)
r = session.get(url=url, timeout=120)
result = r.text
wikiengine = 'Unknown'
@ -1980,7 +2218,7 @@ def mwGetAPIAndIndex(url=''):
index = ''
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url)
r = session.post(url=url, timeout=120)
result = r.text
# API
@ -2042,6 +2280,8 @@ def main(params=[]):
while not other['resume'] and os.path.isdir(config['path']):
print '\nWarning!: "%s" path exists' % (config['path'])
reply = ''
if config['failfast']:
reply = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input(
'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %

@ -6,12 +6,12 @@
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
@ -30,11 +30,11 @@ def main():
if len(sys.argv) < 2:
print 'python script.py file-with-apis.txt'
sys.exit()
print 'Reading list of APIs from', sys.argv[1]
wikis = open(sys.argv[1], 'r').read().splitlines()
print '%d APIs found' % (len(wikis))
for wiki in wikis:
print "#"*73
print "# Downloading", wiki
@ -42,17 +42,15 @@ def main():
wiki = wiki.lower()
# Make the prefix in standard way; api and index must be defined, not important which is which
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
# Check if compressed; in that case the dump was finished previously
compressed = False
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for f in filenames:
if f.startswith(prefix) and f.endswith('.7z'):
compressed = True
zipfilename = f
for f in os.listdir('.'):
if f.startswith(prefix) and f.endswith('.7z'):
compressed = True
zipfilename = f
break # stop searching, do not explore subdirectories
if compressed:
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
# Get the archive's file list.
@ -67,18 +65,17 @@ def main():
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
# TODO: Find a way like grep -q below without doing a 7z l multiple times?
continue
#download
started = False #was this wiki download started before? then resume
wikidir = ''
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for d in dirnames:
if d.startswith(prefix):
wikidir = d
started = True
for f in os.listdir('.'):
# Does not find numbered wikidumps nor verify that entries are directories
if f.startswith(prefix) and f.endswith('wikidump'):
wikidir = f
started = True
break # stop searching, do not explore subdirectories
# time.sleep(60)
# Uncomment the sleep above and add --delay=60 to the dumpgenerator.py calls below for broken wiki farms
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
@ -90,15 +87,14 @@ def main():
subprocess.call('./dumpgenerator.py --api=%s --xml --images --delay=1' % wiki, shell=True)
started = True
#save wikidir now
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for d in dirnames:
if d.startswith(prefix):
wikidir = d
for f in os.listdir('.'):
# Does not find numbered wikidumps nor verify that entries are directories
if f.startswith(prefix) and f.endswith('wikidump'):
wikidir = f
break # stop searching, do not explore subdirectories
prefix = wikidir.split('-wikidump')[0]
finished = False
if started and wikidir and prefix:
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
@ -107,7 +103,7 @@ def main():
finished = True
# You can also issue this on your working directory to find all incomplete dumps:
# tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"
#compress
if finished:
time.sleep(1)

@ -3048,7 +3048,7 @@ http://vai.uibk.ac.at/dadp/doku.php
http://vak.ru/doku.php
http://val.bmstu.ru/dokuwiki/doku.php
http://valk.mave.jp/doku.php
http://vancouver.hackspace.ca/doku.php
http://vanhack.ca/doku.php
http://vanets.vuse.vanderbilt.edu/dokuwiki/doku.php
http://vaslor.net/doku.php
http://vbraun.name/cms/doku.php
@ -4957,7 +4957,6 @@ http://www.minkhollow.ca/becker/doku.php
http://www.minkhollow.ca/mhf/doku.php
http://www.minkhollow.ca/MHF/doku.php
http://www.minkhollow.ca/Thesis07/doku.php
http://www.mirkosertic.de/doku.php
http://www.mirmer.su/wiki/doku.php
http://www.mixshare.com/wiki/doku.php
http://www.mixxx.org/wiki/doku.php

File diff suppressed because it is too large

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2014 WikiTeam developers
# Copyright (C) 2014-2017 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -26,9 +26,10 @@ def main():
url = 'https://meta.miraheze.org/wiki/Special:SiteMatrix'
r = requests.get(url, headers=headers)
raw = r.text
m = re.findall(ur'<tr><td><a href="https://([^>]+?)/">[^<]+</a></td></tr>', raw)
m = re.findall(ur'<tr><td>(<del>)?<a href="https://([^>]+?)/">[^<]+</a>', raw)
m.sort()
for i in m:
print 'https://' + i + '/w/api.php'
print 'https://' + i[1] + '/w/api.php'
if __name__ == '__main__':
main()

File diff suppressed because it is too large

@ -1,5 +1,5 @@
Wikifarm: https://meta.miraheze.org/wiki/Miraheze
Last update: 2015-09-29
Last update: 2017-06-30
Details:

@ -1,7 +1,7 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2014 WikiTeam developers
# Copyright (C) 2014-2017 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -27,6 +27,7 @@ def main():
r = requests.get(url, headers=headers)
raw = r.text
m = re.findall(ur'<li><a href=\'([^>]+?)/wiki/\'>', raw)
m.sort()
for i in m:
print i + '/w/api.php'

@ -2,8 +2,6 @@ http://24.neoseeker.com/w/api.php
http://aceattorney.neoseeker.com/w/api.php
http://advancewars.neoseeker.com/w/api.php
http://adventuretime.neoseeker.com/w/api.php
http://alanwake.neoseeker.com/w/api.php
http://alienbreed.neoseeker.com/w/api.php
http://animalcrossing.neoseeker.com/w/api.php
http://attackontitan.neoseeker.com/w/api.php
http://avatar.neoseeker.com/w/api.php
@ -17,9 +15,9 @@ http://boktai.neoseeker.com/w/api.php
http://bond.neoseeker.com/w/api.php
http://borderlands.neoseeker.com/w/api.php
http://boundbyflame.neoseeker.com/w/api.php
http://bravely.neoseeker.com/w/api.php
http://breathoffire.neoseeker.com/w/api.php
http://brink.neoseeker.com/w/api.php
http://bulletstorm.neoseeker.com/w/api.php
http://callofduty.neoseeker.com/w/api.php
http://castlecrashers.neoseeker.com/w/api.php
http://castlevania.neoseeker.com/w/api.php
@ -35,13 +33,10 @@ http://danganronpa.neoseeker.com/w/api.php
http://darksouls.neoseeker.com/w/api.php
http://deadisland.neoseeker.com/w/api.php
http://deadoralive.neoseeker.com/w/api.php
http://deadspace.neoseeker.com/w/api.php
http://deathnote.neoseeker.com/w/api.php
http://demonssouls.neoseeker.com/w/api.php
http://destiny.neoseeker.com/w/api.php
http://deusex.neoseeker.com/w/api.php
http://devilmaycry.neoseeker.com/w/api.php
http://diablo3.neoseeker.com/w/api.php
http://digimon.neoseeker.com/w/api.php
http://disgaea.neoseeker.com/w/api.php
http://doctorwho.neoseeker.com/w/api.php
@ -57,21 +52,17 @@ http://dynastywarriors.neoseeker.com/w/api.php
http://elderscrolls.neoseeker.com/w/api.php
http://endlessocean.neoseeker.com/w/api.php
http://evangelion.neoseeker.com/w/api.php
http://eveonline.neoseeker.com/w/api.php
http://fable.neoseeker.com/w/api.php
http://fairytail.neoseeker.com/w/api.php
http://fallout4.neoseeker.com/w/api.php
http://fallout.neoseeker.com/w/api.php
http://fallout4.neoseeker.com/w/api.php
http://familyguy.neoseeker.com/w/api.php
http://farcry.neoseeker.com/w/api.php
http://fatalfury.neoseeker.com/w/api.php
http://fifa.neoseeker.com/w/api.php
http://finalfantasy.neoseeker.com/w/api.php
http://fireemblem.neoseeker.com/w/api.php
http://footballmanager.neoseeker.com/w/api.php
http://formula1.neoseeker.com/w/api.php
http://forza.neoseeker.com/w/api.php
http://friends.neoseeker.com/w/api.php
http://fullmetalalchemist.neoseeker.com/w/api.php
http://futurama.neoseeker.com/w/api.php
http://fzero.neoseeker.com/w/api.php
@ -81,11 +72,9 @@ http://glee.neoseeker.com/w/api.php
http://godofwar.neoseeker.com/w/api.php
http://goldensun.neoseeker.com/w/api.php
http://granturismo.neoseeker.com/w/api.php
http://greysanatomy.neoseeker.com/w/api.php
http://growlanser.neoseeker.com/w/api.php
http://gta5.neoseeker.com/w/api.php
http://gta.neoseeker.com/w/api.php
http://guildwars2.neoseeker.com/w/api.php
http://gta5.neoseeker.com/w/api.php
http://guildwars.neoseeker.com/w/api.php
http://guitarhero.neoseeker.com/w/api.php
http://gundam.neoseeker.com/w/api.php
@ -106,7 +95,6 @@ http://inuyasha.neoseeker.com/w/api.php
http://jakdaxter.neoseeker.com/w/api.php
http://kairosoft.neoseeker.com/w/api.php
http://kidicarus.neoseeker.com/w/api.php
http://kingdomcome.neoseeker.com/w/api.php
http://kingdomhearts.neoseeker.com/w/api.php
http://kirby.neoseeker.com/w/api.php
http://knack.neoseeker.com/w/api.php
@ -115,8 +103,6 @@ http://layton.neoseeker.com/w/api.php
http://leagueoflegends.neoseeker.com/w/api.php
http://legendofdragoon.neoseeker.com/w/api.php
http://littlebigplanet.neoseeker.com/w/api.php
http://lmamanager.neoseeker.com/w/api.php
http://lordsofthefallen.neoseeker.com/w/api.php
http://lotr.neoseeker.com/w/api.php
http://mafia.neoseeker.com/w/api.php
http://magicalstarsign.neoseeker.com/w/api.php
@ -128,7 +114,6 @@ http://megaman.neoseeker.com/w/api.php
http://megamitensei.neoseeker.com/w/api.php
http://metalgear.neoseeker.com/w/api.php
http://metroid.neoseeker.com/w/api.php
http://mightandmagic.neoseeker.com/w/api.php
http://minecraft.neoseeker.com/w/api.php
http://monsterhunter.neoseeker.com/w/api.php
http://mortalkombat.neoseeker.com/w/api.php
@ -140,7 +125,6 @@ http://ncis.neoseeker.com/w/api.php
http://needforspeed.neoseeker.com/w/api.php
http://ninjagaiden.neoseeker.com/w/api.php
http://ninokuni.neoseeker.com/w/api.php
http://nintendogs.neoseeker.com/w/api.php
http://okami.neoseeker.com/w/api.php
http://onepiece.neoseeker.com/w/api.php
http://persona.neoseeker.com/w/api.php
@ -160,14 +144,12 @@ http://rockband.neoseeker.com/w/api.php
http://rpgmaker.neoseeker.com/w/api.php
http://runefactory.neoseeker.com/w/api.php
http://runescape.neoseeker.com/w/api.php
http://runesofmagic.neoseeker.com/w/api.php
http://sandbox.neoseeker.com/w/api.php
http://scottpilgrim.neoseeker.com/w/api.php
http://scrapmetal.neoseeker.com/w/api.php
http://scribblenauts.neoseeker.com/w/api.php
http://shadowofthecolossus.neoseeker.com/w/api.php
http://shadowrunreturns.neoseeker.com/w/api.php
http://shank.neoseeker.com/w/api.php
http://shenmue.neoseeker.com/w/api.php
http://simpsons.neoseeker.com/w/api.php
http://skate.neoseeker.com/w/api.php
@ -183,7 +165,6 @@ http://southpark.neoseeker.com/w/api.php
http://spiderman.neoseeker.com/w/api.php
http://spongebob.neoseeker.com/w/api.php
http://spyro.neoseeker.com/w/api.php
http://starbound.neoseeker.com/w/api.php
http://starcraft.neoseeker.com/w/api.php
http://starfox.neoseeker.com/w/api.php
http://stargate.neoseeker.com/w/api.php
@ -196,9 +177,7 @@ http://tales.neoseeker.com/w/api.php
http://tekken.neoseeker.com/w/api.php
http://terraria.neoseeker.com/w/api.php
http://thedarkness.neoseeker.com/w/api.php
http://thedivision.neoseeker.com/w/api.php
http://thelastofus.neoseeker.com/w/api.php
http://theorder.neoseeker.com/w/api.php
http://thesecretworld.neoseeker.com/w/api.php
http://thesims.neoseeker.com/w/api.php
http://thewarriors.neoseeker.com/w/api.php
@ -206,9 +185,7 @@ http://theworldendswithyou.neoseeker.com/w/api.php
http://thief.neoseeker.com/w/api.php
http://timesplitters.neoseeker.com/w/api.php
http://tonyhawk.neoseeker.com/w/api.php
http://torchlight2.neoseeker.com/w/api.php
http://toriko.neoseeker.com/w/api.php
http://transformers.neoseeker.com/w/api.php
http://twilight.neoseeker.com/w/api.php
http://twistedmetal.neoseeker.com/w/api.php
http://uncharted.neoseeker.com/w/api.php
@ -217,12 +194,9 @@ http://vivapinata.neoseeker.com/w/api.php
http://wakfu.neoseeker.com/w/api.php
http://warcraft.neoseeker.com/w/api.php
http://warhammer.neoseeker.com/w/api.php
http://wasteland2.neoseeker.com/w/api.php
http://watchdogs.neoseeker.com/w/api.php
http://whiteknightchronicles.neoseeker.com/w/api.php
http://wikiguides.neoseeker.com/w/api.php
http://witcher3.neoseeker.com/w/api.php
http://worldoftanks.neoseeker.com/w/api.php
http://wow.neoseeker.com/w/api.php
http://xenoblade.neoseeker.com/w/api.php
http://yugioh.neoseeker.com/w/api.php

@ -1,5 +1,5 @@
Wikifarm: http://neowiki.neoseeker.com/wiki/Main_Page
Last update: 2015-10-07
Last update: 2017-06-30
Details:

File diff suppressed because it is too large

@ -23,7 +23,7 @@ import subprocess
import re
from wikitools import wiki, api
def getlist(wikia, wkfrom = 1, wkto = 1000):
def getlist(wikia, wkfrom = 1, wkto = 100):
params = {'action': 'query', 'list': 'wkdomains', 'wkactive': '1', 'wkfrom': wkfrom, 'wkto': wkto,}
request = api.APIRequest(wikia, params)
return request.query()['query']['wkdomains']
@ -31,8 +31,9 @@ def getlist(wikia, wkfrom = 1, wkto = 1000):
def getall():
wikia = wiki.Wiki('http://community.wikia.com/api.php')
offset = 0
limit = 1000
limit = 100
domains = {}
empty = 0
# This API module has no query continuation facility
print 'Getting list of active domains...'
while True:
@ -40,13 +41,21 @@ def getall():
if list:
print offset
domains = dict(domains.items() + list.items() )
offset += 1000
empty = 0
else:
empty += 1
offset += limit
if empty > 100:
# Hopefully we don't have more than 10k wikis deleted in a row
break
return domains
def main():
domains = getall()
with open('wikia.com', 'w') as out:
out.write('\n'.join(str(domains[i]['domain']) for i in domains))
undumped = []
# Or we could iterate over each sublist while we get it?
for i in domains:
@ -55,21 +64,21 @@ def main():
print dbname
first = dbname[0]
# There are one-letter dbnames; the second letter is replaced by an underscore
# http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.gz
# http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.7z
try:
second = dbname[1]
except:
second = '_'
base = 'http://s3.amazonaws.com/wikia_xml_dumps/' + first + '/' \
+ first + second + '/' + dbname
full = base + '_pages_full.xml.gz'
full = base + '_pages_full.xml.7z'
print full
current = base + '_pages_current.xml.gz'
current = base + '_pages_current.xml.7z'
images = base + '_images.tar'
try:
#subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
# Use this instead, and comment out the next try, to only list.
subprocess.check_call(['curl', '-I', '--fail', full])
subprocess.call(['curl', '-I', '--fail', full])
except subprocess.CalledProcessError as e:
# We added --fail for this https://superuser.com/a/854102/283120
if e.returncode == 22:
@ -81,7 +90,9 @@ def main():
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images])
#except:
# pass
print '\n'.join(str(dump) for dump in undumped)
with open('wikia.com-unarchived', 'w+') as out:
out.write('\n'.join(str(domain) for domain in undumped))
if __name__ == '__main__':
main()
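# Editor's note, illustrative only: the S3 layout built above keys each dump on
# the first letter and first two letters of the dbname. For a hypothetical
# dbname 'muppet' the URLs checked would be:
#   http://s3.amazonaws.com/wikia_xml_dumps/m/mu/muppet_pages_full.xml.7z
#   http://s3.amazonaws.com/wikia_xml_dumps/m/mu/muppet_pages_current.xml.7z
#   http://s3.amazonaws.com/wikia_xml_dumps/m/mu/muppet_images.tar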

@ -0,0 +1,61 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.1')]
urllib.request.install_opener(opener)
wikis = []
for i in range(1, 100000):
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikidot.com' % (random.randint(100, 5000), random.randint(1000, 9999))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikidot\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikidot-duckduckgo.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https?://www\.', 'http://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

@ -0,0 +1,65 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
wikis = []
with open('wikidot-spider.txt', 'r') as f:
wikis = f.read().strip().splitlines()
for i in range(1, 1000000):
url = random.choice(wikis)
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikidot\.com)', html)
for wiki in m:
wiki = 'http://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikidot-spider.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https?://www\.', 'http://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(1,5)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

@ -0,0 +1,871 @@
http://007.wikidot.com
http://025002.wikidot.com
http://05centraal.wikidot.com
http://05command-ja.wikidot.com
http://05command.wikidot.com
http://05zentrale.wikidot.com
http://101.wikidot.com
http://16thfleet.wikidot.com
http://2012hoax.wikidot.com
http://56wrtg1150.wikidot.com
http://5edndwiki.wikidot.com
http://E-H-S.wikidot.com
http://F90in15Minutes.wikidot.com
http://Health-Matters.wikidot.com
http://Herbis.wikidot.com
http://INCL.wikidot.com
http://a4art.wikidot.com
http://abarrelfull.wikidot.com
http://academicwriting.wikidot.com
http://ad3.wikidot.com
http://admindevelopement.wikidot.com
http://advent-ro.wikidot.com
http://adventuresintherealms.wikidot.com
http://aepassociation.wikidot.com
http://aepsandbox.wikidot.com
http://afterthecomet-v2.wikidot.com
http://ageofascension.wikidot.com
http://ageofheroesmux.wikidot.com
http://airchairbuild.wikidot.com
http://albums-template.wikidot.com
http://alfamedia.wikidot.com
http://algadon.wikidot.com
http://alicebot.wikidot.com
http://alveslima-edu.wikidot.com
http://amawal.wikidot.com
http://amen.wikidot.com
http://amiii.wikidot.com
http://analyticscamp.wikidot.com
http://android0.wikidot.com
http://androidalchemy.wikidot.com
http://angarmegia-creadores.wikidot.com
http://angarmegia-publicaciones.wikidot.com
http://angarmegia-valores.wikidot.com
http://angarmegia.wikidot.com
http://angry-mage-games.wikidot.com
http://anime-planet.wikidot.com
http://apmoderneuro.wikidot.com
http://applebyandwyman.wikidot.com
http://aprendiendo.wikidot.com
http://aq-3d.wikidot.com
http://aqw-swf.wikidot.com
http://aqwwiki.wikidot.com
http://arcana.wikidot.com
http://arcblade.wikidot.com
http://artemachia.wikidot.com
http://artniyet.wikidot.com
http://asen.wikidot.com
http://asoh.wikidot.com
http://aspnet.wikidot.com
http://astrobhadauria.wikidot.com
http://astrobhadauria1414.wikidot.com
http://astroveda.wikidot.com
http://astroyogas.wikidot.com
http://asu-csf.wikidot.com
http://audioprodukcja.wikidot.com
http://avendar.wikidot.com
http://aviationknowledge.wikidot.com
http://avoidglow.wikidot.com
http://azentia.wikidot.com
http://babel-template.wikidot.com
http://backpharma.wikidot.com
http://backupstorage.wikidot.com
http://badwebcomics.wikidot.com
http://balchipedia.wikidot.com
http://barakus.wikidot.com
http://battlestargenesis.wikidot.com
http://bcp.wikidot.com
http://beadersresourceguide.wikidot.com
http://beargod.wikidot.com
http://benitachell-bowls-club.wikidot.com
http://bhg.wikidot.com
http://bibles.wikidot.com
http://bilbreyapwh.wikidot.com
http://biol-117.wikidot.com
http://biol252-biol319.wikidot.com
http://bioproject.wikidot.com
http://bisgmit.wikidot.com
http://blackbelt.wikidot.com
http://blackberrystorm.wikidot.com
http://blackmarches.wikidot.com
http://blank-template.wikidot.com
http://bleachitp.wikidot.com
http://blender0.wikidot.com
http://blender1.wikidot.com
http://blmodding.wikidot.com
http://blog-template.wikidot.com
http://blog.wikidot.com
http://blogs-template.wikidot.com
http://bloodborne.wikidot.com
http://bni-ine.wikidot.com
http://book-template.wikidot.com
http://booriley.wikidot.com
http://bootstrap-playground.wikidot.com
http://borderlands.wikidot.com
http://borradores-insurgencia-del-caos.wikidot.com
http://borradores-scp-es.wikidot.com
http://bozic-nation.wikidot.com
http://brmehta12.wikidot.com
http://brtff.wikidot.com
http://brydz.wikidot.com
http://bua581.wikidot.com
http://bua581beerworks.wikidot.com
http://bua581hallelibraryfinalproject.wikidot.com
http://bugs-template.wikidot.com
http://bugs.wikidot.com
http://burntlands.wikidot.com
http://bvs.wikidot.com
http://bx-community.wikidot.com
http://bzhlab.wikidot.com
http://c4fsharp.wikidot.com
http://calu.wikidot.com
http://campusconfidential.wikidot.com
http://cancer-control.wikidot.com
http://caosinsurgente.wikidot.com
http://carpenoctemstaff.wikidot.com
http://castleage.wikidot.com
http://caughtnotsleeping.wikidot.com
http://ccckmit.wikidot.com
http://ccpd.wikidot.com
http://cctest.wikidot.com
http://ccyms.wikidot.com
http://ccymsevangelization.wikidot.com
http://ccymsfoundations.wikidot.com
http://ccymsjustice.wikidot.com
http://ccymslounge.wikidot.com
http://ccymspastoral.wikidot.com
http://ccymspractices.wikidot.com
http://ccymsprayer.wikidot.com
http://ccymsprinciples.wikidot.com
http://ccymsskills.wikidot.com
http://ccymsstudents.wikidot.com
http://cdaworldhistory.wikidot.com
http://cellworld.wikidot.com
http://celtic-heroes.wikidot.com
http://cf-vanguard.wikidot.com
http://cgp.wikidot.com
http://chaoscomplexityineducation.wikidot.com
http://chat-template.wikidot.com
http://chatroom.wikidot.com
http://chavezbraintrust.wikidot.com
http://chcc.wikidot.com
http://chessvariants.wikidot.com
http://chimiex-bicaz.wikidot.com
http://ci-sandbox.wikidot.com
http://ci-visualdocuments.wikidot.com
http://ci-wiki.wikidot.com
http://circservices.wikidot.com
http://ciscotr.wikidot.com
http://cityofangels.wikidot.com
http://cleanias.wikidot.com
http://cmbeta.wikidot.com
http://coffeetime.wikidot.com
http://coffeetimex.wikidot.com
http://colbycriminaljustice.wikidot.com
http://columbiacity.wikidot.com
http://comando05.wikidot.com
http://comando05ptbr.wikidot.com
http://commandement-alpha.wikidot.com
http://commandemento5.wikidot.com
http://communicity.wikidot.com
http://communicity2010.wikidot.com
http://community-playground.wikidot.com
http://community.wikidot.com
http://computer0.wikidot.com
http://comux.wikidot.com
http://connorscampaigns.wikidot.com
http://connorscentral.wikidot.com
http://connorsgmnotes.wikidot.com
http://connorssettings.wikidot.com
http://consumerpsych2009.wikidot.com
http://convert.wikidot.com
http://copernicon.wikidot.com
http://corvidcollege.wikidot.com
http://corwyn.wikidot.com
http://cpp-wiki.wikidot.com
http://cquniversity.wikidot.com
http://crashfeverwikitw.wikidot.com
http://crimjobs2010-2011.wikidot.com
http://crm-iseg.wikidot.com
http://crm-template.wikidot.com
http://crosswindsgarou.wikidot.com
http://crypsis-net.wikidot.com
http://cs0.wikidot.com
http://cs1.wikidot.com
http://cs101c.wikidot.com
http://cs124project-2009.wikidot.com
http://csc180.wikidot.com
http://csi.wikidot.com
http://css-competition.wikidot.com
http://css-sandbox.wikidot.com
http://css.wikidot.com
http://css3.wikidot.com
http://css3themes.wikidot.com
http://cst133a.wikidot.com
http://ctwiki.wikidot.com
http://cuarteldelo5.wikidot.com
http://cubesat.wikidot.com
http://cuiltheory.wikidot.com
http://cunefa2.wikidot.com
http://cunefb2.wikidot.com
http://cunefc2.wikidot.com
http://cunefe2.wikidot.com
http://cyclods.wikidot.com
http://daeren.wikidot.com
http://darksouls.wikidot.com
http://darksouls2.wikidot.com
http://darksouls3.wikidot.com
http://dawnofanewage.wikidot.com
http://dcernst-teaching.wikidot.com
http://dcernst.wikidot.com
http://ddscat.wikidot.com
http://defa.wikidot.com
http://default-template.wikidot.com
http://defunct-elitequestworlds.wikidot.com
http://demonssouls.wikidot.com
http://denver.wikidot.com
http://desenvolvimentodejogos.wikidot.com
http://design-illustration.wikidot.com
http://destiny.wikidot.com
http://detailed-customer-management.wikidot.com
http://dndis.wikidot.com
http://docpl.wikidot.com
http://dokument-uz.wikidot.com
http://dotflow.wikidot.com
http://downsfolk.wikidot.com
http://dowodztwo.wikidot.com
http://dragon-trees.wikidot.com
http://dreamprogram.wikidot.com
http://dreamteam.wikidot.com
http://dresdenfiles.wikidot.com
http://ds09.wikidot.com
http://ds10.wikidot.com
http://ds2009a.wikidot.com
http://ds2010a.wikidot.com
http://dwd.wikidot.com
http://e-h-s.wikidot.com
http://earlychildhood.wikidot.com
http://eberronunlimited.wikidot.com
http://ecadmin.wikidot.com
http://ecctimeline.wikidot.com
http://echobazaar.wikidot.com
http://ecomind.wikidot.com
http://editor.wikidot.com
http://editora.wikidot.com
http://edmw.wikidot.com
http://educ400-401.wikidot.com
http://education-template.wikidot.com
http://efepereth.wikidot.com
http://eime.wikidot.com
http://eitriggcrafting.wikidot.com
http://ejs-in-india.wikidot.com
http://eldritch00.wikidot.com
http://elishapeterson.wikidot.com
http://elsirvale.wikidot.com
http://elunesjustice.wikidot.com
http://emchina2010.wikidot.com
http://enchantedbros.wikidot.com
http://encyclowiki.wikidot.com
http://energyclub.wikidot.com
http://energyclub4samvedna.wikidot.com
http://energyfuture.wikidot.com
http://eng1d1.wikidot.com
http://eng270.wikidot.com
http://epimreth.wikidot.com
http://epitome.wikidot.com
http://esperanto.wikidot.com
http://estudianteseconomiauned.wikidot.com
http://eventidemush.wikidot.com
http://everydaymagicalgirls.wikidot.com
http://evilhat.wikidot.com
http://execs.wikidot.com
http://exploringsciencewiki.wikidot.com
http://extrabees.wikidot.com
http://f650cs.wikidot.com
http://fairfieldproject.wikidot.com
http://falchionvalley.wikidot.com
http://fallout2online.wikidot.com
http://faq.wikidot.com
http://fearschemistry.wikidot.com
http://fed20.wikidot.com
http://feedback-template.wikidot.com
http://feedback.wikidot.com
http://fifa360.wikidot.com
http://fifabeapro360.wikidot.com
http://fightcorruption.wikidot.com
http://figmentregistry.wikidot.com
http://fillionempire.wikidot.com
http://finalfantasy14fr.wikidot.com
http://first-steps.wikidot.com
http://flyclear.wikidot.com
http://fmi.wikidot.com
http://fmiseria3.wikidot.com
http://fondationscp.wikidot.com
http://fondationscpsandbox.wikidot.com
http://fondazionescp.wikidot.com
http://fortean.wikidot.com
http://forum-template.wikidot.com
http://forum.wikidot.com
http://fourthwallgames.wikidot.com
http://fpt.wikidot.com
http://freevoddler.wikidot.com
http://fretsonfire.wikidot.com
http://futaba8fg.wikidot.com
http://gagetowngaming.wikidot.com
http://galacticunity.wikidot.com
http://game-maker.wikidot.com
http://gamedesign.wikidot.com
http://gamemaker.wikidot.com
http://gasbags.wikidot.com
http://gd28.wikidot.com
http://gdnd.wikidot.com
http://gdt2009.wikidot.com
http://gear-sandbox.wikidot.com
http://geararc.wikidot.com
http://genderbinary.wikidot.com
http://generals.wikidot.com
http://ginnungagap.wikidot.com
http://globalseminarhealth.wikidot.com
http://goddardtech.wikidot.com
http://gorszy.wikidot.com
http://greatestfilipino.wikidot.com
http://green-house.wikidot.com
http://guitarzero.wikidot.com
http://gurpswiki.wikidot.com
http://h205.wikidot.com
http://hackersderede.wikidot.com
http://halfmoonbay.wikidot.com
http://hammer-template.wikidot.com
http://handbook.wikidot.com
http://harvey-capital-lectures.wikidot.com
http://health-matters.wikidot.com
http://herbis.wikidot.com
http://heroes.wikidot.com
http://heroesmush.wikidot.com
http://heroesofalvena.wikidot.com
http://heroessincity.wikidot.com
http://hestia.wikidot.com
http://hfwiki.wikidot.com
http://hiddenprojectwiki.wikidot.com
http://himetop.wikidot.com
http://historynewmedia.wikidot.com
http://hkcentral.wikidot.com
http://hogwarts2092.wikidot.com
http://hopkinswhpg.wikidot.com
http://housegames.wikidot.com
http://hp-intothefire.wikidot.com
http://hrpg.wikidot.com
http://hscwizards.wikidot.com
http://hswiki.wikidot.com
http://html50.wikidot.com
http://iaac-readings.wikidot.com
http://iatkos.wikidot.com
http://ibhistory.wikidot.com
http://ibi-apedia.wikidot.com
http://ibiz.wikidot.com
http://ibmathstuff.wikidot.com
http://ibphysicsstuff.wikidot.com
http://ibstuffqa.wikidot.com
http://iceal.wikidot.com
http://idrumaaps.wikidot.com
http://ifs.wikidot.com
http://igen.wikidot.com
http://igor.wikidot.com
http://imocamp.wikidot.com
http://incl.wikidot.com
http://inctr-news.wikidot.com
http://inctr-palliative-care-handbook.wikidot.com
http://inctr.wikidot.com
http://indexhibit.wikidot.com
http://insomniacramblings.wikidot.com
http://installer.wikidot.com
http://insurrection-du-chaos-sandbox.wikidot.com
http://insurrection-du-chaos.wikidot.com
http://inter-irc.wikidot.com
http://internationalbatesoninstitute.wikidot.com
http://internetior.wikidot.com
http://involo.wikidot.com
http://ipr10.wikidot.com
http://ipr11.wikidot.com
http://ipr12.wikidot.com
http://iracing.wikidot.com
http://irc.wikidot.com
http://irongiant.wikidot.com
http://irunath.wikidot.com
http://is2216.wikidot.com
http://ischool.wikidot.com
http://isocentre.wikidot.com
http://issuetracker-template.wikidot.com
http://istar.wikidot.com
http://istb-winter2010.wikidot.com
http://istep-sandbox.wikidot.com
http://itb322uap.wikidot.com
http://ivm.wikidot.com
http://jakilinux.wikidot.com
http://java.wikidot.com
http://jayashree.wikidot.com
http://jccict.wikidot.com
http://johnmerritt.wikidot.com
http://join.wikidot.com
http://jquery-easyui.wikidot.com
http://jslibrary.wikidot.com
http://jsukfpsd.wikidot.com
http://kalgati.wikidot.com
http://kannadanudi.wikidot.com
http://karma-lab.wikidot.com
http://kdiprivateequity.wikidot.com
http://keramik.wikidot.com
http://kf59.wikidot.com
http://kfmapdb.wikidot.com
http://khaidoan.wikidot.com
http://kharon.wikidot.com
http://kindiy.wikidot.com
http://kingsway.wikidot.com
http://kingswayeap.wikidot.com
http://kingswayelem.wikidot.com
http://kingswayielts.wikidot.com
http://kingswayint.wikidot.com
http://kingswaypreint.wikidot.com
http://kingswayupper.wikidot.com
http://klps.wikidot.com
http://kmhouse.wikidot.com
http://kmk.wikidot.com
http://knightswrite.wikidot.com
http://kodo.wikidot.com
http://koty.wikidot.com
http://ksemoudania.wikidot.com
http://ladyhood66.wikidot.com
http://lafundacionscp.wikidot.com
http://languagearts8.wikidot.com
http://lapidaria.wikidot.com
http://lasthaiku.wikidot.com
http://latindictionary.wikidot.com
http://latmari.wikidot.com
http://leplouc.wikidot.com
http://lepszy.wikidot.com
http://level1wiki.wikidot.com
http://libevents.wikidot.com
http://liblivadia.wikidot.com
http://librarylab.wikidot.com
http://lightworks.wikidot.com
http://linux0.wikidot.com
http://livesupport.wikidot.com
http://lmtoelf.wikidot.com
http://loosepages.wikidot.com
http://ltt.wikidot.com
http://lulu.wikidot.com
http://m5snapoli.wikidot.com
http://ma4140.wikidot.com
http://machines-history.wikidot.com
http://machinima138.wikidot.com
http://mactutorial.wikidot.com
http://maegica.wikidot.com
http://magiamesterei.wikidot.com
http://mainframes.wikidot.com
http://majjhima.wikidot.com
http://makeyourbot.wikidot.com
http://malkavian.wikidot.com
http://managerzonemexico.wikidot.com
http://maratona.wikidot.com
http://marblehornets.wikidot.com
http://margopedia.wikidot.com
http://marketplace-template.wikidot.com
http://marvelreborn.wikidot.com
http://marvelrevolution.wikidot.com
http://masonic.wikidot.com
http://math453fall2008.wikidot.com
http://mathaerobics4samvedna.wikidot.com
http://mathonline.wikidot.com
http://mathroughguides.wikidot.com
http://mbitcoin.wikidot.com
http://mc-21.wikidot.com
http://mcdt25e.wikidot.com
http://me1065.wikidot.com
http://measurementcamp.wikidot.com
http://media.wikidot.com
http://miedzymorze.wikidot.com
http://minahaplo.wikidot.com
http://mis213-2.wikidot.com
http://mk2k.wikidot.com
http://mkworld.wikidot.com
http://mnprek-3.wikidot.com
http://monacobayweyr.wikidot.com
http://monobook-template.wikidot.com
http://monobook.wikidot.com
http://monodot-template.wikidot.com
http://morningside-genetics.wikidot.com
http://morningsidemicro.wikidot.com
http://morphopedics.wikidot.com
http://mpm.wikidot.com
http://mukesh381.wikidot.com
http://multiverse-crisis.wikidot.com
http://musicgames.wikidot.com
http://my-pride.wikidot.com
http://mybookworld.wikidot.com
http://myslimchatroom.wikidot.com
http://myvineyard.wikidot.com
http://nanorodsa.wikidot.com
http://nanorodthermo.wikidot.com
http://narutoitp.wikidot.com
http://narutomushrivalry.wikidot.com
http://nauticoamager.wikidot.com
http://neo-dimension.wikidot.com
http://neosteam.wikidot.com
http://neozone.wikidot.com
http://newapprequirements.wikidot.com
http://news.wikidot.com
http://nightskysymbology.wikidot.com
http://nimin.wikidot.com
http://ninjaproxy.wikidot.com
http://nirn.wikidot.com
http://nnhs-science-restrictedaccess.wikidot.com
http://nnhs-science.wikidot.com
http://noblebeastwars.wikidot.com
http://nomyslamps.wikidot.com
http://norron.wikidot.com
http://notebook-template.wikidot.com
http://notebooks.wikidot.com
http://nre509.wikidot.com
http://nsb.wikidot.com
http://ntumed96.wikidot.com
http://nucularelectronics.wikidot.com
http://o5command-int.wikidot.com
http://o5command-th.wikidot.com
http://oblivionshard.wikidot.com
http://offtopicarium.wikidot.com
http://old-template.wikidot.com
http://oneeleventwentyten.wikidot.com
http://opend6.wikidot.com
http://opensource-template.wikidot.com
http://opensuse.wikidot.com
http://oppt-sa.wikidot.com
http://oregonamhi.wikidot.com
http://osx86.wikidot.com
http://oversoulgame.wikidot.com
http://ozradonc.wikidot.com
http://packages.wikidot.com
http://pagi.wikidot.com
http://pandora-saga.wikidot.com
http://papercraft.wikidot.com
http://paperworks.wikidot.com
http://paradiserpg.wikidot.com
http://paradoxhaze.wikidot.com
http://paralelo.wikidot.com
http://parented.wikidot.com
http://passatb5.wikidot.com
http://pathtogolarion.wikidot.com
http://patriot-box-office.wikidot.com
http://patterns.wikidot.com
http://pbbg.wikidot.com
http://pcg.wikidot.com
http://pcif.wikidot.com
http://pedhemoncreview.wikidot.com
http://perchelinux.wikidot.com
http://pernworld.wikidot.com
http://personal-template.wikidot.com
http://petition-template.wikidot.com
http://pfcuq.wikidot.com
http://pfseconddarkness.wikidot.com
http://phikappatau.wikidot.com
http://philosophia.wikidot.com
http://philosophiesoflife.wikidot.com
http://photo-gallery-template.wikidot.com
http://phylo.wikidot.com
http://pl.wikidot.com
http://playstation3hacksandmods.wikidot.com
http://pofomultiquiz.wikidot.com
http://pogon.wikidot.com
http://polls.wikidot.com
http://porphyrarpg.wikidot.com
http://porsche.wikidot.com
http://pottersarmy.wikidot.com
http://predev.wikidot.com
http://private-template.wikidot.com
http://processexcel.wikidot.com
http://professorallred.wikidot.com
http://profiles.wikidot.com
http://project-template.wikidot.com
http://projects.wikidot.com
http://ps3indexhelp.wikidot.com
http://psi-ppwg.wikidot.com
http://psms.wikidot.com
http://psrboregon.wikidot.com
http://psyc101.wikidot.com
http://psychjobsearch.wikidot.com
http://psychotronicsdivision.wikidot.com
http://pt851.wikidot.com
http://puddincupcss.wikidot.com
http://puppet.wikidot.com
http://pw7890o.wikidot.com
http://pylint-messages.wikidot.com
http://qttabbar.wikidot.com
http://quiat.wikidot.com
http://r.wikidot.com
http://radonc.wikidot.com
http://railgunitp.wikidot.com
http://ravenmarches.wikidot.com
http://realestate-template.wikidot.com
http://redirect-template.wikidot.com
http://redsite.wikidot.com
http://renegadesofpw.wikidot.com
http://reshme.wikidot.com
http://reskitchen.wikidot.com
http://retrolegends.wikidot.com
http://retrowiki.wikidot.com
http://reykjavikmanifesto.wikidot.com
http://rhetoricalgoddess.wikidot.com
http://rmitvnim2007b.wikidot.com
http://roadmap.wikidot.com
http://roboticsclubucla.wikidot.com
http://roboticspedia.wikidot.com
http://rock-xproject.wikidot.com
http://rtd1261.wikidot.com
http://rxwiki.wikidot.com
http://s7s.wikidot.com
http://sacwwiki.wikidot.com
http://salamander724.wikidot.com
http://saludintegral.wikidot.com
http://samvedna.wikidot.com
http://sandboxscpfr.wikidot.com
http://sasana.wikidot.com
http://sasi555.wikidot.com
http://savagetidewithfiretrolls.wikidot.com
http://scala.wikidot.com
http://schoolsteachersparents.wikidot.com
http://schrijven.wikidot.com
http://scienceonlinelondon.wikidot.com
http://scion-mmp.wikidot.com
http://scp-et.wikidot.com
http://scp-field-work.wikidot.com
http://scp-foundation-origins.wikidot.com
http://scp-he.wikidot.com
http://scp-hu.wikidot.com
http://scp-int-sandbox.wikidot.com
http://scp-int.wikidot.com
http://scp-international.wikidot.com
http://scp-jp-admin.wikidot.com
http://scp-jp-archive.wikidot.com
http://scp-jp-sandbox2.wikidot.com
http://scp-jp-sandbox3.wikidot.com
http://scp-jp.wikidot.com
http://scp-ko-15c.wikidot.com
http://scp-kr.wikidot.com
http://scp-la.wikidot.com
http://scp-nd.wikidot.com
http://scp-nl.wikidot.com
http://scp-pl-sandbox.wikidot.com
http://scp-pl.wikidot.com
http://scp-pt-br.wikidot.com
http://scp-pt.wikidot.com
http://scp-ru.wikidot.com
http://scp-sandbox-3.wikidot.com
http://scp-sandbox-la.wikidot.com
http://scp-spqr.wikidot.com
http://scp-template.wikidot.com
http://scp-th-sandbox.wikidot.com
http://scp-th.wikidot.com
http://scp-tw.wikidot.com
http://scp-ukrainian.wikidot.com
http://scp-un.wikidot.com
http://scp-vn.wikidot.com
http://scp-wiki-cn.wikidot.com
http://scp-wiki-de.wikidot.com
http://scp-wiki.wikidot.com
http://scpalex-fh.wikidot.com
http://scpclassic.wikidot.com
http://scpexplained.wikidot.com
http://scpjp-fansite.wikidot.com
http://scpkoreahq.wikidot.com
http://scpminecraft.wikidot.com
http://scpsandbox-jp.wikidot.com
http://scpsandbox-pl.wikidot.com
http://scpsandbox-ua.wikidot.com
http://scpsandbox2.wikidot.com
http://scpsandboxbr.wikidot.com
http://scpsandboxcn.wikidot.com
http://scpsandboxde.wikidot.com
http://scpsandboxit.wikidot.com
http://scpsandboxnl.wikidot.com
http://scpvakfi.wikidot.com
http://scpvakfisandbox.wikidot.com
http://scpvnsandbox.wikidot.com
http://scratch4samvedna.wikidot.com
http://serpents-hand.wikidot.com
http://sfi.wikidot.com
http://sfugamedev.wikidot.com
http://shadow4e.wikidot.com
http://sharecokecodes.wikidot.com
http://shop.wikidot.com
http://sicurezzapubblica.wikidot.com
http://sidowegraty.wikidot.com
http://signaturbogen.wikidot.com
http://siluria.wikidot.com
http://simtrackipedia.wikidot.com
http://sistdig.wikidot.com
http://siteclone.wikidot.com
http://sky852751.wikidot.com
http://skyangel.wikidot.com
http://slaythespire.wikidot.com
http://sliscomps.wikidot.com
http://slownik-geologiczny.wikidot.com
http://small-steps.wikidot.com
http://smofficer.wikidot.com
http://smsalgebra.wikidot.com
http://sniktbub.wikidot.com
http://snippets.wikidot.com
http://snow-template.wikidot.com
http://snowleopard.wikidot.com
http://sociatecture.wikidot.com
http://sociatectureblog.wikidot.com
http://socjobs.wikidot.com
http://socjobs2011.wikidot.com
http://soctech.wikidot.com
http://softwarecraftsmanship.wikidot.com
http://solariapedia.wikidot.com
http://solodarydar.wikidot.com
http://solpadeinehelp.wikidot.com
http://sortibrige.wikidot.com
http://soulslore.wikidot.com
http://soymilkls.wikidot.com
http://sp1.wikidot.com
http://spambotdeathwall.wikidot.com
http://sparks.wikidot.com
http://sped.wikidot.com
http://splinterverse.wikidot.com
http://spolecznosc.wikidot.com
http://srm.wikidot.com
http://st-phelpers.wikidot.com
http://stallmanism.wikidot.com
http://standard-template.wikidot.com
http://starwarsmadness.wikidot.com
http://static.wikidot.com
http://steelandstone.wikidot.com
http://storychip.wikidot.com
http://string-theory.wikidot.com
http://studiocomments.wikidot.com
http://studiolynn.wikidot.com
http://suffadv.wikidot.com
http://summer350.wikidot.com
http://summerisle.wikidot.com
http://sunnybrook-academy.wikidot.com
http://superjet.wikidot.com
http://surreal64ce.wikidot.com
http://sw-gis.wikidot.com
http://swietomuzyki.wikidot.com
http://swwotc.wikidot.com
http://talesofhonor.wikidot.com
http://talkingpadproject.wikidot.com
http://task-management.wikidot.com
http://tasker.wikidot.com
http://tauren.wikidot.com
http://tech-racingcars.wikidot.com
http://techblog-template.wikidot.com
http://techcomm.wikidot.com
http://ten-sb.wikidot.com
http://terrasdeportugal.wikidot.com
http://tex.wikidot.com
http://textanalytics.wikidot.com
http://the-nexus.wikidot.com
http://theanarchstate.wikidot.com
http://theblightedworld.wikidot.com
http://thecollaboratory.wikidot.com
http://thegamerdome.wikidot.com
http://thekingkillerchronicle.wikidot.com
http://thelaststory.wikidot.com
http://themes.wikidot.com
http://thep-serc.wikidot.com
http://therafim.wikidot.com
http://therafimrpg.wikidot.com
http://thesimsonline.wikidot.com
http://theskyremains.wikidot.com
http://theunforgotten.wikidot.com
http://thewake.wikidot.com
http://theweird.wikidot.com
http://theweirdwest.wikidot.com
http://ti-iseg-t12.wikidot.com
http://ti-iseg-t19.wikidot.com
http://tibasicdev.wikidot.com
http://timidgirls.wikidot.com
http://tlug.wikidot.com
http://tlumaczenia.wikidot.com
http://tmduc.wikidot.com
http://tradewithsaint.wikidot.com
http://translate.wikidot.com
http://translators-forum.wikidot.com
http://trb-mux.wikidot.com
http://triathematician.wikidot.com
http://trueblood-dallas.wikidot.com
http://try.wikidot.com
http://ttu-dom.wikidot.com
http://tyf.wikidot.com
http://typesets.wikidot.com
http://ubmedicinefaqs.wikidot.com
http://ucsdgrads.wikidot.com
http://ukcw.wikidot.com
http://ultimatemutantsofgagetown.wikidot.com
http://umassenglishgrad.wikidot.com
http://uml.wikidot.com
http://underworldlarp.wikidot.com
http://uniofbeds.wikidot.com
http://urbanmobile.wikidot.com
http://uscta.wikidot.com
http://user-gemeinschaft.wikidot.com
http://usma387.wikidot.com
http://valeofcallus.wikidot.com
http://veritasbatheo.wikidot.com
http://videoart.wikidot.com
http://viotikoskosmos.wikidot.com
http://virtualwargamer.wikidot.com
http://viscomclass.wikidot.com
http://visual-records.wikidot.com
http://vitalusers.wikidot.com
http://vocaro.wikidot.com
http://vs-tcg.wikidot.com
http://vtls-vital.wikidot.com
http://vusb.wikidot.com
http://vwinterop.wikidot.com
http://vyprmedia.wikidot.com
http://w24.wikidot.com
http://wanderers-library-ko.wikidot.com
http://wanderers-library.wikidot.com
http://wanderers-sandbox.wikidot.com
http://warsztatywww.wikidot.com
http://web0.wikidot.com
http://webcomicauthority.wikidot.com
http://wfh.wikidot.com
http://whanethewhip.wikidot.com
http://whatever.wikidot.com
http://wherearethejoneses.wikidot.com
http://wikidot.com
http://wikiedresearch.wikidot.com
http://wikiethica.wikidot.com
http://wikim5s.wikidot.com
http://wikinorm.wikidot.com
http://wikiofscience.wikidot.com
http://wikirhye.wikidot.com
http://wikirmaphil.wikidot.com
http://wikistoriaenciclopedia.wikidot.com
http://wikitipsgr.wikidot.com
http://windycity.wikidot.com
http://wiwimush.wikidot.com
http://world.wikidot.com
http://wow-arrakis.wikidot.com
http://wpts.wikidot.com
http://wqa.wikidot.com
http://writ-111-office-hour-sign-up.wikidot.com
http://writingoneeleven.wikidot.com
http://wrtg1150.wikidot.com
http://wtg.wikidot.com
http://www-old.wikidot.com
http://wychwood.wikidot.com
http://xanadu.wikidot.com
http://y31.wikidot.com
http://ye-olde-music-industrapedia.wikidot.com
http://yo801106.wikidot.com
http://yyp.wikidot.com
http://zeroshell.wikidot.com
http://zmk.wikidot.com
http://zodiac-ffrpg.wikidot.com
http://zodiac-monster-manual.wikidot.com
http://zombiecafe.wikidot.com
http://zorya.wikidot.com

@ -0,0 +1,214 @@
arte
cine
lengua
literatura
matematicas
ingles
frances
aleman
ruso
idiomas
geografia
historia
secundaria
bachillerato
examen
examenes
profesor
educacion
profesores
historias
extremadura
andalucia
iberia
oceano
cultura
periodico
television
radio
italiano
polaco
chino
japones
coreano
musica
mozart
beethoven
asimov
newton
kilogramo
teoria
fisica
deporte
cancion
futbol
astronomia
telescopio
cuaderno
libro
texto
pizarra
descartes
galileo
fosiles
paisaje
fosil
paisajes
mar
oceano
espacio
meteorologia
nubes
religion
bandera
lengua
politica
biologia
quimica
medicina
tecnologia
diagrama
mapa
mapas
dibujos
pronunciacion
arquitectura
compositor
pintor
pintura
escultura
museo
biblioteca
museos
bibliotecas
enciclopedia
diccionario
filosofia
filosofos
feminismo
sociologia
leyes
coche
barco
avion
transporte
teatro
europa
america
africa
asia
oceania
australia
atlantico
mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,86 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
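# Discover wikispaces.com wikis by running randomized DuckDuckGo searches seeded with
# the words in words.txt, and keep the deduplicated results in wikispaces-duckduckgo.txt.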
import random
import re
import sys
import time
import urllib.parse
import urllib.request
def main():
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
words = []
with open('words.txt', 'r') as f:
words = f.read().strip().splitlines()
random.shuffle(words)
print('Loaded %d words from file' % (len(words)))
#words = words + ['%d' % (i) for i in range(1900, 1980, 10)]
wikis = []
with open('wikispaces-duckduckgo.txt', 'r') as f:
wikis = f.read().strip().splitlines()
wikis.sort()
print('Loaded %d wikis from file' % (len(wikis)))
for i in range(1, 100):
random.shuffle(words)
for word in words:
print('Word', word)
word_ = re.sub(' ', '+', word)
url = ''
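# Pick one of several query shapes at random so repeated searches surface different result pages.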
r = random.randint(0, 10)
if r == 0:
url = 'https://duckduckgo.com/html/?q=%s%%20site:wikispaces.com' % (word_)
elif r == 1:
url = 'https://duckduckgo.com/html/?q=%s%%20wikispaces.com' % (word_)
elif r == 2:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
elif r == 3:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (random.randint(100, 3000), word_)
else:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
sys.exit()
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikispaces-duckduckgo.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https://www\.', 'https://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

File diff suppressed because it is too large Load Diff

@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import random
import re
import time
import urllib2
@ -88,6 +89,8 @@ def getWikis(user):
return {}
def main():
sleep = 0.1
rand = 10
users = loadUsers()
wikis = loadWikis()
@ -112,11 +115,16 @@ def main():
c += 1
print 'Found %s new users' % (c)
if c > 0:
saveUsers(users)
users = loadUsers()
saveWikis(wikis)
time.sleep(1)
if random.randint(0,rand) == 0:
saveUsers(users)
users = loadUsers()
if random.randint(0,rand) == 0:
saveWikis(wikis)
time.sleep(sleep)
saveWikis(wikis)
wikis = loadWikis()
saveUsers(users)
users = loadUsers()
# find more wikis
print 'Scanning users for more wikis'
@ -133,10 +141,15 @@ def main():
c += 1
print 'Found %s new wikis' % (c)
if c > 0:
saveWikis(wikis)
wikis = loadWikis()
saveUsers(users)
time.sleep(1)
if random.randint(0,rand) == 0:
saveWikis(wikis)
wikis = loadWikis()
if random.randint(0,rand) == 0:
saveUsers(users)
time.sleep(sleep)
saveWikis(wikis)
wikis = loadWikis()
saveUsers(users)
users = loadUsers()
print '\nSummary:'

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,214 @@
arte
cine
lengua
literatura
matematicas
ingles
frances
aleman
ruso
idiomas
geografia
historia
secundaria
bachillerato
examen
examenes
profesor
educacion
profesores
historias
extremadura
andalucia
iberia
oceano
cultura
periodico
television
radio
italiano
polaco
chino
japones
coreano
musica
mozart
beethoven
asimov
newton
kilogramo
teoria
fisica
deporte
cancion
futbol
astronomia
telescopio
cuaderno
libro
texto
pizarra
descartes
galileo
fosiles
paisaje
fosil
paisajes
mar
oceano
espacio
meteorologia
nubes
religion
bandera
lengua
politica
biologia
quimica
medicina
tecnologia
diagrama
mapa
mapas
dibujos
pronunciacion
arquitectura
compositor
pintor
pintura
escultura
museo
biblioteca
museos
bibliotecas
enciclopedia
diccionario
filosofia
filosofos
feminismo
sociologia
leyes
coche
barco
avion
transporte
teatro
europa
america
africa
asia
oceania
australia
atlantico
mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria

@ -62,14 +62,14 @@ class TestDumpgenerator(unittest.TestCase):
tests = [
# Alone wikis
#['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],
['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
#['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],
# Editthis wikifarm
# It has a page view limit
# Gamepedia wikifarm
['http://dawngate.gamepedia.com/index.php', 'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'],
#['http://dawngate.gamepedia.com/index.php', 'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'],
# Neoseeker wikifarm
#['http://digimon.neoseeker.com/w/index.php', 'http://digimon.neoseeker.com/w/api.php', u'Ogremon card.png'],
@ -78,13 +78,13 @@ class TestDumpgenerator(unittest.TestCase):
#['http://mc.orain.org/w/index.php', 'http://mc.orain.org/w/api.php', u'Mojang logo.svg'],
# Referata wikifarm
['http://wikipapers.referata.com/w/index.php', 'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'],
#['http://wikipapers.referata.com/w/index.php', 'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'],
# ShoutWiki wikifarm
['http://commandos.shoutwiki.com/w/index.php', 'http://commandos.shoutwiki.com/w/api.php', u'Night of the Wolves loading.png'],
#['http://commandos.shoutwiki.com/w/index.php', 'http://commandos.shoutwiki.com/w/api.php', u'Night of the Wolves loading.png'],
# Wiki-site wikifarm
['http://minlingo.wiki-site.com/index.php', 'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'],
#['http://minlingo.wiki-site.com/index.php', 'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'],
# Wikkii wikifarm
# It seems offline
@ -146,8 +146,8 @@ class TestDumpgenerator(unittest.TestCase):
print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
tests = [
# Alone wikis
['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'],
['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],
['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'April Fools\' Day'],
#['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],
# Test old allpages API behaviour
#['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'],
@ -261,7 +261,11 @@ class TestDumpgenerator(unittest.TestCase):
]
for wiki, engine in tests:
print 'Testing', wiki
guess_engine = getWikiEngine(wiki)
try:
guess_engine = getWikiEngine(wiki)
except ConnectionError:
print "%s failed to load, skipping..." % (wiki)
continue
print 'Got: %s, expected: %s' % (guess_engine, engine)
self.assertEqual(guess_engine, engine)
@ -269,14 +273,14 @@ class TestDumpgenerator(unittest.TestCase):
print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73
tests = [
# Alone wikis
['http://archiveteam.org', 'http://archiveteam.org/api.php', 'http://archiveteam.org/index.php'],
['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],
['https://www.archiveteam.org', 'https://www.archiveteam.org/api.php', 'https://www.archiveteam.org/index.php'],
#['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],
# Editthis wikifarm
# It has a page view limit
# Gamepedia wikifarm
['http://dawngate.gamepedia.com', 'http://dawngate.gamepedia.com/api.php', 'http://dawngate.gamepedia.com/index.php'],
#['http://dawngate.gamepedia.com', 'http://dawngate.gamepedia.com/api.php', 'http://dawngate.gamepedia.com/index.php'],
# Neoseeker wikifarm
#['http://digimon.neoseeker.com', 'http://digimon.neoseeker.com/w/api.php', 'http://digimon.neoseeker.com/w/index.php'],
@ -288,7 +292,7 @@ class TestDumpgenerator(unittest.TestCase):
# ['http://wikipapers.referata.com', 'http://wikipapers.referata.com/w/api.php', 'http://wikipapers.referata.com/w/index.php'],
# ShoutWiki wikifarm
['http://commandos.shoutwiki.com', 'http://commandos.shoutwiki.com/w/api.php', 'http://commandos.shoutwiki.com/w/index.php'],
#['http://commandos.shoutwiki.com', 'http://commandos.shoutwiki.com/w/api.php', 'http://commandos.shoutwiki.com/w/index.php'],
# Wiki-site wikifarm
#['http://minlingo.wiki-site.com', 'http://minlingo.wiki-site.com/api.php', 'http://minlingo.wiki-site.com/index.php'],

@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import getopt
import argparse
import os
import re
import subprocess
@ -30,89 +31,41 @@ from internetarchive import get_item
import dumpgenerator
# Configuration goes here
# You need a file named keys.txt with access and secret keys, in two different lines
accesskey = open('keys.txt', 'r').readlines()[0].strip()
secretkey = open('keys.txt', 'r').readlines()[1].strip()
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = 'opensource'
# Nothing to change below
convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
listfile = sys.argv[1]
uploadeddumps = []
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
def getParameters(params=[]):
if not params:
params = sys.argv[2:]
config = {
'prune-directories': False,
'prune-wikidump': False,
'collection': collection,
'update': False,
}
#console params
try:
opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin", "update"])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
for o, a in opts:
if o in ("-h","--help"):
usage()
sys.exit()
elif o in ("--prune-directories"):
config['prune-directories'] = True
elif o in ("--prune-wikidump"):
config['prune-wikidump'] = True
elif o in ("--admin"):
config['collection'] = "wikiteam"
elif o in ("--update"):
config['update'] = True
return config
def usage():
""" """
print """uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help."""
def log(wiki, dump, msg):
f = open('uploader-%s.log' % (listfile), 'a')
def log(wiki, dump, msg, config={}):
f = open('uploader-%s.log' % (config.listfile), 'a')
f.write('\n%s;%s;%s' % (wiki, dump, msg))
f.close()
def upload(wikis, config={}):
def upload(wikis, config={}, uploadeddumps=[]):
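# Upload each wiki's -wikidump.7z / -history.xml.7z files from the dump directory to its
# archive.org item, skipping (and optionally pruning) dumps already logged as uploaded.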
headers = {'User-Agent': dumpgenerator.getUserAgent()}
dumpdir = config.wikidump_dir
filelist = os.listdir(dumpdir)
for wiki in wikis:
print "#"*73
print "# Uploading", wiki
print "#"*73
wiki = wiki.lower()
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
configtemp = config
try:
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
except KeyError:
print "ERROR: could not produce the prefix for %s" % wiki
config = configtemp
wikiname = prefix.split('-')[0]
dumps = []
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for f in filenames:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
dumps.append(f)
for f in filelist:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
print "%s found" % f
dumps.append(f)
break
c = 0
@ -120,30 +73,33 @@ def upload(wikis, config={}):
wikidate = dump.split('-')[1]
item = get_item('wiki-' + wikiname)
if dump in uploadeddumps:
if config['prune-directories']:
if config.prune_directories:
rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
# With -f the deletion might have happened before and we won't know
if not os.system(rmline):
print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
if config.prune_wikidump and dump.endswith('wikidump.7z'):
# Simplistic quick&dirty check for the presence of this file in the item
stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
print "Checking content in previously uploaded files"
stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
dumphash = re.sub(' +.+\n?', '', stdout)
if dumphash in map(lambda x: x['md5'], item.files):
log(wiki, dump, 'verified')
rmline='rm -rf %s' % dump
log(wiki, dump, 'verified', config)
rmline='rm -rf %s' % dumpdir + '/' + dump
if not os.system(rmline):
print 'DELETED ' + dump
print 'DELETED ' + dumpdir + '/' + dump
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print 'ERROR: The online item misses ' + dump
log(wiki, dump, 'missing')
log(wiki, dump, 'missing', config)
# We'll exit this if and go upload the dump
else:
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print '%s was not uploaded before' % dump
time.sleep(0.1)
wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
@ -155,7 +111,7 @@ def upload(wikis, config={}):
# Logo path
logourl = ''
if ismissingitem or config['update']:
if ismissingitem or config.update:
#get metadata from api.php
#first sitename and base url
params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
@ -163,7 +119,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -198,7 +154,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -214,7 +170,7 @@ def upload(wikis, config={}):
raw = ''
try:
f = urllib.urlopen(baseurl)
f = urllib.urlopen(baseurl, timeout=10)
raw = f.read()
f.close()
except:
@ -238,7 +194,6 @@ def upload(wikis, config={}):
logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
except:
pass
print logourl
#retrieve some info from the wiki
wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
@ -264,7 +219,7 @@ def upload(wikis, config={}):
# Item metadata
md = {
'mediatype': 'web',
'collection': config['collection'],
'collection': config.collection,
'title': wikititle,
'description': wikidesc,
'language': lang,
@ -277,25 +232,54 @@ def upload(wikis, config={}):
#Upload files and update metadata
try:
item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
item.modify_metadata(md) # update
print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
uploadeddumps.append(dump)
log(wiki, dump, 'ok', config)
if logourl:
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl), timeout=10).read())
logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
uploadeddumps.append(dump)
log(wiki, dump, 'ok')
except:
print wiki, dump, 'error when uploading?'
except Exception as e:
print wiki, dump, 'Error when uploading?'
print e.message
c += 1
def main(params=[]):
config = getParameters(params=params)
parser = argparse.ArgumentParser("""uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help.""")
parser.add_argument('-pd', '--prune_directories', action='store_true')
parser.add_argument('-pw', '--prune_wikidump', action='store_true')
parser.add_argument('-a', '--admin', action='store_true')
parser.add_argument('-c', '--collection', default='opensource')
parser.add_argument('-wd', '--wikidump_dir', default='.')
parser.add_argument('-u', '--update', action='store_true')
parser.add_argument('listfile')
config = parser.parse_args()
if config.admin:
config.collection = 'wikiteam'
uploadeddumps = []
listfile = config.listfile
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
wikis = open(listfile, 'r').read().strip().splitlines()
upload(wikis, config)
upload(wikis, config, uploadeddumps)
if __name__ == "__main__":
main()

@ -24,7 +24,7 @@ def main():
site = pywikibot.Site('wikiapiary', 'wikiapiary')
catname = 'Category:Website'
cat = pywikibot.Category(site, catname)
gen = pagegenerators.CategorizedPageGenerator(cat, start='Spyropedia')
gen = pagegenerators.CategorizedPageGenerator(cat, start='!')
pre = pagegenerators.PreloadingGenerator(gen)
for page in pre:
@ -52,7 +52,8 @@ def main():
print('No API found in WikiApiary, skipping')
continue
urliasearch = 'https://archive.org/search.php?query=originalurl:"%s"' % (apiurl)
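# Search archive.org for both the api.php and index.php forms of the wiki URL.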
indexurl = 'index.php'.join(apiurl.rsplit('api.php', 1))
urliasearch = 'https://archive.org/search.php?query=originalurl:"%s" OR originalurl:"%s"' % (apiurl, indexurl)
f = urllib.request.urlopen(urliasearch)
raw = f.read().decode('utf-8')
if re.search(r'(?i)Your search did not match any items', raw):

@ -0,0 +1,458 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com
import csv
import datetime
import os
import random
import re
import subprocess
import sys
import time
import urllib.request
#from internetarchive import get_item
# Requirements:
# zip command (apt-get install zip)
# ia command (pip install internetarchive, and configured properly)
"""
# You need a file with access and secret keys, in two different lines
iakeysfilename = '%s/.iakeys' % (os.path.expanduser('~'))
if os.path.exists(iakeysfilename):
accesskey = open(iakeysfilename, 'r').readlines()[0].strip()
secretkey = open(iakeysfilename, 'r').readlines()[1].strip()
else:
print('Error, no %s file with S3 keys for Internet Archive account' % (iakeysfilename))
sys.exit()
"""
def saveURL(wikidomain='', url='', filename='', path='', overwrite=False, iteration=1):
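# Fetch url into <wikidomain>/<path>/<filename>; on errors retry with growing delays,
# and re-download pages whose content looks like a Wikispaces maintenance placeholder.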
filename2 = '%s/%s' % (wikidomain, filename)
if path:
filename2 = '%s/%s/%s' % (wikidomain, path, filename)
if os.path.exists(filename2):
if not overwrite:
print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
return
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
try:
urllib.request.urlretrieve(url, filename2)
except:
sleep = 10 # seconds
maxsleep = 30
while sleep <= maxsleep:
try:
print('Error while retrieving: %s' % (url))
print('Retry in %s seconds...' % (sleep))
time.sleep(sleep)
urllib.request.urlretrieve(url, filename2)
return
except:
sleep = sleep * 2
print('Download failed')
# Sometimes Wikispaces returns invalid data; re-download in those cases.
# Only for 'pages': 'files' binaries are a pain to open and check.
if (os.path.exists(filename2) and 'pages' in path) or \
(os.path.exists(filename2) and path == '' and filename2.split('.')[-1] in ['xml', 'html', 'csv']):
sleep2 = 60 * iteration
raw = ''
try:
with open(filename2, 'r', encoding='utf-8') as f:
raw = f.read()
except:
with open(filename2, 'r', encoding='latin-1') as f:
raw = f.read()
if re.findall(r'(?im)<title>TES and THE Status</title>', raw):
print('Warning: invalid content. Waiting %d seconds and re-downloading' % (sleep2))
time.sleep(sleep2)
saveURL(wikidomain=wikidomain, url=url, filename=filename, path=path, overwrite=overwrite, iteration=iteration+1)
def undoHTMLEntities(text=''):
""" Undo some HTML codes """
# i guess only < > & " ' need conversion
# http://www.w3schools.com/html/html_entities.asp
text = re.sub('&lt;', '<', text)
text = re.sub('&gt;', '>', text)
text = re.sub('&amp;', '&', text)
text = re.sub('&quot;', '"', text)
text = re.sub('&#039;', '\'', text)
return text
def convertHTML2Wikitext(wikidomain='', filename='', path=''):
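# Strip the HTML wrapper returned by /page/code/ and keep only the decoded wikitext.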
wikitext = ''
wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
if not os.path.exists(wikitextfile):
print('Error retrieving wikitext; the page is probably a redirect')
return
with open(wikitextfile, 'r') as f:
wikitext = f.read()
with open(wikitextfile, 'w') as f:
m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
if m:
try:
wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
wikitext = undoHTMLEntities(text=wikitext)
except:
pass
f.write(wikitext)
def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
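# Save a page's current revision as HTML and wikitext, plus a CSV with its edit history.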
pagenameplus = re.sub(' ', '+', pagename)
pagename_ = urllib.parse.quote(pagename)
#page current revision (html & wikitext)
pageurl = '%s/%s' % (wikiurl, pagename_)
filename = '%s.html' % (pagenameplus)
print('Downloading page: %s' % (filename))
saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
filename2 = '%s.wikitext' % (pagenameplus)
print('Downloading page: %s' % (filename2))
saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')
#csv with page history
csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
csvfilename = '%s.history.csv' % (pagenameplus)
print('Downloading page: %s' % (csvfilename))
saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)
def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
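# Save the full-resolution file plus a CSV with its history.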
filenameplus = re.sub(' ', '+', filename)
filename_ = urllib.parse.quote(filename)
#file full resolution
fileurl = '%s/file/view/%s' % (wikiurl, filename_)
filename = filenameplus
print('Downloading file: %s' % (filename))
saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)
#csv with file history
csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
csvfilename = '%s.history.csv' % (filenameplus)
print('Downloading file: %s' % (csvfilename))
saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)
def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
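# Fetch the wiki's content listing as CSV and download every page and file it lists.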
print('Downloading Pages and Files from %s' % (wikiurl))
#csv all pages and files
csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
#download every page and file
totallines = 0
with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
totallines = len(f.read().splitlines()) - 1
with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
filesc = 0
pagesc = 0
print('This wiki has %d pages and files' % (totallines))
rows = csv.reader(csvfile, delimiter=',', quotechar='"')
for row in rows:
if row[0] == 'file':
filesc += 1
filename = row[1]
downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
elif row[0] == 'page':
pagesc += 1
pagename = row[1]
downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
if (filesc + pagesc) % 10 == 0:
print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
print('Downloaded %d pages' % (pagesc))
print('Downloaded %d files' % (filesc))
def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
print('Downloading sitemap.xml')
saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)
def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
print('Downloading index.html')
saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)
def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
index = '%s/index.html' % (wikidomain)
if os.path.exists(index):
raw = ''
try:
with open(index, 'r', encoding='utf-8') as f:
raw = f.read()
except:
with open(index, 'r', encoding='latin-1') as f:
raw = f.read()
m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', raw)
if m:
logourl = m[0]
logofilename = logourl.split('/')[-1]
print('Downloading logo')
saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
return logofilename
return ''
def printhelp():
helptext = """This script downloads (and uploads) WikiSpaces wikis.
Parameters available:
--upload: upload compressed file with downloaded wiki
--admin: add item to WikiTeam collection (if you are an admin in that collection)
--overwrite: download again even if files exist locally
--overwrite-ia: upload again to Internet Archive even if item exists there
--help: prints this help text
Examples:
python3 wikispaces.py https://mywiki.wikispaces.com
It downloads that wiki
python3 wikispaces.py wikis.txt
It downloads a list of wikis (file format is a URL per line)
python3 wikispaces.py https://mywiki.wikispaces.com --upload
It downloads that wiki, compresses it and uploads it to the Internet Archive
"""
print(helptext)
sys.exit()
def duckduckgo():
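# Yield wikispaces.com wiki URLs discovered through randomized DuckDuckGo searches.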
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)
wikis = []
ignorewikis = [
'https://wikispaces.com',
'https://www.wikispaces.com',
'https://wikispaces.net',
'https://www.wikispaces.net',
]
for i in range(1, 100000):
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikispaces.com' % (random.randint(100, 5000), random.randint(1000, 9999))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
time.sleep(30)
continue
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
wiki = re.sub(r'https://www\.', 'https://', wiki)
if not wiki in wikis and not wiki in ignorewikis:
wikis.append(wiki)
yield wiki
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
def main():
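# Parse options, build the list of wikis to archive, download each wiki (sitemap, index,
# pages, files, logo) and optionally zip the dump and upload it with the ia command-line tool.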
upload = False
isadmin = False
overwrite = False
overwriteia = False
if len(sys.argv) < 2:
printhelp()
param = sys.argv[1]
if not param:
printhelp()
if len(sys.argv) > 2:
if '--upload' in sys.argv:
upload = True
if '--admin' in sys.argv:
isadmin = True
if '--overwrite' in sys.argv:
overwrite = True
if '--overwrite-ia' in sys.argv:
overwriteia = True
if '--help' in sys.argv:
printhelp()
wikilist = []
if '://' in param:
wikilist.append(param.rstrip('/'))
elif param.lower() == 'duckduckgo':
wikilist = duckduckgo()
#for wiki in wikilist:
# print(wiki)
else:
with open(param, 'r') as f:
wikilist = f.read().strip().splitlines()
wikilist2 = []
for wiki in wikilist:
wikilist2.append(wiki.rstrip('/'))
wikilist = wikilist2
for wikiurl in wikilist:
wikidomain = wikiurl.split('://')[1].split('/')[0]
print('\n')
print('#'*40,'\n Downloading:', wikiurl)
print('#'*40,'\n')
if upload and not overwriteia:
itemid = 'wiki-%s' % (wikidomain)
try:
iahtml = ''
try:
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
except:
time.sleep(10)
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
if iahtml and not re.findall(r'(?im)Item cannot be found', iahtml):
if not overwriteia:
print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
print('You can find it in https://archive.org/details/%s' % (itemid))
time.sleep(1)
continue
except:
pass
dirfiles = '%s/files' % (wikidomain)
if not os.path.exists(dirfiles):
print('Creating directory %s' % (dirfiles))
os.makedirs(dirfiles)
dirpages = '%s/pages' % (wikidomain)
if not os.path.exists(dirpages):
print('Creating directory %s' % (dirpages))
os.makedirs(dirpages)
sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
if not os.path.exists('%s/sitemap.xml' % (wikidomain)):
print('Error, wiki was probably deleted. Skipping wiki...')
continue
else:
sitemapraw = ''
try:
with open('%s/sitemap.xml' % (wikidomain), encoding='utf-8') as g:
sitemapraw = g.read()
except:
with open('%s/sitemap.xml' % (wikidomain), encoding='latin-1') as g:
sitemapraw = g.read()
if re.search(r'(?im)<h1>This wiki has been deactivated</h1>', sitemapraw):
print('Error, wiki was deactivated. Skipping wiki...')
continue
downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
if not os.path.exists('%s/index.html' % (wikidomain)):
print('Error, wiki was probably deleted or expired. Skipping wiki...')
continue
else:
indexraw = ''
try:
with open('%s/index.html' % (wikidomain), encoding='utf-8') as g:
indexraw = g.read()
except:
with open('%s/index.html' % (wikidomain), encoding='latin-1') as g:
indexraw = g.read()
if re.search(r'(?im)<h1>Subscription Expired</h1>', indexraw):
print('Error, wiki subscription expired. Skipping wiki...')
continue
downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
if upload:
itemid = 'wiki-%s' % (wikidomain)
print('\nCompressing dump...')
wikidir = wikidomain
os.chdir(wikidir)
print('Changed directory to', os.getcwd())
wikizip = '%s.zip' % (wikidomain)
subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
os.chdir('..')
print('Changed directory to', os.getcwd())
print('\nUploading to Internet Archive...')
indexfilename = '%s/index.html' % (wikidir)
if not os.path.exists(indexfilename):
print('\nError: dump incomplete, skipping upload\n')
continue
indexhtml = ''
try:
with open(indexfilename, 'r', encoding='utf-8') as f:
indexhtml = f.read()
except:
with open(indexfilename, 'r', encoding='latin-1') as f:
indexhtml = f.read()
wikititle = ''
try:
wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
except:
wikititle = wikidomain
if not wikititle:
wikititle = wikidomain
wikititle = wikititle.replace("\\'", " ")
wikititle = wikititle.replace('\\"', " ")
itemtitle = 'Wiki - %s' % wikititle
itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
itemoriginalurl = wikiurl
itemlicenseurl = ''
m = ''
try:
m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
except:
m = ''
if m:
itemlicenseurl = m[0]
if not itemlicenseurl:
itemtags.append('unknowncopyright')
itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
itemcollection = isadmin and 'wikiteam' or 'opensource'
itemlang = 'Unknown'
itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''
callplain = "ia upload %s %s %s --metadata='mediatype:web' --metadata='collection:%s' --metadata='title:%s' --metadata='description:%s' --metadata='language:%s' --metadata='last-updated-date:%s' --metadata='originalurl:%s' %s %s" % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemoriginalurl, itemlicenseurl and "--metadata='licenseurl:%s'" % (itemlicenseurl) or '', itemtags_)
print(callplain)
subprocess.call(callplain, shell=True)
"""
md = {
'mediatype': 'web',
'collection': itemcollection,
'title': itemtitle,
'description': itemdesc,
'language': itemlang,
'last-updated-date': itemdate,
'subject': '; '.join(itemtags),
'licenseurl': itemlicenseurl,
'originalurl': itemoriginalurl,
}
item = get_item(itemid)
item.upload(wikizip, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
item.modify_metadata(md)
if itemlogo:
item.upload(itemlogo, access_key=accesskey, secret_key=secretkey, verbose=True)
"""
print('You can find it in https://archive.org/details/%s' % (itemid))
os.remove(wikizip)
if __name__ == "__main__":
main()

@ -228,7 +228,11 @@ def mwGetImageNamesAPI(config={}):
url = mwCurateImageURL(config=config, url=url)
# encoding to ascii is needed to work around this horrible bug:
# http://bugs.python.org/issue8136
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
if 'mwapi' in config and '.wikia.com' in config['mwapi']:
#to avoid latest?cb=20120816112532 in filenames
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')
else:
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
uploader = re.sub('_', ' ', image['user'])
imagenames.append([filename, url, uploader])
else:
