r_images4 = r'(?im)]+ title="[^:>]+:(?P
|meta name="generator" content="MediaWiki)',
raw):
return True
return False
def removeIP(raw=''):
    """Mask IP addresses that appear in a piece of text.

    Every dotted-quad IPv4 address is replaced by ``0.0.0.0`` and every
    sequence of eight colon-separated groups (each zero to four hex digits)
    is replaced by ``0:0:0:0:0:0:0:0``.  The masked text is returned.
    """
    ipv4_pattern = r'\d+\.\d+\.\d+\.\d+'
    # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
    # Abbreviated IPv6 forms that rely on "::" are not covered by this pattern.
    ipv6_pattern = r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}'

    masked = re.sub(ipv4_pattern, '0.0.0.0', raw)
    masked = re.sub(ipv6_pattern, '0:0:0:0:0:0:0:0', masked)
    return masked
def getJSON(request):
    """Return the parsed JSON body of a response, tolerating a leading BOM.

    When the response text begins with U+FEFF, the response encoding is
    switched to 'utf-8-sig' before ``request.json()`` is called.
    """
    has_bom = request.text.startswith(u'\ufeff')
    if has_bom:
        request.encoding = 'utf-8-sig'
    return request.json()
def fixBOM(request):
    """Return the response text, switching to 'utf-8-sig' if it starts with a BOM.

    When the response text begins with U+FEFF, the response encoding is set
    to 'utf-8-sig'; ``request.text`` is then read again and returned.
    """
    has_bom = request.text.startswith(u'\ufeff')
    if has_bom:
        request.encoding = 'utf-8-sig'
    return request.text
def checkXMLIntegrity(config={}, titles=[], session=None):
    """ Check XML dump integrity, to detect broken XML chunks

    NOTE: the check is currently disabled -- the bare ``return`` right below
    exits before any of the verification code runs, so this function always
    returns None without reading the dump.

    The (unreachable) code below reads
    '<path>/<prefix>-<date>-<current|history>.xml', counts marker occurrences
    line by line, and when the counts disagree asks the user whether to
    regenerate the dump through generateXMLDump().
    """
    # Early exit: everything after this line is dead code.
    return
    print 'Verifying dump...'
    # One counter per kind of marker looked for in the dump.
    checktitles = 0
    checkpageopen = 0
    checkpageclose = 0
    checkrevisionopen = 0
    checkrevisionclose = 0
    # The whole file is read into memory and split into lines.
    # 'current' is used in the file name when config['curonly'] is truthy,
    # 'history' otherwise.
    for line in file(
            '%s/%s-%s-%s.xml' %
            (config['path'],
             domain2prefix(
                 config=config,
                 session=session),
             config['date'],
             config['curonly'] and 'current' or 'history'),
            'r').read().splitlines():
        # NOTE(review): the marker literals tested here are empty strings or
        # a single space; presumably they were meant to be XML tag names
        # (revision/page/title open and close tags) -- confirm against the
        # upstream source before re-enabling this check.
        if "" in line:
            checkrevisionopen += 1
        elif " " in line:
            checkrevisionclose += 1
        elif "" in line:
            checkpageopen += 1
        elif " " in line:
            checkpageclose += 1
        elif "" in line:
            checktitles += 1
        else:
            continue
    # The dump is considered consistent when titles, page-open and page-close
    # counts all match and revision-open matches revision-close.
    if (checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose):
        pass
    else:
        print 'XML dump seems to be corrupted.'
        reply = ''
        # Keep prompting until one of the accepted answers is typed.
        while reply.lower() not in ['yes', 'y', 'no', 'n']:
            reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
        if reply.lower() in ['yes', 'y']:
            generateXMLDump(config=config, titles=titles, session=session)
        elif reply.lower() in ['no', 'n']:
            print 'Not generating a new dump.'
def createNewDump(config={}, other={}):
    """Generate a dump from scratch.

    Depending on the truthiness of config['xml'], config['images'] and
    config['logs'], this runs (in that order) the page-title + XML dump
    steps, the image list + image dump steps, and the log saving step.
    The HTTP session is taken from other['session'].
    """
    print('Trying generating a new dump into a new directory...')

    if config['xml']:
        session = other['session']
        getPageTitles(config=config, session=session)
        titles = readTitles(config)
        generateXMLDump(config=config, titles=titles, session=session)
        checkXMLIntegrity(config=config, titles=titles, session=session)

    if config['images']:
        session = other['session']
        # Copy the returned names into a fresh list owned by this function.
        images = list(getImageNames(config=config, session=session))
        saveImageNames(config=config, images=images, session=session)
        generateImageDump(
            config=config, other=other, images=images, session=session)

    if config['logs']:
        saveLogs(config=config, session=other['session'])
def _lastTitleInList(config, session):
    """Return the last line of the saved titles list, or '' if it cannot be read.

    A single trailing empty line is skipped.  Any error while reading (for
    example the file not existing) yields '' so the caller treats the list
    as incomplete instead of failing on an unbound name.
    """
    try:
        lasttitles = reverse_readline('%s/%s-%s-titles.txt' % (
            config['path'],
            domain2prefix(config=config, session=session),
            config['date']))
        lasttitle = next(lasttitles)
        if lasttitle == '':
            lasttitle = next(lasttitles)
    except Exception:
        return ''  # probably file does not exist
    return lasttitle


def _resumeXMLDump(config, other):
    """Refresh the title list if needed, then resume or regenerate the XML dump."""
    # NOTE(review): this value is reassigned before every use below; the call
    # is kept in case readTitles has side effects -- confirm and drop if not.
    titles = readTitles(config)
    session = other['session']

    if _lastTitleInList(config, session) == '--END--':
        # titles list is complete
        print('Title list was completed in the previous session')
    else:
        print('Title list is incomplete. Reloading...')
        # do not resume, reload, to avoid inconsistences, deleted pages or
        # so
        getPageTitles(config=config, session=session)

    # checking xml dump
    xmliscomplete = False
    lastxmltitle = None
    try:
        xmllines = reverse_readline('%s/%s-%s-%s.xml' % (
            config['path'],
            domain2prefix(config=config, session=session),
            config['date'],
            'current' if config['curonly'] else 'history'))
        for xmlline in xmllines:
            # NOTE(review): the end-of-dump marker compared here is an empty
            # string and the title pattern has no tag delimiters; presumably
            # the closing root tag and the title element were intended --
            # confirm against the upstream source.
            if xmlline == '':
                # xml dump is complete
                xmliscomplete = True
                break
            xmltitle = re.search(r'([^<]+) ', xmlline)
            if xmltitle:
                lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
                break
    except Exception:
        pass  # probably file does not exist

    if xmliscomplete:
        print('XML dump was completed in the previous session')
    elif lastxmltitle:
        # resuming...
        print('Resuming XML dump from "%s"' % (lastxmltitle))
        titles = readTitles(config, start=lastxmltitle)
        generateXMLDump(
            config=config,
            titles=titles,
            start=lastxmltitle,
            session=session)
    else:
        # corrupt? only has XML header?
        print('XML is corrupt? Regenerating...')
        titles = readTitles(config)
        generateXMLDump(config=config, titles=titles, session=session)


def _resumeImageDump(config, other):
    """Reload the image list if needed, then resume the image dump where it stopped."""
    images = []
    lastimage = ''
    try:
        # 'with' guarantees the file is closed even if decoding fails.
        with open('%s/%s-%s-images.txt' % (
                config['path'],
                domain2prefix(config=config),
                config['date']), 'r') as f:
            raw = unicode(f.read(), 'utf-8').strip()
        lines = raw.split('\n')
        for line in lines:
            # only tab-separated lines describe an image
            if '\t' in line:
                images.append(line.split('\t'))
        lastimage = lines[-1]
    except Exception:
        pass  # probably file does not exist

    if lastimage == u'--END--':
        print('Image list was completed in the previous session')
    else:
        print('Image list is incomplete. Reloading...')
        # do not resume, reload, to avoid inconsistences, deleted images or
        # so
        images = getImageNames(config=config, session=other['session'])
        saveImageNames(config=config, images=images)

    # checking images directory
    try:
        existing = set(
            n.decode('utf-8')
            for n in os.listdir('%s/images' % (config['path'])))
    except Exception:
        existing = set()  # probably directory does not exist

    complete = True
    lastfilename = ''
    previousfilename = ''
    found = 0
    for filename, _url, _uploader in images:
        previousfilename = lastfilename
        # return always the complete filename, not the truncated
        lastfilename = filename
        ondisk = filename
        if len(ondisk) > other['filenamelimit']:
            ondisk = truncateFilename(other=other, filename=ondisk)
        if ondisk not in existing:
            complete = False
            break
        found += 1
    print('%d images were found in the directory from a previous session' % (found))

    if complete:
        # image dump is complete
        print('Image dump was completed in the previous session')
    else:
        # we resume from previous image, which may be corrupted (or missing
        # .desc) by the previous session ctrl-c or abort
        generateImageDump(
            config=config,
            other=other,
            images=images,
            start=previousfilename,
            session=other['session'])


def resumePreviousDump(config={}, other={}):
    """Continue a dump that a previous session left unfinished.

    config keys read here or in the helpers: 'xml', 'images', 'logs',
    'path', 'date', 'curonly'.  other keys read: 'session' and
    'filenamelimit'.

    For XML: the title list is reloaded unless its last line is '--END--',
    then the XML dump is left alone (complete), resumed from the last title
    found in it, or regenerated.  For images: the image list is reloaded
    unless its last line is '--END--', then downloading resumes from the
    image preceding the first one missing on disk.  Resuming logs is not
    implemented.

    A missing or unreadable titles file is treated as an incomplete list
    (previously this raised NameError).
    """
    print('Resuming previous dump process...')
    if config['xml']:
        _resumeXMLDump(config=config, other=other)
    if config['images']:
        _resumeImageDump(config=config, other=other)
    if config['logs']:
        # fix
        pass
def saveSpecialVersion(config={}, session=None):
    """Download Special:Version into '<path>/Special:Version.html'.

    Nothing is downloaded when that file already exists.  Otherwise the page
    is requested with a POST to config['index'], IP addresses in the
    response text are masked with removeIP(), and the result is written
    UTF-8 encoded.
    """
    target = '%s/Special:Version.html' % (config['path'])
    if os.path.exists(target):
        print('Special:Version.html exists, do not overwrite')
        return

    print('Downloading Special:Version with extensions and other related info')
    response = session.post(
        url=config['index'], data={'title': 'Special:Version'})
    html = response.text
    delay(config=config, session=session)
    html = removeIP(raw=html)
    with open(target, 'w') as outfile:
        outfile.write(html.encode('utf-8'))
def saveIndexPHP(config={}, session=None):
    """Download index.php into '<path>/index.html'.

    Nothing is downloaded when that file already exists.  Otherwise
    config['index'] is requested with an empty POST, IP addresses in the
    response text are masked with removeIP(), and the result is written
    UTF-8 encoded.  The page is kept to preserve the license details shown
    at the bottom of it.
    """
    target = '%s/index.html' % (config['path'])
    if os.path.exists(target):
        print('index.html exists, do not overwrite')
        return

    print('Downloading index.php (Main Page) as index.html')
    response = session.post(url=config['index'], data={})
    html = response.text
    delay(config=config, session=session)
    html = removeIP(raw=html)
    with open(target, 'w') as outfile:
        outfile.write(html.encode('utf-8'))
def saveSiteInfo(config={}, session=None):
    """Save the API siteinfo query result as '<path>/siteinfo.json'.

    Does nothing when config['api'] is falsy or the file already exists.
    Otherwise up to three siteinfo queries are POSTed to config['api'],
    from the richest property set to the smallest, stopping at the first
    response that contains a 'query' key.  The last response obtained is
    written as indented, key-sorted JSON.
    """
    if not config['api']:
        return

    target = '%s/siteinfo.json' % (config['path'])
    if os.path.exists(target):
        print('siteinfo.json exists, do not overwrite')
        return

    print('Downloading site info as siteinfo.json')
    attempts = (
        # MediaWiki 1.13+
        {
            'action': 'query',
            'meta': 'siteinfo',
            'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
            'sinumberingroup': 1,
            'format': 'json'},
        # MediaWiki 1.11-1.12
        {
            'action': 'query',
            'meta': 'siteinfo',
            'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
            'format': 'json'},
        # MediaWiki 1.8-1.10
        {
            'action': 'query',
            'meta': 'siteinfo',
            'siprop': 'general|namespaces',
            'format': 'json'},
    )
    for params in attempts:
        response = session.post(url=config['api'], data=params)
        result = getJSON(response)
        if 'query' in result:
            break

    delay(config=config, session=session)
    with open(target, 'w') as outfile:
        outfile.write(json.dumps(result, indent=4, sort_keys=True))
def avoidWikimediaProjects(config={}, other={}):
    """Warn about (and normally refuse) dumping Wikimedia projects.

    If the concatenation of config['api'] and config['index'] contains one
    of the listed Wikimedia project names followed by '.org', a notice
    pointing to the dumps website is printed.  Unless other['force'] is
    truthy, the program then exits through sys.exit().
    """
    wikimedia_domains = r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org'
    urls = config['api'] + config['index']
    if not re.search(wikimedia_domains, urls):
        return

    # notice about wikipedia dumps
    print('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!')
    print('Download the dumps from http://dumps.wikimedia.org')
    if not other['force']:
        print('Thanks!')
        sys.exit()
def getWikiEngine(url=''):
""" Returns the wiki engine of a URL, if known """
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url)
if r.status_code == 405 or r.text == '':
r = session.get(url=url)
result = r.text
wikiengine = 'Unknown'
if re.search(
ur'(?im)()', result):
wikiengine = 'MoinMoin'
elif re.search(ur'(?im)(twikiCurrentTopicLink|twikiCurrentWebHomeLink|twikiLink)', result):
wikiengine = 'TWiki'
elif re.search(ur'(?im)()', result):
wikiengine = 'PmWiki'
elif re.search(ur'(?im)(| )', result):
wikiengine = 'Wagn'
elif re.search(ur'(?im)(\s*(
)?JSPWiki|xmlns:jspwiki="http://www\.jspwiki\.org")', result):
wikiengine = 'JSPWiki'
elif re.search(ur'(?im)(Powered by:?\s*(
)?\s*|\bKwikiNavigation\b)', result):
wikiengine = 'Kwiki'
elif re.search(ur'(?im)(Powered by )', result):
wikiengine = 'Zwiki'
# WakkaWiki forks
elif re.search(ur'(?im)()', result):
wikiengine = 'WikkaWiki' # formerly WikkaWakkaWiki
elif re.search(ur'(?im)(CitiWiki)', result):
wikiengine = 'CitiWiki'
elif re.search(ur'(?im)(Powered by |wikidot-privacy-button-hovertip|javascript:WIKIDOT\.page)', result):
wikiengine = 'Wikidot'
elif re.search(ur'(?im)(IS_WETPAINT_USER|wetpaintLoad|WPC-bodyContentContainer)', result):
wikiengine = 'Wetpaint'
elif re.search(ur'(?im)(