#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2011-2016 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com
import json
import re
import sys
import urllib.parse
import wikiteam
def mwCleanHTML(raw=''):
""" Extract only the real wiki content and remove rubbish """
""" This function is ONLY used to retrieve page titles and file names when no API is available """
""" DO NOT use this function to extract page content """
    # different markers used by different MediaWiki versions and skins to
    # delimit where the content starts and ends
    if re.search('<!-- bodytext -->', raw):
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
        raw = raw.split(
            '<!-- start content -->')[1].split('<!-- end content -->')[0]
    elif re.search('<!-- Begin Content Area -->', raw):
        raw = raw.split(
            '<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
    elif re.search('<!-- content -->', raw):
        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
    elif re.search('<body class=', raw):
        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
else:
sys.stderr.write(raw[:250])
sys.stderr.write('This wiki doesn\'t use marks to split content\n')
sys.exit()
return raw
def mwCleanXML(xml=''):
""" Trim redundant info """
    # do not touch the XML encoding, leave it AS IS
    if re.search(r'</siteinfo>\n', xml):
        xml = xml.split('</siteinfo>\n')[1]
    if re.search(r'</mediawiki>', xml):
        xml = xml.split('</mediawiki>')[0]
return xml
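# Illustrative sketch (not part of the original code): mwCleanXML drops the
# leading siteinfo header and the trailing closing tag of a Special:Export
# response, keeping only the page chunk, e.g.
#   mwCleanXML('...</siteinfo>\n<page>...</page>\n</mediawiki>')
#   returns '<page>...</page>\n'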
def mwCreateNewDump(config={}):
    sys.stderr.write('Trying to generate a new dump into a new directory...\n')
if config['pages']:
pagetitles = mwGetPageTitles(config=config)
wikiteam.savePageTitles(config=config, pagetitles=pagetitles)
mwGeneratePageDump(config=config, pagetitles=pagetitles)
mwCheckXMLIntegrity(config=config, pagetitles=pagetitles)
if config['images']:
imagenames = mwGetImageNames(config=config)
mwSaveImageNames(config=config, imagenames=imagenames)
mwGenerateImageDump(config=config, imagenames=imagenames)
if config['logs']:
mwSaveLogs(config=config)
mwSaveIndexPHP(config=config)
mwSaveSpecialVersion(config=config)
mwSaveSiteInfo(config=config)
def mwCurateImageURL(config={}, url=''):
""" Returns an absolute URL for an image, adding the domain if missing """
if 'mwindex' in config and config['mwindex']:
# remove from :// (http or https) until the first / after domain
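        # e.g. a hypothetical index URL 'https://wiki.example.org/w/index.php'
        # is reduced to the bare origin 'https://wiki.example.org'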
domainalone = config['mwindex'].split(
'://')[0] + '://' + config['mwindex'].split('://')[1].split('/')[0]
elif 'mwapi' in config and config['mwapi']:
domainalone = config['mwapi'].split(
'://')[0] + '://' + config['mwapi'].split('://')[1].split('/')[0]
else:
        sys.stderr.write('ERROR: neither index nor API URL is available\n')
sys.exit()
if url.startswith('//'): # Orain wikifarm returns URLs starting with //
url = '%s:%s' % (domainalone.split('://')[0], url)
# is it a relative URL?
elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')):
if url[0] == '/': # slash is added later
url = url[1:]
# concat http(s) + domain + relative url
url = '%s/%s' % (domainalone, url)
url = wikiteam.undoHTMLEntities(text=url)
    # url = urllib.parse.unquote(url)  # do not unquote the url, it breaks
    # some URLs with odd characters
url = re.sub(' ', '_', url)
return url
def mwGeneratePageDump(config={}, pagetitles=None, start=None):
""" Generates a XML dump for page titles """
sys.stderr.write('Retrieving XML for every page from "%s"' % (start or 'start'))
header = mwGetXMLHeader(config=config)
    footer = '</mediawiki>\n'  # new line at the end
xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(config=config),
config['date'],
config['curonly'] and 'current' or 'history')
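    # e.g. a hypothetical prefix and date give 'wikiexampleorg_w-20160101-history.xml'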
xmlfile = ''
lock = True
if start:
sys.stderr.write("Removing the last chunk of past XML dump: it is probably incomplete.\n")
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header)
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for pagetitle in mwGetPageTitles(config=config, start=start):
if not pagetitle.strip():
continue
if pagetitle == start: # start downloading from start, included
lock = False
if lock:
continue
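        # resume logic: while 'lock' is set, titles are skipped until the resume
        # point ('start') is reached, so an interrupted dump picks up where it stopped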
wikiteam.delay(config=config)
if c % 10 == 0:
sys.stderr.write('Downloaded %d pages\n' % (c))
try:
            for xml in mwGetXMLPage(config=config, title=pagetitle):
                xml = mwCleanXML(xml=xml)
                xmlfile.write(xml)
        except PageMissingError:
            wikiteam.logerror(
                config=config,
                text='The page "%s" was missing in the wiki (probably deleted)' %
                (pagetitle))
# here, XML is a correct chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
xmlfile.write(footer)
xmlfile.close()
sys.stderr.write('XML dump saved at... %s\n' % (xmlfilename))
def mwGetAPI(config={}):
""" Returns API for a MediaWiki wiki, if available """
api = ''
html = wikiteam.getURL(url=config['wiki'])
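    # the api.php endpoint is advertised in the page head as an RSD discovery
    # link; a hypothetical example of what the regex below matches:
    # <link rel="EditURI" type="application/rsd+xml" href="//wiki.example.org/w/api.php?action=rsd"/>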
m = re.findall(
r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
html)
if m:
api = m[0]
if api.startswith('//'): # gentoo wiki and others
            api = config['wiki'].split('//')[0] + api
return api
def mwGetImageNames(config={}):
""" Get list of image names """
sys.stderr.write('Retrieving image filenames\n')
imagenames = []
if 'mwapi' in config and config['mwapi']:
imagenames = mwGetImageNamesAPI(config=config)
elif 'mwindex' in config and config['mwindex']:
imagenames = mwGetImageNamesScraper(config=config)
# imagenames = list(set(imagenames)) # it is a list of lists
imagenames.sort()
sys.stderr.write('%d image names loaded\n' % (len(imagenames)))
return imagenames
def mwGetImageNamesAPI(config={}):
""" Retrieve file list: filename, url, uploader """
oldAPI = False
aifrom = '!'
imagenames = []
while aifrom:
sys.stderr.write('.') # progress
data = {
'action': 'query',
'list': 'allimages',
'aiprop': 'url|user',
'aifrom': aifrom,
'format': 'json',
'ailimit': 500}
# FIXME Handle HTTP Errors HERE
r = wikiteam.getURL(url=config['mwapi'], data=data)
#handleStatusCode(r)
jsonimages = wikiteam.getJSON(r)
wikiteam.delay(config=config)
if 'query' in jsonimages:
aifrom = ''
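            # older MediaWiki releases report pagination under 'query-continue',
            # newer ones (roughly 1.21+) under 'continue'; both formats are handled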
if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']:
if 'aicontinue' in jsonimages['query-continue']['allimages']:
aifrom = jsonimages['query-continue']['allimages']['aicontinue']
elif 'aifrom' in jsonimages['query-continue']['allimages']:
aifrom = jsonimages['query-continue']['allimages']['aifrom']
elif 'continue' in jsonimages:
if 'aicontinue' in jsonimages['continue']:
aifrom = jsonimages['continue']['aicontinue']
elif 'aifrom' in jsonimages['continue']:
aifrom = jsonimages['continue']['aifrom']
# sys.stderr.write(aifrom)
for image in jsonimages['query']['allimages']:
url = image['url']
url = mwCurateImageURL(config=config, url=url)
# encoding to ascii is needed to work around this horrible bug:
# http://bugs.python.org/issue8136
if 'mwapi' in config and '.wikia.com' in config['mwapi']:
#to avoid latest?cb=20120816112532 in filenames
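                    # e.g. for a hypothetical Wikia URL ending in
                    # .../images/a/ab/Example.png/revision/latest?cb=20120816112532
                    # the real filename is the third-to-last path segment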
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')
else:
filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
uploader = re.sub('_', ' ', image['user'])
imagenames.append([filename, url, uploader])
else:
oldAPI = True
break
if oldAPI:
gapfrom = '!'
imagenames = []
while gapfrom:
sys.stderr.write('.') # progress
        # Some old APIs don't have the allimages list
        # In that case, use allpages in namespace 6 (File:) as a generator for imageinfo
# Example:
# http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
# &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
data = {
'action': 'query',
'generator': 'allpages',
'gapnamespace': 6,
'gaplimit': 500,
'gapfrom': gapfrom,
'prop': 'imageinfo',
'iiprop': 'user|url',
'format': 'json'}
# FIXME Handle HTTP Errors HERE
r = wikiteam.getURL(url=config['mwapi'], data=data)
#handleStatusCode(r)
jsonimages = wikiteam.getJSON(r)
wikiteam.delay(config=config)
if 'query' in jsonimages:
gapfrom = ''
if 'query-continue' in jsonimages and 'allpages' in jsonimages['query-continue']:
if 'gapfrom' in jsonimages['query-continue']['allpages']:
gapfrom = jsonimages['query-continue']['allpages']['gapfrom']
for image, props in jsonimages['query']['pages'].items():
url = props['imageinfo'][0]['url']
url = mwCurateImageURL(config=config, url=url)
tmp_filename = ':'.join(props['title'].split(':')[1:])
filename = re.sub('_', ' ', tmp_filename)
uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
imagenames.append([filename, url, uploader])
else:
# if the API doesn't return query data, then we're done
break
if len(imagenames) == 1:
sys.stderr.write(' Found 1 image')
else:
sys.stderr.write(' Found %d images' % (len(imagenames)))
return imagenames
def mwGetImageNamesScraper(config={}):
""" Retrieve file list: filename, url, uploader """
# (?\d+)&'
imagenames = []
offset = '29990101000000' # january 1, 2999
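    # the offset is a MediaWiki timestamp (YYYYMMDDHHMMSS); starting at year 2999
    # presumably covers every upload, and Special:Imagelist is paged back from there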
limit = 5000
retries = config['retries']
while offset:
        # a limit of 5000 overloads some servers, but it is needed for sites
        # with no "next" links, such as:
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        data = {
'title': 'Special:Imagelist',
'limit': limit,
'offset': offset}
        raw = wikiteam.getURL(url=config['mwindex'], data=data)
#handleStatusCode(r)
wikiteam.delay(config=config)
# delicate wiki
if re.search(r'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
if limit > 10:
                sys.stderr.write('Error: listing %d images in a chunk is not possible, trying tiny chunks\n' % (limit))
                limit = limit // 10
continue
elif retries > 0: # waste retries, then exit
retries -= 1
sys.stderr.write('Retrying...')
continue
else:
sys.stderr.write('No more retries, exit...')
break
raw = mwCleanHTML(raw)
# archiveteam 1.15.1