#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2016 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com

import json
import os
import re
import sys
import time
import urllib.parse

import wikiteam

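
# Minimal exception types for the XML export helpers below. They are
# referenced throughout this module; these definitions are an assumption
# (upstream may provide richer versions elsewhere in wikiteam).
class PageMissingError(Exception):
    """ Raised when Special:Export returns no <page> for a title """

    def __init__(self, title='', xml=''):
        self.title = title
        self.xml = xml


class ExportAbortedError(Exception):
    """ Raised when the XML export cannot proceed on this wiki """

    def __init__(self, index=''):
        self.index = index
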
def mwCleanHTML(raw=''):
    """ Extract only the real wiki content and remove rubbish.

    This function is ONLY used to retrieve page titles and file names
    when no API is available. DO NOT use it to extract page content. """

    # different "tags" used by different MediaWiki versions to mark where
    # content starts and ends
    if re.search('<!-- bodytext -->', raw):
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
        raw = raw.split(
            '<!-- start content -->')[1].split('<!-- end content -->')[0]
    elif re.search('<!-- Begin Content Area -->', raw):
        raw = raw.split(
            '<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
    elif re.search('<!-- content -->', raw):
        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
    elif re.search('<body class=', raw):
        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
    else:
        sys.stderr.write(raw[:250])
        sys.stderr.write('This wiki doesn\'t use marks to split content\n')
        sys.exit()
    return raw

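
def fixBOM(text=''):
    """ Strip a leading UTF-8 byte order mark, if any """
    # Minimal sketch used by mwGetXMLPageCore() below; it assumes
    # wikiteam.getURL() returns already-decoded text rather than a
    # response object.
    if text.startswith('\ufeff'):
        text = text[1:]
    return text
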
def mwCleanXML(xml=''):
    """ Trim redundant info from the XML """

    # do not touch the XML encoding, leave it as is
    if re.search(r'</siteinfo>\n', xml):
        xml = xml.split('</siteinfo>\n')[1]
    if re.search(r'</mediawiki>', xml):
        xml = xml.split('</mediawiki>')[0]
    return xml

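
def logerror(config={}, text=''):
    """ Log an error in the errors file """
    # Minimal sketch of the error logger the dump functions below rely on;
    # the errors.log filename and the timestamp format are assumptions
    # modelled on WikiTeam's dumpgenerator.py.
    if text:
        with open('%s/errors.log' % (config['path']), 'a') as outfile:
            outfile.write('%s: %s\n' % (
                time.strftime('%Y-%m-%d %H:%M:%S'), text))
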
def mwCreateNewDump(config={}):
    sys.stderr.write('Trying to generate a new dump into a new directory...\n')
    if config['pages']:
        pagetitles = mwGetPageTitles(config=config)
        wikiteam.savePageTitles(config=config, pagetitles=pagetitles)
        mwGeneratePageDump(config=config, pagetitles=pagetitles)
        mwCheckXMLIntegrity(config=config, pagetitles=pagetitles)
    if config['images']:
        imagenames = mwGetImageNames(config=config)
        mwSaveImageNames(config=config, imagenames=imagenames)
        mwGenerateImageDump(config=config, imagenames=imagenames)
    if config['logs']:
        mwSaveLogs(config=config)
    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)

def mwCurateImageURL(config={}, url=''):
    """ Returns an absolute URL for an image, adding the domain if missing """

    if 'mwindex' in config and config['mwindex']:
        # keep scheme and domain only: everything from :// up to the
        # first / after the domain
        domainalone = config['mwindex'].split(
            '://')[0] + '://' + config['mwindex'].split('://')[1].split('/')[0]
    elif 'mwapi' in config and config['mwapi']:
        domainalone = config['mwapi'].split(
            '://')[0] + '://' + config['mwapi'].split('://')[1].split('/')[0]
    else:
        sys.stderr.write('ERROR: no index nor API\n')
        sys.exit()

    if url.startswith('//'):  # Orain wikifarm returns URLs starting with //
        url = '%s:%s' % (domainalone.split('://')[0], url)
    # is it a relative URL?
    elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')):
        if url[0] == '/':  # slash is added later
            url = url[1:]
        # concat http(s) + domain + relative url
        url = '%s/%s' % (domainalone, url)
    url = wikiteam.undoHTMLEntities(text=url)
    # do not use urllib.parse.unquote() on the URL; it breaks some URLs
    # with odd characters
    url = re.sub(' ', '_', url)

    return url

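
# A worked example of mwCurateImageURL() (hypothetical wiki URL):
#   config = {'mwindex': 'http://wiki.example.org/index.php'}
#   mwCurateImageURL(config=config, url='/images/a/ab/Foo bar.png')
#   returns 'http://wiki.example.org/images/a/ab/Foo_bar.png'
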
def mwGeneratePageDump(config={}, pagetitles=None, start=None):
    """ Generates an XML dump for page titles """

    sys.stderr.write('Retrieving XML for every page from "%s"\n' % (start or 'start'))
    header = mwGetXMLHeader(config=config)
    footer = '</mediawiki>\n'  # new line at the end
    xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(config=config),
                                    config['date'],
                                    config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True
    if start:
        sys.stderr.write("Removing the last chunk of past XML dump: it is probably incomplete.\n")
        for i in wikiteam.reverseReadline('%s/%s' % (config['path'], xmlfilename), truncate=True):
            pass
    else:
        # requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.close()

    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for pagetitle in mwReadPageTitles(config=config, start=start):
        if not pagetitle.strip():
            continue
        if pagetitle == start:  # start downloading from start, included
            lock = False
        if lock:
            continue
        wikiteam.delay(config=config)
        if c % 10 == 0:
            sys.stderr.write('Downloaded %d pages\n' % (c))
        try:
            for xml in mwGetXMLPage(config=config, pagetitle=pagetitle):
                xml = mwCleanXML(xml=xml)
                xmlfile.write(xml)
        except PageMissingError:
            logerror(
                config=config,
                text='The page "%s" was missing in the wiki (probably deleted)' %
                (pagetitle))
        # here, XML is a correct <page> </page> chunk or
        # an empty string due to a deleted page (logged in errors log) or
        # an empty string due to an error while retrieving the page from server
        # (logged in errors log)
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    sys.stderr.write('XML dump saved at... %s\n' % (xmlfilename))

def mwGetAPI(config={}):
    """ Returns API for a MediaWiki wiki, if available """

    api = ''
    html = wikiteam.getURL(url=config['wiki'])
    m = re.findall(
        r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
        html)
    if m:
        api = m[0]
        if api.startswith('//'):  # gentoo wiki and others
            api = config['wiki'].split('//')[0] + api
    return api

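
# The EditURI <link> the function above looks for appears in the HTML <head>
# of MediaWiki pages, e.g. (hypothetical wiki):
#   <link rel="EditURI" type="application/rsd+xml"
#         href="http://wiki.example.org/w/api.php?action=rsd"/>
# from which mwGetAPI() extracts http://wiki.example.org/w/api.php
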
def mwGetImageNames(config={}):
    """ Get list of image names """

    sys.stderr.write('Retrieving image filenames\n')
    imagenames = []
    if 'mwapi' in config and config['mwapi']:
        imagenames = mwGetImageNamesAPI(config=config)
    elif 'mwindex' in config and config['mwindex']:
        imagenames = mwGetImageNamesScraper(config=config)
    # imagenames = list(set(imagenames)) # it is a list of lists
    imagenames.sort()
    sys.stderr.write('%d image names loaded\n' % (len(imagenames)))
    return imagenames

def mwGetImageNamesAPI(config={}):
    """ Retrieve file list: filename, url, uploader """

    oldAPI = False
    aifrom = '!'
    imagenames = []
    while aifrom:
        sys.stderr.write('.')  # progress
        data = {
            'action': 'query',
            'list': 'allimages',
            'aiprop': 'url|user',
            'aifrom': aifrom,
            'format': 'json',
            'ailimit': 500}
        # FIXME Handle HTTP Errors HERE
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        #handleStatusCode(r)
        jsonimages = wikiteam.getJSON(r)
        wikiteam.delay(config=config)

        if 'query' in jsonimages:
            aifrom = ''
            if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']:
                if 'aicontinue' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aicontinue']
                elif 'aifrom' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aifrom']
            elif 'continue' in jsonimages:
                if 'aicontinue' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aicontinue']
                elif 'aifrom' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aifrom']
            # sys.stderr.write(aifrom)

            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = mwCurateImageURL(config=config, url=url)
                if 'mwapi' in config and '.wikia.com' in config['mwapi']:
                    # to avoid latest?cb=20120816112532 in filenames
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3]))
                else:
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1]))
                uploader = re.sub('_', ' ', image['user'])
                imagenames.append([filename, url, uploader])
        else:
            oldAPI = True
            break

    if oldAPI:
        gapfrom = '!'
        imagenames = []
        while gapfrom:
            sys.stderr.write('.')  # progress
            # Some old APIs don't have the allimages query
            # In that case use allpages (in namespace 6) as a generator for imageinfo
            # Example:
            # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
            # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            data = {
                'action': 'query',
                'generator': 'allpages',
                'gapnamespace': 6,
                'gaplimit': 500,
                'gapfrom': gapfrom,
                'prop': 'imageinfo',
                'iiprop': 'user|url',
                'format': 'json'}
            # FIXME Handle HTTP Errors HERE
            r = wikiteam.getURL(url=config['mwapi'], data=data)
            #handleStatusCode(r)
            jsonimages = wikiteam.getJSON(r)
            wikiteam.delay(config=config)

            if 'query' in jsonimages:
                gapfrom = ''
                if 'query-continue' in jsonimages and 'allpages' in jsonimages['query-continue']:
                    if 'gapfrom' in jsonimages['query-continue']['allpages']:
                        gapfrom = jsonimages['query-continue']['allpages']['gapfrom']

                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = mwCurateImageURL(config=config, url=url)
                    tmp_filename = ':'.join(props['title'].split(':')[1:])
                    filename = re.sub('_', ' ', tmp_filename)
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    imagenames.append([filename, url, uploader])
            else:
                # if the API doesn't return query data, then we're done
                break

    if len(imagenames) == 1:
        sys.stderr.write(' Found 1 image\n')
    else:
        sys.stderr.write(' Found %d images\n' % (len(imagenames)))

    return imagenames

def mwGetImageNamesScraper(config={}):
    """ Retrieve file list: filename, url, uploader """

    # (?<! http://docs.python.org/library/re.html
    r_next = r'(?<!&dir=prev)&offset=(?P<offset>\d+)&'
    imagenames = []
    offset = '29990101000000'  # january 1, 2999
    limit = 5000
    retries = config['retries']
    while offset:
        # a limit of 5000 overloads some servers, but it is needed for sites
        # like this one, which have no "next" links
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        data = {
            'title': 'Special:Imagelist',
            'limit': limit,
            'offset': offset}
        raw = wikiteam.getURL(url=config['mwindex'], data=data)
        #handleStatusCode(r)
        wikiteam.delay(config=config)
        # delicate wiki
        if re.search(r'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
            if limit > 10:
                sys.stderr.write('Error: listing %d images in a chunk is not possible, trying tiny chunks\n' % (limit))
                limit = limit // 10
                continue
            elif retries > 0:  # waste retries, then exit
                retries -= 1
                sys.stderr.write('Retrying...\n')
                continue
            else:
                sys.stderr.write('No more retries, exit...\n')
                break

        raw = mwCleanHTML(raw)
        # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
        # href="/w/index.php?title=Usuario:Fernandocg&action=edit&redlink=1"
        # class="new" title="Usuario:Fernandocg (página no
        # existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # wikijuegos 1.9.5
        # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
        # mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # gentoowiki 1.18
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
        r_images5 = (
            r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
            r'<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
            r'<td class="TablePager_col_img_size">[^<]*?</td>\s*'
            r'<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')

        # Select the regexp that returns the most results
        regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
        count = 0
        i = 0
        regexp_best = 0
        for regexp in regexps:
            if len(re.findall(regexp, raw)) > count:
                count = len(re.findall(regexp, raw))
                regexp_best = i
            i += 1
        m = re.compile(regexps[regexp_best]).finditer(raw)

        # Iterate over the image results
        for i in m:
            url = i.group('url')
            url = mwCurateImageURL(config=config, url=url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = wikiteam.undoHTMLEntities(text=filename)
            filename = urllib.parse.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = wikiteam.undoHTMLEntities(text=uploader)
            uploader = urllib.parse.unquote(uploader)
            imagenames.append([filename, url, uploader])

        if re.search(r_next, raw):
            new_offset = re.findall(r_next, raw)[0]
            # Avoid infinite loop
            if new_offset != offset:
                offset = new_offset
                retries += 5  # add more retries if we got a page with offset
            else:
                offset = ''
        else:
            offset = ''

    if len(imagenames) == 1:
        sys.stderr.write(' Found 1 image\n')
    else:
        sys.stderr.write(' Found %d images\n' % (len(imagenames)))

    imagenames.sort()
    return imagenames

def mwGetIndex(config={}):
    """ Returns index.php for a MediaWiki wiki, if available """

    if config['mwapi']:
        mwapi = config['mwapi']
    else:
        mwapi = mwGetAPI(config=config)
    index = ''
    html = wikiteam.getURL(url=config['wiki'])
    m = re.findall(r'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
    if m:
        index = m[0]
    else:
        m = re.findall(r'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
        if m:
            index = m[0]
    if index:
        if index.startswith('/'):
            index = '/'.join(mwapi.split('/')[:-1]) + '/' + index.split('/')[-1]
    else:
        if mwapi:
            if len(re.findall(r'/index\.php5\?', html)) > len(re.findall(r'/index\.php\?', html)):
                index = '/'.join(mwapi.split('/')[:-1]) + '/index.php5'
            else:
                index = '/'.join(mwapi.split('/')[:-1]) + '/index.php'
    return index

def mwGetNamespaces(config={}):
    """ Get list of namespaces """

    sys.stderr.write('Retrieving namespaces\n')
    namespaces = []
    namespacenames = []
    if 'mwapi' in config and config['mwapi']:
        namespaces, namespacenames = mwGetNamespacesAPI(config=config)
    elif 'mwindex' in config and config['mwindex']:
        namespaces, namespacenames = mwGetNamespacesScraper(config=config)
    namespaces.sort()
    sys.stderr.write('%d namespaces loaded\n' % (len(namespaces)))
    return namespaces, namespacenames

def mwGetNamespacesAPI(config={}):
    """ Uses the API to get the list of namespace names and ids """

    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        data = {'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'namespaces',
                'format': 'json'}
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        result = wikiteam.getJSON(r)
        wikiteam.delay(config=config)
        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # Skipping -1: Special, -2: Media
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:
                    continue
                if int(i) in namespaces:
                    namespaces2.append(int(i))
                    namespacenames[int(i)] = result['query']['namespaces'][i]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames

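
# Abridged shape of the siteinfo response parsed above (the '*' key holds
# the local namespace name; the main namespace has an empty name):
#   {"query": {"namespaces": {
#       "0":  {"id": 0,  "*": ""},
#       "1":  {"id": 1,  "*": "Talk"},
#       "-1": {"id": -1, "*": "Special"}}}}
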
def mwGetNamespacesScraper(config={}):
    """ Hackishly gets the list of namespace names and ids from the dropdown
    in the HTML of Special:AllPages. Only called when no API is available. """

    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        raw = wikiteam.getURL(url=config['mwindex'], data={'title': 'Special:Allpages'})
        wikiteam.delay(config=config)

        # [^>]*? to include selected="selected"
        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames

def mwGetPageTitles(config={}):
    """ Get list of page titles """
    # http://en.wikipedia.org/wiki/Special:AllPages
    # http://archiveteam.org/index.php?title=Special:AllPages
    # http://www.wikanda.es/wiki/Especial:Todas
    sys.stderr.write('Loading page titles from namespaces = %s\n' % (','.join([str(i) for i in config['namespaces']]) or 'None'))
    sys.stderr.write('Excluding titles from namespaces = %s\n' % (','.join([str(i) for i in config['exnamespaces']]) or 'None'))

    if 'mwapi' in config and config['mwapi']:
        for pagetitle in mwGetPageTitlesAPI(config=config):
            yield pagetitle
    elif 'mwindex' in config and config['mwindex']:
        for pagetitle in mwGetPageTitlesScraper(config=config):
            yield pagetitle

def mwGetPageTitlesAPI(config={}):
    """ Uses the API to get the list of page titles """

    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesAPI(config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            sys.stderr.write(' Skipping namespace = %d\n' % (namespace))
            continue

        c = 0
        sys.stderr.write(' Retrieving page titles in namespace %d\n' % (namespace))
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            data = {
                'action': 'query',
                'list': 'allpages',
                'apnamespace': namespace,
                'apfrom': apfrom.encode('utf-8'),
                'format': 'json',
                'aplimit': 500}
            retryCount = 0
            while retryCount < config["retries"]:
                try:
                    r = wikiteam.getURL(url=config['mwapi'], data=data)
                    break
                except ConnectionError as err:
                    sys.stderr.write("Connection error: %s\n" % (str(err),))
                    retryCount += 1
                    time.sleep(20)
            #wikiteam.handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = wikiteam.getJSON(r)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            elif 'continue' in jsontitles:
                if 'apcontinue' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apcontinue']
                elif 'apfrom' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apfrom']

            # sys.stderr.write(apfrom)
            # sys.stderr.write(jsontitles)
            allpages = jsontitles['query']['allpages']
            # Hack for old versions of MediaWiki API where result is dict
            if isinstance(allpages, dict):
                allpages = allpages.values()
            for page in allpages:
                pagetitles.append(page['title'])
                yield page['title']
            c += len(allpages)

            if len(pagetitles) != len(set(pagetitles)):
                # Are we in a loop? Server returning dupes, stop it
                sys.stderr.write('Probably a loop, finishing\n')
                apfrom = ''

            wikiteam.delay(config=config)
        sys.stderr.write(' %d titles retrieved in namespace %d\n' % (c, namespace))

def mwGetPageTitlesScraper(config={}):
    """ Scrape list of page titles from Special:Allpages """

    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesScraper(config=config)
    for namespace in namespaces:
        sys.stderr.write(' Retrieving titles in namespace %s\n' % (namespace))
        url = '%s?title=Special:Allpages&namespace=%s' % (config['mwindex'], namespace)
        raw = wikiteam.getURL(url=url)
        raw = mwCleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        r_suballpages3 = r'&from=(?P<from>[^>]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        elif re.search(r_suballpages3, raw):
            r_suballpages = r_suballpages3
        else:
            pass  # perhaps no subpages

        # 3 is the current depth of English Wikipedia for Special:Allpages
        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            # load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')

                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['mwindex'], namespace, fr, to)  # do not URL-quote fr or to
                # FIXME: does this regexp fail to load some entries, or does
                # r_title fail on this kind of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &namespace=\d, sometimes happens
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
                        config['mwindex'], name, namespace)
                elif r_suballpages == r_suballpages3:
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                        config['mwindex'], name, namespace)

                if name not in checked_suballpages:
                    # to avoid reloading dupe subpages links
                    checked_suballpages.append(name)
                    wikiteam.delay(config=config)
                    raw2 = wikiteam.getURL(url=url)
                    raw2 = mwCleanHTML(raw2)
                    rawacum += raw2  # merge it after removing junk
                    sys.stderr.write(' Reading %s, %s bytes, %d subpages, %d pages\n' % (name, len(raw2), \
                        len(re.findall(r_suballpages, raw2)), \
                        len(re.findall(r_title, raw2))))

            wikiteam.delay(config=config)
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = wikiteam.undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if t not in pagetitles:
                    pagetitles.append(t)
                    c += 1
        sys.stderr.write(' %d titles retrieved in namespace %d\n' % (c, namespace))
    return pagetitles

def mwGetXMLHeader(config={}):
    """ Retrieve a random page to extract the XML header (namespace info, etc) """

    pagetitle = 'Main_Page'
    xml = ''
    try:
        xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
    except PageMissingError as pme:
        # The <page> does not exist. Not a problem, if we get the <siteinfo>.
        xml = pme.xml
    except ExportAbortedError:
        # Issue 26: Account for missing "Special" namespace.
        # Hope the canonical special name has not been removed.
        # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
        try:
            if config['mwapi']:
                sys.stderr.write("Trying the local name for the Special namespace instead\n")
                xml = "".join([x for x in mwGetXMLPage(config=config, pagetitle=pagetitle, verbose=False)])
        except PageMissingError as pme:
            xml = pme.xml
        except ExportAbortedError:
            pass

    header = xml.split('</mediawiki>')[0]
    if not re.match(r"\s*<mediawiki", xml):
        sys.stderr.write('XML export on this wiki is broken, quitting.\n')
        logerror(config=config, text='XML export on this wiki is broken, quitting.')
        sys.exit()
    return header

def mwGetXMLPage(config={}, pagetitle='', verbose=True):
    """ Get the full history (or current only) of a page """

    # if server errors occur while retrieving the full page history, it may
    # return [oldest OK versions] + last version, excluding middle revisions,
    # so it would be partially truncated
    # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F

    limit = 1000
    truncated = False
    pagetitle_ = re.sub(' ', '_', pagetitle)
    # do not convert & into %26, pagetitle_ = re.sub('&', '%26', pagetitle_)
    data = {'title': config['mwexport'], 'pages': pagetitle_, 'action': 'submit'}
    if config['curonly']:
        data['curonly'] = 1
        data['limit'] = 1
    else:
        data['offset'] = '1'  # 1 is always earlier than any real timestamp
        data['limit'] = limit
    # in other case, do not set data['templates']
    if 'templates' in config and config['templates']:  # FIXME: what is this option for?
        data['templates'] = 1

    xml = mwGetXMLPageCore(config=config, data=data)
    if not xml:
        raise ExportAbortedError(config['mwindex'])
    if "</page>" not in xml:
        raise PageMissingError(data['title'], xml)
    else:
        # strip these sha1 sums which keep showing up in the export and
        # which are invalid for the XML schema (they only apply to
        # revisions)
        xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
        xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)

    yield xml.split("</page>")[0]

    # if complete history, check if this page history has > limit edits,
    # if so, retrieve all revisions using offset if available
    # else, warning about Special:Export truncating large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
    numedits = 0
    numedits += len(re.findall(r_timestamp, xml))

    # search for timestamps in xml to avoid analysing empty pages like
    # Special:Allpages and the random one
    if not config['curonly'] and re.search(r_timestamp, xml):
        while not truncated and data['offset']:  # next chunk
            # get the last timestamp from the acum XML
            # assuming history is sorted chronologically
            data['offset'] = re.findall(r_timestamp, xml)[-1]
            try:
                xml2 = mwGetXMLPageCore(config=config, data=data)
            except MemoryError:
                sys.stderr.write("Page history exceeds our memory, halving limit.\n")
                data['limit'] = data['limit'] // 2
                continue

            # are there more edits in this next XML chunk or no <page></page>?
            if re.findall(r_timestamp, xml2):
                if re.findall(r_timestamp, xml2)[-1] == data['offset']:
                    # again the same XML, this wiki does not support params in
                    # Special:Export, offer complete XML up to X edits (usually
                    # 1000)
                    sys.stderr.write('ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated\n')
                    truncated = True
                    break
                else:
                    """ </namespaces>
                      </siteinfo>
                      <page>
                        <title>Main Page</title>
                        <id>15580374</id>
                        <restrictions>edit=sysop:move=sysop</restrictions> (?)
                        <revision>
                          <id>418009832</id>
                          <timestamp>2011-03-09T19:57:06Z</timestamp>
                          <contributor>
                    """
                    # offset is OK in this wiki, merge with the previous chunk
                    # of this page history and continue
                    try:
                        xml2 = xml2.split("</page>")[0]
                        yield ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
                    except MemoryError:
                        sys.stderr.write("Page's history exceeds our memory, halving limit.\n")
                        data['limit'] = data['limit'] // 2
                        continue
                    xml = xml2
                    numedits += len(re.findall(r_timestamp, xml))
            else:
                data['offset'] = ''  # no more edits in this page history
    yield "</page>\n"

    if verbose:
        if numedits == 1:
            sys.stderr.write(' %s, 1 edit\n' % (pagetitle))
        else:
            sys.stderr.write(' %s, %d edits\n' % (pagetitle, numedits))

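
# The loop above pages through Special:Export by timestamp. A typical request
# it issues looks like this (hypothetical wiki and title):
#   index.php?title=Special:Export&pages=Main_Page&action=submit
#            &offset=2011-03-09T19:57:06Z&limit=1000
# where offset is the timestamp of the last revision already received.
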
def mwGetXMLPageCore(config={}, data={}):
    """ Returns an XML containing data['limit'] revisions (or current only), ending in </mediawiki>
        if retrieving data['limit'] revisions fails, returns current only version
        if all fail, returns empty string
    """

    xml = ''
    cretries = 0
    maxseconds = 100  # max seconds to wait in a single sleeping
    maxretries = config['retries']  # x retries and exit
    increment = 20  # increment seconds every retry

    while not re.search(r'</mediawiki>', xml):
        if cretries > 0 and cretries < maxretries:
            wait = increment * cretries < maxseconds and increment * \
                cretries or maxseconds  # incremental until maxseconds
            sys.stderr.write(' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...\n' % (cretries, data['pages'], wait))
            time.sleep(wait)
            # reducing server load requesting smallest chunks (if curonly then
            # limit = 1 from mother function)
            if data['limit'] > 1:
                data['limit'] = data['limit'] // 2  # half
        if cretries >= maxretries:
            sys.stderr.write(' We have retried %d times\n' % (cretries))
            sys.stderr.write(' MediaWiki error for "%s", probably network error...\n' % (data['pages']))
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last,
            # data['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # mwGetXMLPageCore
            if not config['curonly'] and 'curonly' not in data:
                sys.stderr.write(' Trying to save only the last revision for this page...\n')
                data['curonly'] = 1
                logerror(
                    config=config,
                    text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' %
                    (data['pages'])
                )
                return mwGetXMLPageCore(config=config, data=data)
            else:
                sys.stderr.write(' Saving in error log, skipping...\n')
                logerror(
                    config=config,
                    text='Error while retrieving last revision of "%s". Skipping.\n' %
                    (data['pages']))
                raise ExportAbortedError(config['mwindex'])
            return ''  # empty xml
        # FIXME HANDLE HTTP Errors HERE
        try:
            r = wikiteam.getURL(url=config['mwindex'], data=data)
            #handleStatusCode(r)
            xml = fixBOM(r)
        except Exception:
            sys.stderr.write(' Connection error\n')
            xml = ''
        cretries += 1

    return xml

def mwReadPageTitles(config={}, start=None):
    """ Read title list from a file, from the title "start" """

    titlesfilename = '%s-%s-titles.txt' % (
        wikiteam.domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')

    seeking = False
    if start:
        seeking = True

    with titlesfile as f:
        for line in f:
            if line.strip() == '--END--':
                break
            elif seeking and line.strip() != start:
                continue
            elif seeking and line.strip() == start:
                seeking = False
                yield line.strip()
            else:
                yield line.strip()

def mwRemoveIP(raw=''):
    """ Remove IPs from HTML comments <!-- --> """

    raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
    # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
    # weird cases as :: are not included
    raw = re.sub(
        r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}',
        '0:0:0:0:0:0:0:0',
        raw)

    return raw

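
# Example of the anonymisation above:
#   mwRemoveIP('<!-- saved by 192.168.1.5 -->')
#   returns '<!-- saved by 0.0.0.0 -->'
# Full IPv6 addresses are rewritten the same way; abbreviated forms like ::1
# are not covered, as the comment above notes.
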
def mwResumePreviousDump(config={}):
    imagenames = []
    sys.stderr.write('Resuming previous dump process...\n')
    if config['pages']:
        pagetitles = mwReadPageTitles(config=config)
        lasttitle = ''
        try:
            lasttitles = wikiteam.reverseReadline('%s/%s-%s-titles.txt' % (config['path'], wikiteam.domain2prefix(config=config), config['date']))
            lasttitle = next(lasttitles)
            if lasttitle == '':
                lasttitle = next(lasttitles)
        except Exception:
            pass  # probably file does not exist
        if lasttitle == '--END--':
            # titles list is complete
            sys.stderr.write('Title list was completed in the previous session\n')
        else:
            sys.stderr.write('Title list is incomplete. Reloading...\n')
            # do not resume, reload, to avoid inconsistencies, deleted pages or
            # so
            pagetitles = mwGetPageTitles(config=config)
            wikiteam.savePageTitles(config=config, pagetitles=pagetitles)

    # checking xml dump
    xmliscomplete = False
    lastxmltitle = None
    try:
        f = wikiteam.reverseReadline(
            '%s/%s-%s-%s.xml' %
            (config['path'],
             wikiteam.domain2prefix(config=config),
             config['date'],
             config['curonly'] and 'current' or 'history'),
        )
        for l in f:
            if l == '</mediawiki>':
                # xml dump is complete
                xmliscomplete = True
                break

            xmltitle = re.search(r'<title>([^<]+)</title>', l)
            if xmltitle:
                lastxmltitle = wikiteam.undoHTMLEntities(text=xmltitle.group(1))
                break
    except Exception:
        pass  # probably file does not exist

    if xmliscomplete:
        sys.stderr.write('XML dump was completed in the previous session\n')
    elif lastxmltitle:
        # resuming...
        sys.stderr.write('Resuming XML dump from "%s"\n' % (lastxmltitle))
        pagetitles = mwReadPageTitles(config=config, start=lastxmltitle)
        mwGeneratePageDump(
            config=config,
            pagetitles=pagetitles,
            start=lastxmltitle)
    else:
        # corrupt? only has XML header?
        sys.stderr.write('XML is corrupt? Regenerating...\n')
        pagetitles = mwReadPageTitles(config=config)
        mwGeneratePageDump(config=config, pagetitles=pagetitles)

    if config['images']:
        # load images
        lastimage = ''
        try:
            f = open('%s/%s-%s-images.txt' % (config['path'], wikiteam.domain2prefix(config=config), config['date']), 'r')
            raw = f.read().strip()
            lines = raw.split('\n')
            for l in lines:
                if re.search(r'\t', l):
                    imagenames.append(l.split('\t'))
            lastimage = lines[-1]
            f.close()
        except Exception:
            pass  # probably file does not exist
        if lastimage == '--END--':
            sys.stderr.write('Image list was completed in the previous session\n')
        else:
            sys.stderr.write('Image list is incomplete. Reloading...\n')
            # do not resume, reload, to avoid inconsistencies, deleted images or
            # so
            imagenames = mwGetImageNames(config=config)
            mwSaveImageNames(config=config, imagenames=imagenames)
        # checking images directory
        listdir = []
        try:
            listdir = os.listdir('%s/images' % (config['path']))
        except Exception:
            pass  # probably directory does not exist
        listdir.sort()
        complete = True
        lastfilename = ''
        lastfilename2 = ''
        c = 0
        for filename, url, uploader in imagenames:
            lastfilename2 = lastfilename
            # return always the complete filename, not the truncated
            lastfilename = filename
            filename2 = filename
            if len(filename2) > other['filenamelimit']:
                filename2 = truncateFilename(other=other, filename=filename2)
            if filename2 not in listdir:
                complete = False
                break
            c += 1
        sys.stderr.write('%d images were found in the directory from a previous session\n' % (c))
        if complete:
            # image dump is complete
            sys.stderr.write('Image dump was completed in the previous session\n')
        else:
            # we resume from previous image, which may be corrupted (or missing
            # .desc) by the previous session ctrl-c or abort
            mwGenerateImageDump(config=config, imagenames=imagenames, start=lastfilename2)

    if config['logs']:
        # FIXME: log dumping is not implemented yet
        pass

    mwSaveIndexPHP(config=config)
    mwSaveSpecialVersion(config=config)
    mwSaveSiteInfo(config=config)

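
def mwSaveImageNames(config={}, imagenames=[]):
    """ Save image list in a file, including filename, url and uploader """
    # A minimal sketch: mwResumePreviousDump() above expects to read back one
    # "filename<TAB>url<TAB>uploader" line per image, closed by an --END--
    # marker, so that is the format written here (the exact upstream format
    # is an assumption).
    imagesfilename = '%s-%s-images.txt' % (
        wikiteam.domain2prefix(config=config), config['date'])
    with open('%s/%s' % (config['path'], imagesfilename), 'w') as imagesfile:
        imagesfile.write('\n'.join([
            '%s\t%s\t%s' % (filename, url, uploader)
            for filename, url, uploader in imagenames]))
        imagesfile.write('\n--END--')
    sys.stderr.write('Image filenames and URLs saved at... %s\n' % (imagesfilename))
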
def mwSaveIndexPHP(config={}):
    """ Save index.php as .html, to preserve license details available at the bottom of the page """

    if os.path.exists('%s/index.html' % (config['path'])):
        sys.stderr.write('index.html exists, do not overwrite\n')
    else:
        sys.stderr.write('Downloading index.php (Main Page) as index.html\n')
        raw = wikiteam.getURL(url=config['mwindex'], data={})
        wikiteam.delay(config=config)
        raw = mwRemoveIP(raw=raw)
        with open('%s/index.html' % (config['path']), 'w') as outfile:
            outfile.write(raw)

def mwSaveSiteInfo(config={}):
    """ Save a file with site info """

    if config['mwapi']:
        if os.path.exists('%s/siteinfo.json' % (config['path'])):
            sys.stderr.write('siteinfo.json exists, do not overwrite\n')
        else:
            sys.stderr.write('Downloading site info as siteinfo.json\n')

            # MediaWiki 1.13+
            raw = wikiteam.getURL(url=config['mwapi'], data={
                'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
                'sinumberingroup': 1,
                'format': 'json'})
            wikiteam.delay(config=config)
            # MediaWiki 1.11-1.12
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['mwapi'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
                    'format': 'json'})
            # MediaWiki 1.8-1.10
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['mwapi'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces',
                    'format': 'json'})
            result = wikiteam.getJSON(raw)
            wikiteam.delay(config=config)
            with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                outfile.write(json.dumps(result, indent=4, sort_keys=True))

def mwSaveSpecialVersion(config={}):
    """ Save Special:Version as .html, to preserve extensions details """

    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        sys.stderr.write('Special:Version.html exists, do not overwrite\n')
    else:
        sys.stderr.write('Downloading Special:Version with extensions and other related info\n')
        raw = wikiteam.getURL(url=config['mwindex'], data={'title': 'Special:Version'})
        wikiteam.delay(config=config)
        raw = mwRemoveIP(raw=raw)
        with open('%s/Special:Version.html' % (config['path']), 'w') as outfile:
            outfile.write(raw)

def main():
    pass


if __name__ == "__main__":
    main()