commit 4b483c695b (pull/319/head) by emijrp

@@ -39,17 +39,22 @@ except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import os
import re
import subprocess
try:
import requests
except ImportError:
print "Please install or update the Requests module."
sys.exit(1)
try:
import wikitools
except ImportError:
print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
import time
import urllib
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
__VERSION__ = '0.3.0-alpha' # major, minor, micro: semver.org
__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
class PageMissingError(Exception):
def __init__(self, title, xml):
@@ -150,7 +155,7 @@ def getNamespacesScraper(config={}, session=None):
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
r = session.post(
url=config['index'], data={'title': 'Special:Allpages'})
url=config['index'], data={'title': 'Special:Allpages'}, timeout=30)
raw = r.text
delay(config=config, session=session)
@@ -191,7 +196,8 @@ def getNamespacesAPI(config={}, session=None):
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
'format': 'json'},
timeout=30
)
result = getJSON(r)
delay(config=config, session=session)
@@ -249,7 +255,7 @@ def getPageTitlesAPI(config={}, session=None):
retryCount = 0
while retryCount < config["retries"]:
try:
r = session.post(url=config['api'], data=params)
r = session.post(url=config['api'], data=params, timeout=30)
break
except ConnectionError as err:
print "Connection error: %s" % (str(err),)
@@ -301,7 +307,7 @@ def getPageTitlesScraper(config={}, session=None):
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (
config['index'], namespace)
r = session.get(url=url)
r = session.get(url=url, timeout=30)
raw = r.text
raw = cleanHTML(raw)
@@ -353,7 +359,7 @@ def getPageTitlesScraper(config={}, session=None):
# to avoid reloading duplicate subpage links
checked_suballpages.append(name)
delay(config=config, session=session)
r2 = session.get(url=url)
r2 = session.get(url=url, timeout=10)
raw2 = r2.text
raw2 = cleanHTML(raw2)
rawacum += raw2 # merge it after removed junk
@@ -386,7 +392,7 @@ def getPageTitles(config={}, session=None):
titles = []
if 'api' in config and config['api']:
r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'}, timeout=30)
test = getJSON(r)
if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
@@ -436,33 +442,46 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
xml = None
try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
xml = r.text
except requests.exceptions.RetryError:
pass
if not xml:
r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
xml = r.json()['query']['export']['*']
else:
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
pass
try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'},
timeout=120
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml):
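
For readers skimming the new --xmlrevisions branch above: a minimal, self-contained sketch of the same header probe using plain requests (the api.php URL is a placeholder, not taken from this diff):

import requests

def fetch_export_header(api='https://example.org/w/api.php'):
    # Ask the API to export any one revision; only the <siteinfo> header matters.
    session = requests.Session()
    r = session.get(api + '?action=query&revids=1&export&exportnowrap', timeout=10)
    xml = r.text
    if not xml.lstrip().startswith('<mediawiki'):
        # Some wikis ignore exportnowrap; fall back to the JSON-wrapped export.
        r = session.get(api + '?action=query&revids=1&export&format=json', timeout=10)
        xml = r.json()['query']['export']['*']
    return xml.split('</mediawiki>')[0]
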
@@ -512,7 +531,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['pages'], wait)
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# limit = 1 from mother function)
@@ -521,6 +540,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c >= maxretries:
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
if config['failfast']:
print "Exiting, this wiki can be retried another time"
sys.exit()
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
@@ -550,7 +572,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
r = session.post(url=config['index'], data=params, headers=headers)
r = session.post(url=config['index'], data=params, headers=headers, timeout=10)
handleStatusCode(r)
xml = fixBOM(r)
except requests.exceptions.ConnectionError as e:
@@ -675,10 +697,9 @@ def cleanXML(xml=''):
def generateXMLDump(config={}, titles=[], start=None, session=None):
""" Generates an XML dump for a list of titles """
""" Generates an XML dump for a list of titles or from revision IDs """
# TODO: titles is now unused.
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header, config = getXMLHeader(config=config, session=session)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
@@ -686,48 +707,106 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
config['curonly'] and 'current' or 'history')
xmlfile = ''
lock = True
if start:
print "Removing the last chunk of past XML dump: it is probably incomplete."
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
if config['xmlrevisions']:
print 'Retrieving the XML for every page from the beginning'
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
r_timestamp = r'<timestamp>([^<]+)</timestamp>'
for xml in getXMLRevisions(config=config, session=session):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, this count may be lower than the arvlimit batch size
print "%d more revisions exported" % numrevs
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
except AttributeError:
print "This wikitools module version is not working"
sys.exit()
else:
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
if start:
print "Removing the last chunk of past XML dump: it is probably incomplete."
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
c += 1
xmlfile.write(footer)
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def getXMLRevisions(config={}, session=None):
site = wikitools.wiki.Wiki(config['api'])
#if config['namespaces']:
# namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
#else:
namespaces = ['*']
for namespace in namespaces:
print "Exporting revisions from namespace %s" % namespace
# TODO: 500 would be nicer, but need to find the wiki's limits
params = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 50,
'arvprop': 'ids',
}
request = wikitools.api.APIRequest(site, params)
results = request.queryGen()
try:
for result in results:
revids = []
for page in result['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
print "50 more revisions listed, until %s" % revids[-1]
exportparams = {
'action': 'query',
'revids': '|'.join(revids),
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
sys.exit()
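
getXMLRevisions above depends on wikitools; as a reference for environments without it, here is a rough requests-only sketch of the same allrevisions listing plus export-by-revids loop (standard MediaWiki API continuation assumed, placeholder URL, not part of the patch):

import requests

def iterRevisionsXML(api='https://example.org/w/api.php', session=None):
    # List revision IDs in batches of 50, then ask the API to export each batch.
    session = session or requests.Session()
    params = {'action': 'query', 'list': 'allrevisions',
              'arvlimit': 50, 'arvprop': 'ids', 'format': 'json'}
    while True:
        data = session.get(api, params=params, timeout=30).json()
        revids = [str(rev['revid'])
                  for page in data['query']['allrevisions']
                  for rev in page['revisions']]
        if revids:
            export = session.get(api, params={'action': 'query',
                                              'revids': '|'.join(revids),
                                              'export': 1, 'format': 'json'},
                                 timeout=30).json()
            yield export['query']['export']['*']
        if 'continue' not in data:
            break
        params.update(data['continue'])
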
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
@@ -866,7 +945,8 @@ def getImageNamesScraper(config={}, session=None):
data={
'title': 'Special:Imagelist',
'limit': limit,
'offset': offset})
'offset': offset},
timeout=30)
raw = r.text
delay(config=config, session=session)
# delicate wiki
@@ -967,7 +1047,7 @@ def getImageNamesAPI(config={}, session=None):
'format': 'json',
'ailimit': 500}
# FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params)
r = session.post(url=config['api'], data=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@@ -1025,7 +1105,7 @@ def getImageNamesAPI(config={}, session=None):
'iiprop': 'user|url',
'format': 'json'}
# FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params)
r = session.post(url=config['api'], data=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@@ -1112,10 +1192,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
# saving description if any
try:
title = u'Image:%s' % (filename)
xmlfiledesc = getXMLFileDesc(
config=config,
title=title,
session=session) # use Image: for backwards compatibility
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
xmlfiledesc = r.text
else:
xmlfiledesc = getXMLFileDesc(
config=config,
title=title,
session=session) # use Image: for backwards compatibility
except PageMissingError:
xmlfiledesc = ''
logerror(
@@ -1300,7 +1384,10 @@ def getParameters(params=[]):
action='store_true',
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages')
help='store only the current version of pages; incompatible with --xmlrevisions')
groupDownload.add_argument('--xmlrevisions', action='store_true',
help='download all revisions from an API generator. Ignores the \
namespace selection')
groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump")
groupDownload.add_argument(
@@ -1320,6 +1407,10 @@ def getParameters(params=[]):
'--get-wiki-engine',
action='store_true',
help="returns the wiki engine")
groupMeta.add_argument(
'--failfast',
action='store_true',
help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.")
args = parser.parse_args()
# print args
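
For orientation, the two new switches introduced above would typically be combined with an existing invocation along these lines (script and wiki names are placeholders, not taken from this diff):

    python dumpgenerator.py --api=https://example.org/w/api.php --xml --xmlrevisions --failfast
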
@@ -1351,11 +1442,22 @@ def getParameters(params=[]):
print 'Using cookies from %s' % args.cookies
session = requests.Session()
try:
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
# Courtesy datashaman https://stackoverflow.com/a/35504626
__retries__ = Retry(total=5,
backoff_factor=2,
status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=__retries__))
session.mount('http://', HTTPAdapter(max_retries=__retries__))
except:
# Our urllib3/requests is too old
pass
session.cookies = cj
session.headers.update({'User-Agent': getUserAgent()})
if args.user and args.password:
session.auth = (args.user, args.password)
# session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
# check URLs
for url in [args.api, args.index, args.wiki]:
@@ -1484,10 +1586,12 @@ def getParameters(params=[]):
'curonly': args.curonly,
'date': datetime.datetime.now().strftime('%Y%m%d'),
'api': api,
'failfast': args.failfast,
'index': index,
'images': args.images,
'logs': False,
'xml': args.xml,
'xmlrevisions': args.xmlrevisions,
'namespaces': namespaces,
'exnamespaces': exnamespaces,
'path': args.path and os.path.normpath(args.path) or '',
@@ -1521,7 +1625,8 @@ def checkAPI(api=None, session=None):
data={
'action': 'query',
'meta': 'siteinfo',
'format': 'json'}
'format': 'json'},
timeout=30
)
if r.url == api:
break
@@ -1532,7 +1637,7 @@ def checkAPI(api=None, session=None):
try:
result = getJSON(r)
index = None
if result['query']:
if result:
try:
index = result['query']['general']['server'] + \
result['query']['general']['script']
@@ -1549,7 +1654,7 @@
def checkIndex(index=None, cookies=None, session=None):
""" Checking index.php availability """
r = session.post(url=index, data={'title': 'Special:Version'})
r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
raw = r.text
print 'Checking index.php...', index
# Workaround for issue 71
@@ -1811,7 +1916,7 @@ def saveSpecialVersion(config={}, session=None):
else:
print 'Downloading Special:Version with extensions and other related info'
r = session.post(
url=config['index'], data={'title': 'Special:Version'})
url=config['index'], data={'title': 'Special:Version'}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@@ -1826,7 +1931,7 @@ def saveIndexPHP(config={}, session=None):
print 'index.html exists, do not overwrite'
else:
print 'Downloading index.php (Main Page) as index.html'
r = session.post(url=config['index'], data={})
r = session.post(url=config['index'], data={}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@@ -1851,7 +1956,8 @@ def saveSiteInfo(config={}, session=None):
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
'sinumberingroup': 1,
'format': 'json'})
'format': 'json'},
timeout=10)
# MediaWiki 1.11-1.12
if not 'query' in getJSON(r):
r = session.post(
@@ -1860,7 +1966,8 @@ def saveSiteInfo(config={}, session=None):
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
'format': 'json'})
'format': 'json'},
timeout=10)
# MediaWiki 1.8-1.10
if not 'query' in getJSON(r):
r = session.post(
@@ -1869,7 +1976,8 @@ def saveSiteInfo(config={}, session=None):
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces',
'format': 'json'})
'format': 'json'},
timeout=10)
result = getJSON(r)
delay(config=config, session=session)
with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
@@ -1896,9 +2004,9 @@ def getWikiEngine(url=''):
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url)
r = session.post(url=url, timeout=30)
if r.status_code == 405 or r.text == '':
r = session.get(url=url)
r = session.get(url=url, timeout=120)
result = r.text
wikiengine = 'Unknown'
@@ -1981,7 +2089,7 @@ def mwGetAPIAndIndex(url=''):
index = ''
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url)
r = session.post(url=url, timeout=120)
result = r.text
# API
@@ -2042,7 +2150,10 @@ def main(params=[]):
# do not enter if resume is requested from beginning
while not other['resume'] and os.path.isdir(config['path']):
print '\nWarning!: "%s" path exists' % (config['path'])
reply = ''
if config['failfast']:
reply = 'yes'
else:
reply = ''
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input(
'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %
