|
|
|
@ -751,7 +751,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
|
|
|
|
|
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
|
|
|
|
|
c = 1
|
|
|
|
|
for title in readTitles(config, start):
|
|
|
|
|
if not title.strip():
|
|
|
|
|
if not title:
|
|
|
|
|
continue
|
|
|
|
|
if title == start: # start downloading from start, included
|
|
|
|
|
lock = False
|
|
|
|
@ -767,8 +767,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
|
|
|
|
|
except PageMissingError:
|
|
|
|
|
logerror(
|
|
|
|
|
config=config,
|
|
|
|
|
text=u'The page "%s" was missing in the wiki (probably deleted)' %
|
|
|
|
|
(title.decode('utf-8'))
|
|
|
|
|
text=u'The page "%s" was missing in the wiki (probably deleted)' % title
|
|
|
|
|
)
|
|
|
|
|
# here, XML is a correct <page> </page> chunk or
|
|
|
|
|
# an empty string due to a deleted page (logged in errors log) or
|
|
|
|
@ -906,7 +905,8 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
|
|
|
|
|
# End of continuation. We are done with this namespace.
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
except KeyError:
|
|
|
|
|
except (KeyError, mwclient.errors.InvalidResponse) as e:
|
|
|
|
|
print(e)
|
|
|
|
|
# TODO: check whether the KeyError was really for a missing arv API
|
|
|
|
|
print "Warning. Could not use allrevisions. Wiki too old?"
|
|
|
|
|
if config['curonly']:
|
|
|
|
@ -916,7 +916,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
|
|
|
|
|
c = 0
|
|
|
|
|
for title in readTitles(config, start=start):
|
|
|
|
|
# TODO: respect verbose flag, reuse output from getXMLPage
|
|
|
|
|
print(' {}'.format(title.strip()))
|
|
|
|
|
print(u' {}'.format(title))
|
|
|
|
|
# TODO: as we're doing one page and revision at a time, we might
|
|
|
|
|
# as well use xml format and exportnowrap=1 to use the string of,
|
|
|
|
|
# XML as is, but need to check how well the library handles it.
|
|
|
|
@ -948,18 +948,23 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
|
|
|
|
|
# refuses to return an arbitrary number of revisions (see above).
|
|
|
|
|
print("Getting titles to export all the revisions of each")
|
|
|
|
|
c = 0
|
|
|
|
|
for title in readTitles(config, start=start):
|
|
|
|
|
print(' {}'.format(title.strip()))
|
|
|
|
|
titlelist = []
|
|
|
|
|
# TODO: Decide a suitable number of a batched request. Careful:
|
|
|
|
|
# batched responses may not return all revisions.
|
|
|
|
|
for titlelist in readTitles(config, start=start, batch=False):
|
|
|
|
|
if type(titlelist) is not list:
|
|
|
|
|
titlelist = [titlelist]
|
|
|
|
|
for title in titlelist:
|
|
|
|
|
print(u' {}'.format(title))
|
|
|
|
|
# Try and ask everything. At least on MediaWiki 1.16, unknown props are discarded:
|
|
|
|
|
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
|
|
|
|
|
pparams = {
|
|
|
|
|
'action': 'query',
|
|
|
|
|
'titles': title,
|
|
|
|
|
'titles': '|'.join(titlelist),
|
|
|
|
|
'prop': 'revisions',
|
|
|
|
|
'rvlimit': 50,
|
|
|
|
|
#'rvlimit': 50,
|
|
|
|
|
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
|
|
|
|
|
}
|
|
|
|
|
# TODO: we could actually batch titles a bit here if desired. How many?
|
|
|
|
|
try:
|
|
|
|
|
prequest = site.api(http_method=config['http_method'], **pparams)
|
|
|
|
|
except requests.exceptions.HTTPError as e:
|
|
|
|
@ -967,6 +972,12 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
|
|
|
|
|
print("POST request to the API failed, retrying with GET")
|
|
|
|
|
config['http_method'] = "GET"
|
|
|
|
|
exportrequest = site.api(http_method=config['http_method'], **exportparams)
|
|
|
|
|
except mwclient.errors.InvalidResponse:
|
|
|
|
|
logerror(
|
|
|
|
|
config=config,
|
|
|
|
|
text=u'Error: page inaccessible? Could not export page: %s' % ("; ".join(titlelist))
|
|
|
|
|
)
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Be ready to iterate if there is continuation.
|
|
|
|
|
while True:
|
|
|
|
@ -978,7 +989,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
|
|
|
|
|
except KeyError:
|
|
|
|
|
logerror(
|
|
|
|
|
config=config,
|
|
|
|
|
text=u'Error: page inaccessible? Could not export page: %s' % (title.decode('utf-8'))
|
|
|
|
|
text=u'Error: page inaccessible? Could not export page: %s' % ("; ".join(titlelist))
|
|
|
|
|
)
|
|
|
|
|
break
|
|
|
|
|
# Go through the data we got to build the XML.
|
|
|
|
@ -989,14 +1000,15 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
|
|
|
|
|
except PageMissingError:
|
|
|
|
|
logerror(
|
|
|
|
|
config=config,
|
|
|
|
|
text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
|
|
|
|
|
text=u'Error: empty revision from API. Could not export page: %s' % ("; ".join(titlelist))
|
|
|
|
|
)
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
# Get next batch of revisions if there's more.
|
|
|
|
|
if 'continue' in prequest.keys():
|
|
|
|
|
print("Getting more revisions for page {}".format(title))
|
|
|
|
|
pparams['rvcontinue'] = prequest['continue']['rvcontinue']
|
|
|
|
|
print("Getting more revisions for the page")
|
|
|
|
|
for key, value in prequest['continue']:
|
|
|
|
|
params[key] = value
|
|
|
|
|
elif 'query-continue' in prequest.keys():
|
|
|
|
|
rvstartid = prequest['query-continue']['revisions']['rvstartid']
|
|
|
|
|
pparams['rvstartid'] = rvstartid
|
|
|
|
@ -1011,8 +1023,10 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
|
|
|
|
|
config['http_method'] = "GET"
|
|
|
|
|
prequest = site.api(http_method=config['http_method'], **pparams)
|
|
|
|
|
|
|
|
|
|
# We're done iterating for this title.
|
|
|
|
|
c += 1
|
|
|
|
|
# We're done iterating for this title or titles.
|
|
|
|
|
c += len(titlelist)
|
|
|
|
|
# Reset for the next batch.
|
|
|
|
|
titlelist = []
|
|
|
|
|
if c % 10 == 0:
|
|
|
|
|
print('Downloaded {} pages'.format(c))
|
|
|
|
|
|
|
|
|
@ -1042,7 +1056,6 @@ def makeXmlFromPage(page):
|
|
|
|
|
size = 0
|
|
|
|
|
revision = E.revision(
|
|
|
|
|
E.id(to_unicode(rev['revid'])),
|
|
|
|
|
E.parentid(to_unicode(rev['parentid'])),
|
|
|
|
|
E.timestamp(rev['timestamp']),
|
|
|
|
|
E.text(to_unicode(rev['*']), space="preserve", bytes=to_unicode(size)),
|
|
|
|
|
)
|
|
|
|
@ -1058,6 +1071,9 @@ def makeXmlFromPage(page):
|
|
|
|
|
revision.append(E.comment(to_unicode(rev['comment'])))
|
|
|
|
|
if 'contentmodel' in rev:
|
|
|
|
|
revision.append(E.model(rev['contentmodel']))
|
|
|
|
|
# Sometimes a missing parentid is not replaced with a 0 as it should.
|
|
|
|
|
if 'parentid' in rev:
|
|
|
|
|
revision.append(E.parentid(to_unicode(rev['parentid'])))
|
|
|
|
|
# The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
|
|
|
|
|
if 'sha1' in rev:
|
|
|
|
|
revision.append(E.sha1(rev['sha1']))
|
|
|
|
@ -1067,28 +1083,37 @@ def makeXmlFromPage(page):
|
|
|
|
|
raise PageMissingError(page['title'], e)
|
|
|
|
|
return etree.tostring(p, pretty_print=True, encoding='unicode')
|
|
|
|
|
|
|
|
|
|
def readTitles(config={}, start=None, batch=False):
    """Yield page titles read from the dump's titles file.

    The file read is ``<config['path']>/<prefix>-<config['date']>-titles.txt``,
    where the prefix is whatever ``domain2prefix(config=config)`` returns.
    Each line is decoded from UTF-8 and stripped. Reading stops at the
    ``--END--`` marker line or at end of file.

    config: dict providing at least 'path' and 'date'; it is only read here.
    start: if truthy, every title before the first one equal to ``start`` is
        skipped; the ``start`` title itself is included in the output.
    batch: if falsy, every title is yielded on its own. Otherwise titles are
        yielded as lists holding ``batch`` titles each; the last list may be
        shorter when the number of titles is not a multiple of ``batch``.
    """
    titlesfilename = '%s-%s-titles.txt' % (
        domain2prefix(config=config), config['date'])

    # While seeking, titles are discarded until ``start`` is found.
    seeking = bool(start)
    pending = []

    with open('%s/%s' % (config['path'], titlesfilename), 'r') as titlesfile:
        for line in titlesfile:
            title = line.decode("utf-8").strip()
            if title == '--END--':
                break
            if seeking:
                if title != start:
                    continue
                # Found the resume point: stop skipping and emit this title.
                seeking = False

            if not batch:
                yield title
                continue

            pending.append(title)
            if len(pending) >= batch:
                yield pending
                pending = []

    # Flush the titles left over after the last full batch; without this the
    # tail of the title list would never reach the caller in batch mode.
    if pending:
        yield pending
|
|
|
|
|
|
|
|
|
|
def reverse_readline(filename, buf_size=8192, truncate=False):
|
|
|
|
|
"""a generator that returns the lines of a file in reverse order"""
|
|
|
|
@ -1953,7 +1978,7 @@ def checkIndex(index=None, cookies=None, session=None):
|
|
|
|
|
""" Checking index.php availability """
|
|
|
|
|
r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
|
|
|
|
|
if r.status_code >= 400:
|
|
|
|
|
print("ERROR: The wiki returned status code HTTP {}".format({}))
|
|
|
|
|
print("ERROR: The wiki returned status code HTTP {}".format(r.status_code))
|
|
|
|
|
return False
|
|
|
|
|
raw = r.text
|
|
|
|
|
print 'Checking index.php...', index
|
|
|
|
@ -2164,7 +2189,7 @@ def resumePreviousDump(config={}, other={}):
|
|
|
|
|
lastimage = lines[-1]
|
|
|
|
|
f.close()
|
|
|
|
|
except:
|
|
|
|
|
pass # probably file doesnot exists
|
|
|
|
|
pass # probably file does not exists
|
|
|
|
|
if lastimage == u'--END--':
|
|
|
|
|
print 'Image list was completed in the previous session'
|
|
|
|
|
else:
|
|
|
|
|