Merge branch 'master' into wikia

pull/373/head
nemobis committed 4 years ago via GitHub
commit 8a2116699e

@@ -715,16 +715,20 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     lock = True
     if config['xmlrevisions']:
-        print 'Retrieving the XML for every page from the beginning'
-        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
-        xmlfile.write(header.encode('utf-8'))
+        if start:
+            print("WARNING: will try to start the download from title: {}".format(start))
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
+        else:
+            print 'Retrieving the XML for every page from the beginning'
+            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
+            xmlfile.write(header.encode('utf-8'))
         try:
             r_timestamp = r'<timestamp>([^<]+)</timestamp>'
-            for xml in getXMLRevisions(config=config, session=session):
+            for xml in getXMLRevisions(config=config, session=session, start=start):
                 numrevs = len(re.findall(r_timestamp, xml))
                 # Due to how generators work, it's expected this may be less
                 # TODO: get the page title and reuse the usual format "X title, y edits"
-                print "%d more revisions exported" % numrevs
+                print " %d more revisions exported" % numrevs
                 xml = cleanXML(xml=xml)
                 xmlfile.write(xml.encode('utf-8'))
         except AttributeError as e:
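This hunk is what makes --xmlrevisions dumps resumable: with a start title, the dump file is reopened in append mode and the XML header is not rewritten. A minimal sketch of that open-for-resume decision, with hypothetical names (dump_path, header) standing in for the script's own variables, written in Python 3 for brevity while the script itself is Python 2:

    def open_dump(dump_path, header, start=None):
        if start:
            # Resuming: append to the existing dump and do not rewrite the
            # <mediawiki> header, which was already written on the first run.
            print("WARNING: will try to start the download from title: {}".format(start))
            return open(dump_path, 'a')
        # Fresh dump: truncate the file and write the header once.
        xmlfile = open(dump_path, 'w')
        xmlfile.write(header)
        return xmlfile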
@@ -776,12 +780,13 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename
 
-def getXMLRevisions(config={}, session=None, allpages=False):
+def getXMLRevisions(config={}, session=None, allpages=False, start=None):
     # FIXME: actually figure out the various strategies for each MediaWiki version
     apiurl = urlparse(config['api'])
     # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
     # https://github.com/WikiTeam/wikiteam/issues/358
     site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme)
     if not 'all' in config['namespaces']:
         namespaces = config['namespaces']
     else:
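For reference, the mwclient.Site call above splits a full api.php URL into the pieces mwclient expects. A self-contained sketch (the wiki URL is a placeholder; urllib.parse is the Python 3 spelling of the urlparse import used here):

    from urllib.parse import urlparse
    import mwclient

    apiurl = urlparse("https://wiki.example.org/w/api.php")
    # mwclient takes a host and a script path rather than a full URL;
    # passing scheme= keeps whatever protocol the user asked for.
    site = mwclient.Site(apiurl.netloc,
                         apiurl.path.replace("api.php", ""),
                         scheme=apiurl.scheme)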
@@ -789,12 +794,12 @@ def getXMLRevisions(config={}, session=None, allpages=False):
     try:
         for namespace in namespaces:
-            print "Trying to export all revisions from namespace %s" % namespace
+            print("Trying to export all revisions from namespace %s" % namespace)
             # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
             arvparams = {
                 'action': 'query',
                 'list': 'allrevisions',
-                'arvlimit': 500,
+                'arvlimit': 50,
                 'arvnamespace': namespace
             }
             if not config['curonly']:
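The arvlimit drop from 500 to 50 requests smaller batches per call, presumably to be gentler on small or fragile wikis. A minimal sketch of the continuation loop this request implies, using plain requests against a placeholder wiki (MediaWiki 1.27+ returns a 'continue' object carrying arvcontinue while results remain):

    import requests

    API = "https://wiki.example.org/w/api.php"  # placeholder
    params = {
        'action': 'query',
        'list': 'allrevisions',
        'arvlimit': 50,
        'arvnamespace': 0,
        'format': 'json',
    }
    session = requests.Session()
    while True:
        data = session.get(API, params=params, timeout=30).json()
        for page in data.get('query', {}).get('allrevisions', []):
            for rev in page['revisions']:
                print(page['title'], rev['revid'])
        if 'continue' not in data:
            break
        params.update(data['continue'])  # carries arvcontinue forward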
@@ -856,7 +861,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                 for page in arvrequest['query']['allrevisions']:
                     for revision in page['revisions']:
                         revids.append(str(revision['revid']))
-                print "%d more revisions listed, until %s" % (len(revids), revids[-1])
+                print " %d more revisions listed, until %s" % (len(revids), revids[-1])
 
                 # We can now get the XML for one revision at a time
                 # FIXME: we can actually get them in batches as we used to
@@ -909,7 +914,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
         # We could also use the allpages API as generator but let's be consistent.
         print("Getting titles to export the latest revision for each")
         c = 0
-        for title in readTitles(config):
+        for title in readTitles(config, start=start):
             # TODO: respect verbose flag, reuse output from getXMLPage
             print(' {}'.format(title.strip()))
             # TODO: as we're doing one page and revision at a time, we might
@@ -943,7 +948,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
         # refuses to return an arbitrary number of revisions (see above).
         print("Getting titles to export all the revisions of each")
         c = 0
-        for title in readTitles(config):
+        for title in readTitles(config, start=start):
             print(' {}'.format(title.strip()))
             # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
             # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
@@ -977,10 +982,16 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     continue
 
                 # Be ready to iterate if there is continuation.
                 while True:
-                    pages = prequest['query']['pages']
+                    # The array is called "pages" even if there's only one.
+                    # TODO: we could actually batch titles a bit here if desired. How many?
+                    try:
+                        pages = prequest['query']['pages']
+                    except KeyError:
+                        raise PageMissingError(title, xml='')
                     # Go through the data we got to build the XML.
-                    for page in pages:
+                    for pageid in pages:
                         try:
-                            xml = makeXmlFromPage(pages[page])
+                            xml = makeXmlFromPage(pages[pageid])
                             yield xml
                         except PageMissingError:
                             logerror(
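The rename from page to pageid matters because, in the MediaWiki JSON output (before formatversion=2), 'pages' is an object keyed by page id, so iterating over it yields ids rather than page records. A toy illustration with fabricated data:

    prequest = {'query': {'pages': {
        '42': {'pageid': 42, 'title': 'Example', 'revisions': []},
    }}}
    pages = prequest['query']['pages']
    for pageid in pages:        # iterates over keys such as '42'
        page = pages[pageid]    # the page record itself
        print(pageid, page['title'])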
@@ -993,19 +1004,24 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     if 'continue' in prequest.keys():
                         print("Getting more revisions for page {}".format(title))
                         pparams['rvcontinue'] = prequest['continue']['rvcontinue']
-                        try:
-                            prequest = site.api(http_method=config['http_method'], **pparams)
-                        except requests.exceptions.HTTPError as e:
-                            if e.response.status_code == 405 and config['http_method'] == "POST":
-                                print("POST request to the API failed, retrying with GET")
-                                config['http_method'] = "GET"
-                                prequest = site.api(http_method=config['http_method'], **pparams)
-                    # mwclient seems to rewrite query-continue
-                    #if 'query-continue' in prequest.keys():
-                    #    pparams['rvcontinue'] = prequest['query-continue']['revisions']['rvcontinue']
+                    elif 'query-continue' in prequest.keys():
+                        rvstartid = prequest['query-continue']['revisions']['rvstartid']
+                        pparams['rvstartid'] = rvstartid
                     else:
                         break
+                    try:
+                        prequest = site.api(http_method=config['http_method'], **pparams)
+                    except requests.exceptions.HTTPError as e:
+                        if e.response.status_code == 405 and config['http_method'] == "POST":
+                            print("POST request to the API failed, retrying with GET")
+                            config['http_method'] = "GET"
+                            prequest = site.api(http_method=config['http_method'], **pparams)
 
                 c += 1
+                if c % 10 == 0:
+                    print('Downloaded {} pages'.format(c))
             except mwclient.errors.MwClientError as e:
                 print(e)
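The try/except moved below the if/elif/else so that every continuation request, not only the 'continue' branch, gets the 405 fallback. A generalized sketch of that fallback outside mwclient (URL and parameters are placeholders; the script itself persists the switch in config['http_method']):

    import requests

    def api_call(session, url, params, http_method="POST"):
        if http_method == "POST":
            r = session.post(url, data=params)
            if r.status_code == 405:
                # Some servers refuse POST to api.php: retry once with GET.
                print("POST request to the API failed, retrying with GET")
                return api_call(session, url, params, http_method="GET")
        else:
            r = session.get(url, params=params)
        r.raise_for_status()
        return r.json()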
@@ -1034,12 +1050,16 @@ def makeXmlFromPage(page):
                 E.id(to_unicode(rev['revid'])),
                 E.parentid(to_unicode(rev['parentid'])),
                 E.timestamp(rev['timestamp']),
-                E.contributor(
-                    E.id(to_unicode(userid)),
-                    E.username(to_unicode(rev['user'])),
-                ),
-                E.text(rev['*'], space="preserve", bytes=to_unicode(size)),
+                E.text(to_unicode(rev['*']), space="preserve", bytes=to_unicode(size)),
             )
+            # The username may be deleted/suppressed
+            if 'user' in rev:
+                revision.append(E.contributor(
+                    E.username(to_unicode(rev['user'])),
+                    E.id(to_unicode(userid)),
+                ))
+            else:
+                revision.append(E.contributor(deleted="deleted"))
             if 'comment' in rev:
                 revision.append(E.comment(to_unicode(rev['comment'])))
             if 'contentmodel' in rev:
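The contributor element is now appended conditionally because revisions whose username was removed with RevisionDelete carry no 'user' key, and the XML export format marks them as <contributor deleted="deleted" />. A self-contained sketch with lxml's E factory and fabricated sample data:

    from lxml import etree
    from lxml.builder import E

    rev = {'revid': 1, 'user': 'Example', 'userid': 2}  # suppressed revisions lack 'user'
    revision = E.revision(E.id(str(rev['revid'])))
    if 'user' in rev:
        revision.append(E.contributor(
            E.username(rev['user']),
            E.id(str(rev['userid'])),
        ))
    else:
        revision.append(E.contributor(deleted="deleted"))
    print(etree.tostring(revision, pretty_print=True, encoding='unicode'))

Note that etree.tostring(..., encoding='unicode') returns a str rather than bytes, which is exactly what the next hunk switches makeXmlFromPage to.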
@@ -1051,7 +1071,7 @@ def makeXmlFromPage(page):
     except KeyError as e:
         print(e)
         raise PageMissingError(page['title'], e)
-    return etree.tostring(p, pretty_print=True)
+    return etree.tostring(p, pretty_print=True, encoding='unicode')
 
 def readTitles(config={}, start=None):
     """ Read title list from a file, from the title "start" """
@@ -1291,7 +1311,7 @@ def getImageNamesAPI(config={}, session=None):
                   'aiprop': 'url|user',
                   'aifrom': aifrom,
                   'format': 'json',
-                  'ailimit': 500}
+                  'ailimit': 50}
         # FIXME Handle HTTP Errors HERE
         r = session.get(url=config['api'], params=params, timeout=30)
         handleStatusCode(r)
@@ -1345,7 +1365,7 @@ def getImageNamesAPI(config={}, session=None):
             'action': 'query',
             'generator': 'allpages',
             'gapnamespace': 6,
-            'gaplimit': 500,
+            'gaplimit': 50,
             'gapfrom': gapfrom,
             'prop': 'imageinfo',
             'iiprop': 'user|url',
@@ -1737,23 +1757,13 @@ def getParameters(params=[]):
     index2 = None
 
     if api:
-        retry = 0
-        maxretries = args.retries
-        retrydelay = 20
-        check = None
-        while retry < maxretries:
-            try:
-                check = checkAPI(api=api, session=session)
-                break
-            except requests.exceptions.ConnectionError as e:
-                print 'Connection error: %s'%(str(e))
-                retry += 1
-                print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
-                time.sleep(retrydelay)
+        check, checkedapi = checkRetryAPI(api, args.retries, args.xmlrevisions, session)
 
     if api and check:
         # Replace the index URL we got from the API check
         index2 = check[1]
-        api = check[2]
-        print 'API is OK: ' + api
+        api = checkedapi
+        print 'API is OK: ' + checkedapi
     else:
         if index and not args.wiki:
             print 'API not available. Trying with index.php only.'
@@ -1867,6 +1877,42 @@ def getParameters(params=[]):
 
     return config, other
 
+def checkRetryAPI(api=None, retries=5, apiclient=False, session=None):
+    """ Call checkAPI and mwclient if necessary """
+    retry = 0
+    retrydelay = 20
+    check = None
+    while retry < retries:
+        try:
+            check = checkAPI(api, session=session)
+            break
+        except requests.exceptions.ConnectionError as e:
+            print 'Connection error: %s'%(str(e))
+            retry += 1
+            print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
+            time.sleep(retrydelay)
+
+    if check and apiclient:
+        apiurl = urlparse(api)
+        try:
+            site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme)
+        except KeyError:
+            # Probably KeyError: 'query'
+            if apiurl.scheme == "https":
+                newscheme = "http"
+                api = api.replace("https://", "http://")
+            else:
+                newscheme = "https"
+                api = api.replace("http://", "https://")
+            print("WARNING: The provided API URL did not work with mwclient. Switched protocol to: {}".format(newscheme))
+
+            try:
+                site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=newscheme)
+            except KeyError:
+                check = False
+
+    return check, api
+
 def checkAPI(api=None, session=None):
     """ Checking API availability """
     global cj
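checkRetryAPI folds the retry loop that previously lived inline in getParameters into a reusable helper. The core pattern, generalized (check is any callable that raises requests.exceptions.ConnectionError on failure; the names here are illustrative, not from the script):

    import time
    import requests

    def retry_call(check, retries=5, retrydelay=20):
        for attempt in range(1, retries + 1):
            try:
                return check()
            except requests.exceptions.ConnectionError as e:
                print('Connection error: %s' % str(e))
                if attempt < retries:
                    print("Start retry attempt %d in %d seconds." % (attempt + 1, retrydelay))
                    time.sleep(retrydelay)
        return None    # all attempts failed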

@@ -1,4 +1,5 @@
 argparse>=1.2.1
-requests>=2.3.0
 internetarchive
 kitchen
+mwclient
+requests>=2.3.0
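Assuming this last hunk is the project's requirements file, the new mwclient dependency (used by getXMLRevisions and checkRetryAPI above) is installed the usual way:

    pip install -r requirements.txt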
