Merge pull request #367 from nemobis/xmlrevisions

Make --xmlrevisions work on some more wikis
pull/373/head
nemobis 4 years ago committed by GitHub
commit 912450b606
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -782,6 +782,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
# FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
# https://github.com/WikiTeam/wikiteam/issues/358
site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme)
if not 'all' in config['namespaces']:
namespaces = config['namespaces']
else:
@ -794,7 +795,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
arvparams = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 500,
'arvlimit': 50,
'arvnamespace': namespace
}
if not config['curonly']:
@ -962,14 +963,14 @@ def getXMLRevisions(config={}, session=None, allpages=False):
config['http_method'] = "GET"
exportrequest = site.api(http_method=config['http_method'], **exportparams)
# The array is called "pages" even if there's only one.
# TODO: we could actually batch titles a bit here if desired. How many?
try:
pages = prequest['query']['pages']
except KeyError:
raise PageMissingError(title, xml='')
# Be ready to iterate if there is continuation.
while True:
# The array is called "pages" even if there's only one.
# TODO: we could actually batch titles a bit here if desired. How many?
try:
pages = prequest['query']['pages']
except KeyError:
raise PageMissingError(title, xml='')
# Go through the data we got to build the XML.
for pageid in pages:
try:
@ -1033,10 +1034,10 @@ def makeXmlFromPage(page):
E.parentid(to_unicode(rev['parentid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(to_unicode(userid)),
E.username(to_unicode(rev['user'])),
E.id(to_unicode(userid)),
),
E.text(rev['*'], space="preserve", bytes=to_unicode(size)),
E.text(to_unicode(rev['*']), space="preserve", bytes=to_unicode(size)),
)
if 'comment' in rev:
revision.append(E.comment(to_unicode(rev['comment'])))
@ -1049,7 +1050,7 @@ def makeXmlFromPage(page):
except KeyError as e:
print(e)
raise PageMissingError(page['title'], e)
return etree.tostring(p, pretty_print=True)
return etree.tostring(p, pretty_print=True, encoding='unicode')
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
@ -1289,7 +1290,7 @@ def getImageNamesAPI(config={}, session=None):
'aiprop': 'url|user',
'aifrom': aifrom,
'format': 'json',
'ailimit': 500}
'ailimit': 50}
# FIXME Handle HTTP Errors HERE
r = session.get(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
@ -1343,7 +1344,7 @@ def getImageNamesAPI(config={}, session=None):
'action': 'query',
'generator': 'allpages',
'gapnamespace': 6,
'gaplimit': 500,
'gaplimit': 50,
'gapfrom': gapfrom,
'prop': 'imageinfo',
'iiprop': 'user|url',
@ -1735,23 +1736,13 @@ def getParameters(params=[]):
index2 = None
if api:
retry = 0
maxretries = args.retries
retrydelay = 20
check = None
while retry < maxretries:
try:
check = checkAPI(api=api, session=session)
break
except requests.exceptions.ConnectionError as e:
print 'Connection error: %s'%(str(e))
retry += 1
print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
time.sleep(retrydelay)
check, checkedapi = checkRetryAPI(api, args.retries, args.xmlrevisions, session)
if api and check:
# Replace the index URL we got from the API check
index2 = check[1]
api = check[2]
print 'API is OK: ' + api
api = checkedapi
print 'API is OK: ' + checkedapi
else:
if index and not args.wiki:
print 'API not available. Trying with index.php only.'
@ -1865,6 +1856,42 @@ def getParameters(params=[]):
return config, other
def checkRetryAPI(api=None, retries=5, apiclient=False, session=None):
""" Call checkAPI and mwclient if necessary """
retry = 0
retrydelay = 20
check = None
while retry < retries:
try:
check = checkAPI(api, session=session)
break
except requests.exceptions.ConnectionError as e:
print 'Connection error: %s'%(str(e))
retry += 1
print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
time.sleep(retrydelay)
if check and apiclient:
apiurl = urlparse(api)
try:
site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme)
except KeyError:
# Probably KeyError: 'query'
if apiurl.scheme == "https":
newscheme = "http"
api = api.replace("https://", "http://")
else:
newscheme = "https"
api = api.replace("http://", "https://")
print("WARNING: The provided API URL did not work with mwclient. Switched protocol to: {}".format(newscheme))
try:
site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=newscheme)
except KeyError:
check = False
return check, api
def checkAPI(api=None, session=None):
""" Checking API availability """
global cj

Loading…
Cancel
Save