|
|
@ -181,7 +181,7 @@ def getNamespacesAPI(config={}, session=None):
|
|
|
|
'siprop': 'namespaces',
|
|
|
|
'siprop': 'namespaces',
|
|
|
|
'format': 'json'}
|
|
|
|
'format': 'json'}
|
|
|
|
)
|
|
|
|
)
|
|
|
|
result = json.loads(r.text)
|
|
|
|
result = getJSON(r)
|
|
|
|
delay(config=config, session=session)
|
|
|
|
delay(config=config, session=session)
|
|
|
|
|
|
|
|
|
|
|
|
if 'all' in namespaces:
|
|
|
|
if 'all' in namespaces:
|
|
|
@ -236,7 +236,7 @@ def getPageTitlesAPI(config={}, session=None):
|
|
|
|
r = session.post(url=config['api'], data=params)
|
|
|
|
r = session.post(url=config['api'], data=params)
|
|
|
|
handleStatusCode(r)
|
|
|
|
handleStatusCode(r)
|
|
|
|
# FIXME Handle HTTP errors here!
|
|
|
|
# FIXME Handle HTTP errors here!
|
|
|
|
jsontitles = json.loads(r.text)
|
|
|
|
jsontitles = getJSON(r)
|
|
|
|
apfrom = ''
|
|
|
|
apfrom = ''
|
|
|
|
if 'query-continue' in jsontitles and 'allpages' in jsontitles[
|
|
|
|
if 'query-continue' in jsontitles and 'allpages' in jsontitles[
|
|
|
|
'query-continue']:
|
|
|
|
'query-continue']:
|
|
|
@ -353,7 +353,7 @@ def getPageTitles(config={}, session=None):
|
|
|
|
titles = []
|
|
|
|
titles = []
|
|
|
|
if 'api' in config and config['api']:
|
|
|
|
if 'api' in config and config['api']:
|
|
|
|
r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
|
|
|
|
r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
|
|
|
|
test = json.loads(r.text)
|
|
|
|
test = getJSON(r)
|
|
|
|
if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
|
|
|
|
if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
|
|
|
|
and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
|
|
|
|
and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
|
|
|
|
titles = getPageTitlesScraper(config=config, session=session)
|
|
|
|
titles = getPageTitlesScraper(config=config, session=session)
|
|
|
@ -483,7 +483,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
r = session.post(url=config['index'], data=params, headers=headers)
|
|
|
|
r = session.post(url=config['index'], data=params, headers=headers)
|
|
|
|
handleStatusCode(r)
|
|
|
|
handleStatusCode(r)
|
|
|
|
xml = r.text
|
|
|
|
xml = fixBOM(r)
|
|
|
|
except requests.exceptions.ConnectionError as e:
|
|
|
|
except requests.exceptions.ConnectionError as e:
|
|
|
|
xml = ''
|
|
|
|
xml = ''
|
|
|
|
c += 1
|
|
|
|
c += 1
|
|
|
@ -854,7 +854,7 @@ def getImageNamesAPI(config={}, session=None):
|
|
|
|
# FIXME Handle HTTP Errors HERE
|
|
|
|
# FIXME Handle HTTP Errors HERE
|
|
|
|
r = session.post(url=config['api'], data=params)
|
|
|
|
r = session.post(url=config['api'], data=params)
|
|
|
|
handleStatusCode(r)
|
|
|
|
handleStatusCode(r)
|
|
|
|
jsonimages = json.loads(r.text)
|
|
|
|
jsonimages = getJSON(r)
|
|
|
|
delay(config=config, session=session)
|
|
|
|
delay(config=config, session=session)
|
|
|
|
|
|
|
|
|
|
|
|
if 'query' in jsonimages:
|
|
|
|
if 'query' in jsonimages:
|
|
|
@ -904,7 +904,7 @@ def getImageNamesAPI(config={}, session=None):
|
|
|
|
# FIXME Handle HTTP Errors HERE
|
|
|
|
# FIXME Handle HTTP Errors HERE
|
|
|
|
r = session.post(url=config['api'], data=params)
|
|
|
|
r = session.post(url=config['api'], data=params)
|
|
|
|
handleStatusCode(r)
|
|
|
|
handleStatusCode(r)
|
|
|
|
jsonimages = json.loads(r.text)
|
|
|
|
jsonimages = getJSON(r)
|
|
|
|
delay(config=config, session=session)
|
|
|
|
delay(config=config, session=session)
|
|
|
|
|
|
|
|
|
|
|
|
if 'query' in jsonimages:
|
|
|
|
if 'query' in jsonimages:
|
|
|
@ -1383,15 +1383,14 @@ def checkAPI(api=None, session=None):
|
|
|
|
'meta': 'siteinfo',
|
|
|
|
'meta': 'siteinfo',
|
|
|
|
'format': 'json'}
|
|
|
|
'format': 'json'}
|
|
|
|
)
|
|
|
|
)
|
|
|
|
resultText = r.text
|
|
|
|
|
|
|
|
if r.url == api:
|
|
|
|
if r.url == api:
|
|
|
|
break
|
|
|
|
break
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
api = r.url
|
|
|
|
api = r.url
|
|
|
|
if "MediaWiki API is not enabled for this site." in resultText:
|
|
|
|
if "MediaWiki API is not enabled for this site." in r.text:
|
|
|
|
return False
|
|
|
|
return False
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
result = json.loads(resultText)
|
|
|
|
result = getJSON(r)
|
|
|
|
index = None
|
|
|
|
index = None
|
|
|
|
if result['query']:
|
|
|
|
if result['query']:
|
|
|
|
try:
|
|
|
|
try:
|
|
|
@ -1402,6 +1401,7 @@ def checkAPI(api=None, session=None):
|
|
|
|
print "MediaWiki API seems to work but returned no index URL"
|
|
|
|
print "MediaWiki API seems to work but returned no index URL"
|
|
|
|
return (True, None, api)
|
|
|
|
return (True, None, api)
|
|
|
|
except ValueError:
|
|
|
|
except ValueError:
|
|
|
|
|
|
|
|
print repr(r.text)
|
|
|
|
print "MediaWiki API returned data we could not parse"
|
|
|
|
print "MediaWiki API returned data we could not parse"
|
|
|
|
return False
|
|
|
|
return False
|
|
|
|
return False
|
|
|
|
return False
|
|
|
@ -1444,6 +1444,20 @@ def removeIP(raw=''):
|
|
|
|
return raw
|
|
|
|
return raw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def getJSON(request):
    """Parse a requests response body as JSON and return the result.

    If the body starts with a Unicode BOM (U+FEFF), switch the response
    encoding to 'utf-8-sig' first so requests re-decodes the text with the
    BOM stripped; otherwise json parsing would choke on the leading BOM.
    """
    if request.text.startswith(u'\ufeff'):
        request.encoding = 'utf-8-sig'
    return request.json()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fixBOM(request):
    """Return the response text, re-decoded without a leading Unicode BOM.

    When the body begins with U+FEFF, the response encoding is switched to
    'utf-8-sig' so that subsequent reads of request.text drop the BOM.
    """
    has_bom = request.text.startswith(u'\ufeff')
    if has_bom:
        request.encoding = 'utf-8-sig'
    return request.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def checkXMLIntegrity(config={}, titles=[], session=None):
    """ Check XML dump integrity, to detect broken XML chunks """
    # Stub: integrity checking is not implemented here; returns None.
    return
|
|
|
@ -1702,7 +1716,7 @@ def saveSiteInfo(config={}, session=None):
|
|
|
|
'sinumberingroup': 1,
|
|
|
|
'sinumberingroup': 1,
|
|
|
|
'format': 'json'})
|
|
|
|
'format': 'json'})
|
|
|
|
# MediaWiki 1.11-1.12
|
|
|
|
# MediaWiki 1.11-1.12
|
|
|
|
if not 'query' in json.loads(r.text):
|
|
|
|
if not 'query' in getJSON(r):
|
|
|
|
r = session.post(
|
|
|
|
r = session.post(
|
|
|
|
url=config['api'],
|
|
|
|
url=config['api'],
|
|
|
|
data={
|
|
|
|
data={
|
|
|
@ -1711,7 +1725,7 @@ def saveSiteInfo(config={}, session=None):
|
|
|
|
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
|
|
|
|
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
|
|
|
|
'format': 'json'})
|
|
|
|
'format': 'json'})
|
|
|
|
# MediaWiki 1.8-1.10
|
|
|
|
# MediaWiki 1.8-1.10
|
|
|
|
if not 'query' in json.loads(r.text):
|
|
|
|
if not 'query' in getJSON(r):
|
|
|
|
r = session.post(
|
|
|
|
r = session.post(
|
|
|
|
url=config['api'],
|
|
|
|
url=config['api'],
|
|
|
|
data={
|
|
|
|
data={
|
|
|
@ -1719,7 +1733,7 @@ def saveSiteInfo(config={}, session=None):
|
|
|
|
'meta': 'siteinfo',
|
|
|
|
'meta': 'siteinfo',
|
|
|
|
'siprop': 'general|namespaces',
|
|
|
|
'siprop': 'general|namespaces',
|
|
|
|
'format': 'json'})
|
|
|
|
'format': 'json'})
|
|
|
|
result = json.loads(r.text)
|
|
|
|
result = getJSON(r)
|
|
|
|
delay(config=config, session=session)
|
|
|
|
delay(config=config, session=session)
|
|
|
|
with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
|
|
|
|
with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
|
|
|
|
outfile.write(json.dumps(result, indent=4, sort_keys=True))
|
|
|
|
outfile.write(json.dumps(result, indent=4, sort_keys=True))
|
|
|
|