Strip ZWNBSP (U+FEFF) Byte-Order Mark from JSON/XML

Branch: pull/229/head
Author: PiRSquared17, 9 years ago
Parent: 711a88df59
Commit: 1c820dafb7
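The change below replaces every direct json.loads(r.text) call on API responses with a new getJSON() helper, and the raw r.text used for XML exports with fixBOM(). Both helpers switch the response encoding to 'utf-8-sig' whenever the body starts with U+FEFF, so a byte-order mark served by the wiki never reaches the JSON parser or the XML dump. A minimal sketch of the failure being fixed, with a made-up payload purely for illustration:

import json

# A JSON body as a misbehaving server returns it: BOM first, then the data
# (hypothetical payload, not taken from any real wiki).
bommed = u'\ufeff{"query": {"pages": {}}}'

try:
    json.loads(bommed)
except ValueError:
    print "json.loads() rejects the leading U+FEFF"

# Round-tripping through the utf-8-sig codec drops the BOM, so parsing works.
clean = bommed.encode('utf-8').decode('utf-8-sig')
print json.loads(clean)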

@@ -181,7 +181,7 @@ def getNamespacesAPI(config={}, session=None):
                 'siprop': 'namespaces',
                 'format': 'json'}
         )
-        result = json.loads(r.text)
+        result = getJSON(r)
         delay(config=config, session=session)
 
         if 'all' in namespaces:
@@ -236,7 +236,7 @@ def getPageTitlesAPI(config={}, session=None):
         r = session.post(url=config['api'], data=params)
         handleStatusCode(r)
         # FIXME Handle HTTP errors here!
-        jsontitles = json.loads(r.text)
+        jsontitles = getJSON(r)
         apfrom = ''
         if 'query-continue' in jsontitles and 'allpages' in jsontitles[
                 'query-continue']:
@@ -353,7 +353,7 @@ def getPageTitles(config={}, session=None):
     titles = []
     if 'api' in config and config['api']:
         r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'})
-        test = json.loads(r.text)
+        test = getJSON(r)
         if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
                 and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
             titles = getPageTitlesScraper(config=config, session=session)
@@ -483,7 +483,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         try:
             r = session.post(url=config['index'], data=params, headers=headers)
             handleStatusCode(r)
-            xml = r.text
+            xml = fixBOM(r)
         except requests.exceptions.ConnectionError as e:
             xml = ''
         c += 1
@@ -854,7 +854,7 @@ def getImageNamesAPI(config={}, session=None):
         # FIXME Handle HTTP Errors HERE
         r = session.post(url=config['api'], data=params)
         handleStatusCode(r)
-        jsonimages = json.loads(r.text)
+        jsonimages = getJSON(r)
         delay(config=config, session=session)
 
         if 'query' in jsonimages:
@@ -904,7 +904,7 @@ def getImageNamesAPI(config={}, session=None):
         # FIXME Handle HTTP Errors HERE
         r = session.post(url=config['api'], data=params)
         handleStatusCode(r)
-        jsonimages = json.loads(r.text)
+        jsonimages = getJSON(r)
         delay(config=config, session=session)
 
         if 'query' in jsonimages:
@@ -1383,15 +1383,14 @@ def checkAPI(api=None, session=None):
                 'meta': 'siteinfo',
                 'format': 'json'}
         )
-        resultText = r.text
         if r.url == api:
             break
         else:
             api = r.url
-    if "MediaWiki API is not enabled for this site." in resultText:
+    if "MediaWiki API is not enabled for this site." in r.text:
         return False
     try:
-        result = json.loads(resultText)
+        result = getJSON(r)
         index = None
         if result['query']:
             try:
@@ -1402,6 +1401,7 @@ def checkAPI(api=None, session=None):
                 print "MediaWiki API seems to work but returned no index URL"
                 return (True, None, api)
     except ValueError:
+        print repr(r.text)
         print "MediaWiki API returned data we could not parse"
         return False
     return False
@@ -1444,6 +1444,20 @@ def removeIP(raw=''):
     return raw
 
 
+def getJSON(request):
+    """Strip Unicode BOM"""
+    if request.text.startswith(u'\ufeff'):
+        request.encoding = 'utf-8-sig'
+    return request.json()
+
+
+def fixBOM(request):
+    """Strip Unicode BOM"""
+    if request.text.startswith(u'\ufeff'):
+        request.encoding = 'utf-8-sig'
+    return request.text
+
+
 def checkXMLIntegrity(config={}, titles=[], session=None):
     """ Check XML dump integrity, to detect broken XML chunks """
     return
@@ -1702,7 +1716,7 @@ def saveSiteInfo(config={}, session=None):
                     'sinumberingroup': 1,
                     'format': 'json'})
             # MediaWiki 1.11-1.12
-            if not 'query' in json.loads(r.text):
+            if not 'query' in getJSON(r):
                 r = session.post(
                     url=config['api'],
                     data={
@@ -1711,7 +1725,7 @@ def saveSiteInfo(config={}, session=None):
                         'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
                         'format': 'json'})
             # MediaWiki 1.8-1.10
-            if not 'query' in json.loads(r.text):
+            if not 'query' in getJSON(r):
                 r = session.post(
                     url=config['api'],
                     data={
@@ -1719,7 +1733,7 @@ def saveSiteInfo(config={}, session=None):
                         'meta': 'siteinfo',
                         'siprop': 'general|namespaces',
                         'format': 'json'})
-            result = json.loads(r.text)
+            result = getJSON(r)
             delay(config=config, session=session)
             with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                 outfile.write(json.dumps(result, indent=4, sort_keys=True))
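For reference, the helpers rely on requests re-decoding Response.text on every access with whatever Response.encoding is currently set to, and on Response.json() parsing that decoded text once an encoding is present; switching the encoding to 'utf-8-sig' therefore drops a leading BOM before parsing. A rough, self-contained sketch of that behaviour against a hand-built Response (synthetic content, no network; assigning _content directly is a test-only shortcut, not public requests API):

import json
import requests

# Fake a response that carries a BOM so nothing has to be fetched.
r = requests.Response()
r.status_code = 200
r.encoding = 'utf-8'
r._content = u'\ufeff{"query": {"general": {"sitename": "Demo"}}}'.encode('utf-8')

print repr(r.text[0])   # u'\ufeff' -- the BOM survives a plain utf-8 decode
try:
    json.loads(r.text)
except ValueError:
    print "json.loads(r.text) fails while the BOM is present"

# The same check-and-switch that getJSON()/fixBOM() perform.
if r.text.startswith(u'\ufeff'):
    r.encoding = 'utf-8-sig'
print r.json()          # parses cleanly once the BOM is stripped
print repr(r.text[0])   # u'{' -- this cleaned text is what fixBOM() returns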
