Mirror of https://github.com/WikiTeam/wikiteam, synced 2024-11-16 21:27:46 +00:00
Wikia: make getXMLHeader() check more lenient
Otherwise we end up using Special:Export even though the export API would work perfectly well with --xmlrevisions. For some reason using the general requests session always got an empty response from the Wikia API. May also fix images on fandom.com: https://github.com/WikiTeam/wikiteam/issues/330
Commit 17283113dd (parent 131e19979c)
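The check the commit relaxes can be reproduced by hand: hit api.php with action=query&export&exportnowrap through a plain requests.get (the shared session consistently returned empty bodies from the Wikia API) and see whether the body starts with a <mediawiki> root element. A minimal sketch, with an illustrative Fandom endpoint:

import re
import requests

# Illustrative endpoint; any MediaWiki 1.15+ api.php should behave the same way.
api = 'https://community.fandom.com/api.php'

# Plain requests.get rather than the shared session, mirroring the commit.
r = requests.get(api + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10)

# The lenient check: any response whose root element is <mediawiki> is good enough.
if re.match(r"\s*<mediawiki", r.text):
    print('export API is usable with --xmlrevisions')
else:
    print('blank or non-XML response; a fallback is needed')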
@@ -417,16 +417,29 @@ def getXMLHeader(config={}, session=None):
     print config['api']
     xml = ''
     if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
-        xml = None
         try:
             print 'Getting the XML header from the API'
             # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.18
-            r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10)
+            r = requests.get(config['api'] + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10)
             xml = r.text
-            if not xml:
+            # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
+            if not re.match(r"\s*<mediawiki", xml):
+                r = requests.get(config['api'] + '?action=query&export=1&list=allpages&aplimit=1&format=json', timeout=10)
+                try:
+                    xml = r.json()['query']['export']['*']
+                except KeyError:
+                    pass
+            if not re.match(r"\s*<mediawiki", xml):
                 # Do without a generator, use our usual trick of a random page title
-                r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&titles=' + randomtitle, timeout=10)
+                r = requests.get(config['api'] + '?action=query&export=1&exportnowrap=1&titles=' + randomtitle, timeout=10)
                 xml = r.text
+                # Again try without exportnowrap
+                if not re.match(r"\s*<mediawiki", xml):
+                    r = requests.get(config['api'] + '?action=query&export=1&format=json&titles=' + randomtitle, timeout=10)
+                    try:
+                        xml = r.json()['query']['export']['*']
+                    except KeyError:
+                        pass
         except requests.exceptions.RetryError:
             pass
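When exportnowrap yields a blank page (as on Wikia's MediaWiki 1.19), the new code retries without it and unwraps the XML from the JSON response instead. The fallback in isolation, again against an illustrative endpoint:

import requests

api = 'https://community.fandom.com/api.php'  # illustrative endpoint

r = requests.get(api + '?action=query&export=1&list=allpages&aplimit=1&format=json', timeout=10)
try:
    # The exported dump is wrapped inside the JSON payload under query.export.*
    xml = r.json()['query']['export']['*']
except KeyError:
    xml = ''  # still nothing usable; the caller ends up on the Special:Export path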
@@ -484,8 +497,8 @@ def getUserAgent():
     """ Return a cool user-agent to hide Python user-agent """
     useragents = [
         # firefox
-        'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0',
-        'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0',
+        'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
+        'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
     ]
     return useragents[0]
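The refreshed Firefox strings matter because getUserAgent() is what hides the default python-requests user-agent from wikis that throttle or block it. A minimal sketch of how such a value is typically attached to a requests session; the exact wiring inside dumpgenerator may differ:

import requests

def getUserAgent():
    """ Return a cool user-agent to hide Python user-agent """
    useragents = [
        # firefox
        'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
    ]
    return useragents[0]

session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})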
@@ -990,7 +1003,8 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                     break


-    except mwclient.errors.MwClientError:
+    except mwclient.errors.MwClientError as e:
+        print(e)
         print "This mwclient version seems not to work for us. Exiting."
         sys.exit()
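Binding the exception as e and printing it turns a silent "mwclient does not work" exit into something debuggable. A hedged sketch of the same pattern around an mwclient connection; the host is a placeholder, since dumpgenerator derives it from config['api']:

import sys
import mwclient

try:
    # Constructing the Site already performs the siteinfo request, so connection
    # and API errors surface here. Placeholder host for illustration only.
    site = mwclient.Site('community.fandom.com', path='/')
except mwclient.errors.MwClientError as e:
    print(e)  # show the underlying error before giving up
    print("This mwclient version seems not to work for us. Exiting.")
    sys.exit()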
@@ -1302,7 +1316,7 @@ def getImageNamesAPI(config={}, session=None):
                 url = curateImageURL(config=config, url=url)
                 # encoding to ascii is needed to work around this horrible bug:
                 # http://bugs.python.org/issue8136
-                if 'api' in config and '.wikia.com' in config['api']:
+                if 'api' in config and ('.wikia.' in config['api'] or '.fandom.com' in config['api']):
                     #to avoid latest?cb=20120816112532 in filenames
                     filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')), 'utf-8')
                 else:
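The widened host check feeds the Wikia/Fandom-specific filename logic right below it: those image URLs end in /revision/latest?cb=<timestamp>, so the real filename is the third path component from the end. A small worked example (Python 2, like the script itself; the URL is illustrative):

import re
import urllib

# Fandom-style image URL; the trailing /revision/latest?cb=... is what must not end up in the filename.
url = 'https://static.wikia.nocookie.net/somewiki/images/a/ab/Some_file.png/revision/latest?cb=20120816112532'

# url.split('/')[-1] would be 'latest?cb=20120816112532'; [-3] is the actual file name.
filename = urllib.unquote(re.sub('_', ' ', url.split('/')[-3]))
print(filename)  # -> 'Some file.png'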