Wikia: make getXMLHeader() check more lenient

Otherwise we end up using Special:Export even though the export API
would work perfectly well with --xmlrevisions.

May also fix images on fandom.com:
https://github.com/WikiTeam/wikiteam/issues/330
pull/360/head^2
Federico Leva 4 years ago
parent 131e19979c
commit 2c21eadf7c
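
The hunk below chains four attempts at action=query&export=1 before dumpgenerator.py would otherwise fall back to Special:Export. A minimal standalone sketch of the same chain, assuming only the requests library (get_xml_header and its arguments are illustrative helpers, not part of this patch):

    import requests

    def get_xml_header(api_url, randomtitle='Main_Page', timeout=10):
        # Same order as the patch: exportnowrap with an allpages generator,
        # then the JSON-wrapped export, then both again with a fixed title.
        session = requests.Session()
        attempts = [
            ({'action': 'query', 'export': 1, 'exportnowrap': 1,
              'list': 'allpages', 'aplimit': 1}, True),
            ({'action': 'query', 'export': 1, 'format': 'json',
              'list': 'allpages', 'aplimit': 1}, False),
            ({'action': 'query', 'export': 1, 'exportnowrap': 1,
              'titles': randomtitle}, True),
            ({'action': 'query', 'export': 1, 'format': 'json',
              'titles': randomtitle}, False),
        ]
        for params, raw in attempts:
            r = session.get(api_url, params=params, timeout=timeout)
            if raw:
                # exportnowrap returns the <mediawiki> dump as the response body
                xml = r.text
            else:
                # the wrapped variant nests the dump inside the JSON result
                try:
                    xml = r.json()['query']['export']['*']
                except (KeyError, ValueError):
                    xml = None
            if xml:
                return xml
        return None

On a wiki where exportnowrap comes back blank (as Wikia's 1.19 does), the second attempt should already succeed, which is what makes the lenient check enough to avoid Special:Export.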

@@ -423,10 +423,24 @@ def getXMLHeader(config={}, session=None):
             # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.18
             r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10)
             xml = r.text
             # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
+            if not xml:
+                r = session.get(config['api'] + '?action=query&export=1&list=allpages&aplimit=1&format=json', timeout=10)
+                try:
+                    xml = r.json()['query']['export']['*']
+                except KeyError:
+                    xml = None
+            if not xml:
+                # Do without a generator, use our usual trick of a random page title
+                r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&titles=' + randomtitle, timeout=10)
+                xml = r.text
+                # Again try without exportnowrap
+                if not xml:
+                    r = session.get(config['api'] + '?action=query&export=1&format=json&titles=' + randomtitle, timeout=10)
+                    try:
+                        xml = r.json()['query']['export']['*']
+                    except KeyError:
+                        xml = None
         except requests.exceptions.RetryError:
             pass
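
For reference, the r.json()['query']['export']['*'] lookups rely on the legacy JSON result format, where the dump text sits under the '*' key; roughly (abridged, illustrative values):

    response = {
        'query': {
            'export': {
                '*': '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.6/" ...>'
            }
        }
    }
    xml = response['query']['export']['*']  # KeyError when the export key is absent

which is why the except KeyError branches reset xml to None, so the next fallback still fires.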
@@ -1302,7 +1316,7 @@ def getImageNamesAPI(config={}, session=None):
             url = curateImageURL(config=config, url=url)
             # encoding to ascii is needed to work around this horrible bug:
             # http://bugs.python.org/issue8136
-            if 'api' in config and '.wikia.com' in config['api']:
+            if 'api' in config and ('.wikia.' in config['api'] or '.fandom.com' in config['api']):
                 #to avoid latest?cb=20120816112532 in filenames
                 filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')), 'utf-8')
             else:
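
The [-3] index is tied to the shape of Wikia/Fandom image URLs, where the filename is followed by a /revision/latest?cb=... suffix. A quick illustration with a made-up vignette URL (hypothetical, for the sake of the example):

    url = 'https://vignette.wikia.nocookie.net/somewiki/images/a/ab/Example_image.png/revision/latest?cb=20120816112532'
    parts = url.split('/')
    # parts[-1] is 'latest?cb=20120816112532' and parts[-2] is 'revision',
    # so the actual filename sits at parts[-3]:
    print(parts[-3])  # Example_image.png

This is also why the comment above mentions avoiding latest?cb=20120816112532 in filenames: taking [-1] would name the file after the cache-buster suffix.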
