From 131e19979c35853a8a58aa97c0f1e69b80d452bb Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 10 Feb 2020 22:13:21 +0200
Subject: [PATCH 1/3] Use mwclient generator for allpages

Tested with MediaWiki 1.31 and 1.19.
---
 dumpgenerator.py | 68 +++++++-----------------------------------------
 1 file changed, 10 insertions(+), 58 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index b197fb6..924a02e 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -257,65 +257,17 @@ def getPageTitlesAPI(config={}, session=None):
 
         c = 0
         print '    Retrieving titles in the namespace %d' % (namespace)
-        apfrom = ''
-        while apfrom:
-            sys.stderr.write('.')  # progress
-            params = {
-                'action': 'query',
-                'list': 'allpages',
-                'apnamespace': namespace,
-                'apfrom': apfrom,
-                'format': 'json',
-                'aplimit': 500}
-
-            retryCount = 0
-            while retryCount < config["retries"]:
-                try:
-                    r = session.get(url=config['api'], params=params, timeout=30)
-                    break
-                except requests.exceptions.ConnectionError as err:
-                    print "Connection error: %s" % (str(err),)
-                    retryCount += 1
-                    time.sleep(20)
-            handleStatusCode(r)
-            # FIXME Handle HTTP errors here!
-            jsontitles = getJSON(r)
-            apfrom = ''
-            if 'query-continue' in jsontitles and 'allpages' in jsontitles[
-                    'query-continue']:
-                if 'apcontinue' in jsontitles['query-continue']['allpages']:
-                    apfrom = jsontitles[
-                        'query-continue']['allpages']['apcontinue']
-                elif 'apfrom' in jsontitles['query-continue']['allpages']:
-                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
-            elif 'continue' in jsontitles:
-                if 'apcontinue' in jsontitles['continue']:
-                    apfrom = jsontitles['continue']['apcontinue']
-                elif 'apfrom' in jsontitles['continue']:
-                    apfrom = jsontitles['continue']['apfrom']
-
-            # print apfrom
-            # print jsontitles
-            try:
-                allpages = jsontitles['query']['allpages']
-            except KeyError:
-                print "The allpages API returned nothing. Exit."
-                sys.exit(1)
+        apiurl = urlparse(config['api'])
+        site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme)
+        for page in site.allpages(namespace=namespace):
+            title = page.name
+            titles.append(title)
+            c += 1
+            yield title
 
-            # Hack for old versions of MediaWiki API where result is dict
-            if isinstance(allpages, dict):
-                allpages = allpages.values()
-            for page in allpages:
-                title = page['title']
-                titles.append(title)
-                yield title
-            c += len(allpages)
-
-            if len(titles) != len(set(titles)):
-                print 'Probably a loop, switching to next namespace. Duplicate title:'
-                print title
-                titles = list(set(titles))
-                apfrom = ''
+        if len(titles) != len(set(titles)):
+            print 'Probably a loop, switching to next namespace'
+            titles = list(set(titles))
 
         delay(config=config, session=session)
         print '    %d titles retrieved in the namespace %d' % (c, namespace)
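
Note on the change above: the rewritten getPageTitlesAPI() hands the allpages
paging (the apfrom/apcontinue continuation) to mwclient instead of driving the
API by hand. A rough standalone sketch of that usage (not part of the patch;
it assumes the mwclient package is installed and uses a placeholder wiki URL),
in the same Python 2 style as dumpgenerator.py:

    from urlparse import urlparse  # Python 2 stdlib
    import mwclient

    api = 'https://wiki.example.org/w/api.php'  # placeholder wiki URL
    apiurl = urlparse(api)
    site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""),
                         scheme=apiurl.scheme)
    for page in site.allpages(namespace=0):
        # mwclient follows the apfrom/apcontinue continuation internally,
        # so no manual query-continue handling is needed here.
        print page.name

This is presumably also why the manual retry loop and the handleStatusCode()
call could be dropped: mwclient does its own waiting and retrying and raises
its own exceptions when a request fails.
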
From 2c21eadf7c456b7ea1efe233d5eeaa45e29d3ee3 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 10 Feb 2020 22:32:01 +0200
Subject: [PATCH 2/3] Wikia: make getXMLHeader() check more lenient

Otherwise we end up using Special:Export even though the export API
would work perfectly well with --xmlrevisions.

May also fix images on fandom.com:
https://github.com/WikiTeam/wikiteam/issues/330
---
 dumpgenerator.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 924a02e..071a458 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -423,10 +423,24 @@ def getXMLHeader(config={}, session=None):
             # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.18
             r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10)
             xml = r.text
+            # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
+            if not xml:
+                r = session.get(config['api'] + '?action=query&export=1&list=allpages&aplimit=1&format=json', timeout=10)
+                try:
+                    xml = r.json()['query']['export']['*']
+                except KeyError:
+                    xml = None
             if not xml:
                 # Do without a generator, use our usual trick of a random page title
                 r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&titles=' + randomtitle, timeout=10)
                 xml = r.text
+            # Again try without exportnowrap
+            if not xml:
+                r = session.get(config['api'] + '?action=query&export=1&format=json&titles=' + randomtitle, timeout=10)
+                try:
+                    xml = r.json()['query']['export']['*']
+                except KeyError:
+                    xml = None
         except requests.exceptions.RetryError:
             pass
 
@@ -1302,7 +1316,7 @@ def getImageNamesAPI(config={}, session=None):
                 url = curateImageURL(config=config, url=url)
                 # encoding to ascii is needed to work around this horrible bug:
                 # http://bugs.python.org/issue8136
-                if 'api' in config and '.wikia.com' in config['api']:
+                if 'api' in config and ('.wikia.' in config['api'] or '.fandom.com' in config['api']):
                     #to avoid latest?cb=20120816112532 in filenames
                     filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')), 'utf-8')
                 else:

From 17283113dda2d5553ef5d65fe83f9afe13e55307 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 10 Feb 2020 22:32:01 +0200
Subject: [PATCH 3/3] Wikia: make getXMLHeader() check more lenient

Otherwise we end up using Special:Export even though the export API
would work perfectly well with --xmlrevisions. For some reason using
the general requests session always got an empty response from the
Wikia API.

May also fix images on fandom.com:
https://github.com/WikiTeam/wikiteam/issues/330
---
 dumpgenerator.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 924a02e..29ccc69 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -417,16 +417,29 @@ def getXMLHeader(config={}, session=None):
     print config['api']
     xml = ''
     if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
-        xml = None
         try:
             print 'Getting the XML header from the API'
             # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.18
-            r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10)
+            r = requests.get(config['api'] + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10)
             xml = r.text
-            if not xml:
+            # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
+            if not re.match(r"\s*<mediawiki", xml):
')[0] if not re.match(r"\s*
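
For reference, the fallback these getXMLHeader() patches add can be exercised
on its own: request the export with exportnowrap first, and if the response
does not look like a <mediawiki ...> document, ask for the JSON-wrapped export
and unwrap it. A rough standalone sketch (not part of the patches; the wiki
URL is a placeholder and error handling is minimal):

    import re
    import requests

    api = 'https://wiki.example.org/api.php'  # placeholder wiki URL
    r = requests.get(api + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10)
    xml = r.text
    if not re.match(r"\s*<mediawiki", xml):
        # Some wikis (e.g. Wikia/fandom.com on MediaWiki 1.19) return a blank
        # page for exportnowrap; the JSON-wrapped export usually still works.
        r = requests.get(api + '?action=query&export=1&list=allpages&aplimit=1&format=json', timeout=10)
        try:
            xml = r.json()['query']['export']['*']
        except KeyError:
            xml = ''
    print xml[:200]  # should start with the <mediawiki ...> export header

Patch 3 also switches from the shared session to plain requests.get() calls
because, per the commit message, the shared session always got an empty
response from the Wikia API.
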