From 11507e931e4d6682d0c0a766fd0ea8833edb86e4 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 14:19:23 +0200 Subject: [PATCH 01/14] Initial switch to mwclient for the xmlrevisions option * Still maintained and available for python 3 as well. * Allows raw API requests as we need. * Does not provide handy generators, we need to do continuation. * Decides on its own which protocol and exact path to use, fails at it. * Appears to use POST by default unless asked otherwise, what to do? --- dumpgenerator.py | 96 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index e2d8082..4a87ded 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -46,9 +46,9 @@ except ImportError: print "Please install or update the Requests module." sys.exit(1) try: - import wikitools + import mwclient except ImportError: - print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions." + print "Please install the mwclient module if you want to use --xmlrevisions." try: from lxml import etree from lxml.builder import E @@ -714,8 +714,18 @@ def getXMLPage(config={}, title='', verbose=True, session=None): print ' %s, %d edits' % (title.strip(), numberofedits) +def makeXmlPageFromRaw(xml): + """ Discard the metadata around a element in string""" + root = etree.XML(xml) + find = etree.XPath("//*[local-name() = 'page']") + # The tag will inherit the namespace, like: + # + # FIXME: pretty_print doesn't seem to work, only adds a newline + return etree.tostring(find(root)[0], pretty_print=True) + + def cleanXML(xml=''): - """ Trim redundant info """ + """ Trim redundant info from the XML however it comes """ # do not touch XML codification, leave AS IS if re.search(r'\n', xml): xml = xml.split('\n')[1] @@ -748,8 +758,9 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): print "%d more revisions exported" % numrevs xml = cleanXML(xml=xml) xmlfile.write(xml.encode('utf-8')) - except AttributeError: - print "This wikitools module version is not working" + except AttributeError as e: + print(e) + print "This API library version is not working" sys.exit() else: print 'Retrieving the XML for every page from "%s"' % (start and start or 'start') @@ -797,7 +808,10 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): print 'XML dump saved at...', xmlfilename def getXMLRevisions(config={}, session=None, allpages=False): - site = wikitools.wiki.Wiki(config['api']) + apiurl = urlparse(config['api']) + # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP? + # https://github.com/WikiTeam/wikiteam/issues/358 + site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", "")) if not 'all' in config['namespaces']: namespaces = config['namespaces'] else: @@ -806,6 +820,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): try: for namespace in namespaces: print "Trying to export all revisions from namespace %s" % namespace + # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!) arvparams = { 'action': 'query', 'list': 'allrevisions', @@ -817,46 +832,71 @@ def getXMLRevisions(config={}, session=None, allpages=False): # Skip flags, presumably needed to add which is in the schema. # Also missing: parentid and contentformat. 
arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content' - arvrequest = wikitools.api.APIRequest(site, arvparams) - results = arvrequest.queryGen() - for result in results: - for page in result['query']['allrevisions']: + print("Trying to get wikitext from the allrevisions API and to build the XML") + while True: + arvrequest = site.api(**arvparams) + for page in arvrequest['query']['allrevisions']: yield makeXmlFromPage(page) + if 'continue' in arvrequest: + arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] + else: + # End of continuation. We are done with this namespace. + break + else: + # FIXME: this is not curonly, just different strategy to do all revisions # Just cycle through revision IDs and use the XML as is + print("Trying to list the revisions and to export them one by one") + # We only need the revision ID, all the rest will come from the raw export arvparams['arvprop'] = 'ids' - arvrequest = wikitools.api.APIRequest(site, arvparams) - arvresults = arvrequest.queryGen() - for result in arvresults: + # Repeat the arvrequest with new arvparams until done + while True: + # Reset revision IDs from the previous batch from arv revids = [] - for page in result['query']['allrevisions']: + # Get the new ones + arvrequest = site.api(**arvparams) + for page in arvrequest['query']['allrevisions']: for revision in page['revisions']: revids.append(str(revision['revid'])) print "%d more revisions listed, until %s" % (len(revids), revids[-1]) - + # We can now get the XML for one revision at a time + # FIXME: we can actually get them in batches as we used to + # but need to figure out the continuation and avoid that the API + # chooses to give us only the latest for each page exportparams = { 'action': 'query', - 'revids': '|'.join(revids), 'export': '1', } - exportrequest = wikitools.api.APIRequest(site, exportparams) - exportresults = exportrequest.queryGen() - for exportresult in exportresults: - yield exportresult['query']['export']['*'] + for revid in revids: + exportparams['revids'] = revid + exportrequest = site.api(**exportparams) + # This gives us a self-standing element + # but we only need the inner : we can live with + # duplication and non-ordering of page titles, but the + # repeated header is confusing and would not even be valid + xml = exportrequest['query']['export']['*'] + yield makeXmlPageFromRaw(xml) + + if 'continue' in arvrequest: + arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] + else: + # End of continuation. We are done with this namespace. + break + except KeyError: - print "Warning. Could not use allrevisions, wiki too old." + print "Warning. Could not use allrevisions. Wiki too old?" 
if config['curonly']: + # The raw XML export in the API gets a title and gives the latest revision for title in readTitles(config): exportparams = { 'action': 'query', 'titles': title, 'export': '1', } - exportrequest = wikitools.api.APIRequest(site, exportparams) - exportresults = exportrequest.queryGen() - for exportresult in exportresults: - yield exportresult['query']['export']['*'] + exportrequest = site.api(**exportparams) + xml = exportrequest['query']['export']['*'] + yield makeXmlPageFromRaw(xml) else: for title in readTitles(config): pparams = { @@ -867,7 +907,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', 'rawcontinue': 'yes' } - prequest = wikitools.api.APIRequest(site, pparams) + prequest = site.api(**pparams) try: results = prequest.query() pages = results['query']['pages'] @@ -884,8 +924,8 @@ def getXMLRevisions(config={}, session=None, allpages=False): continue yield xml - except wikitools.api.APIError: - print "This wikitools version seems not to work for us. Exiting." + except mwclient.errors.MwClientError: + print "This mwclient version seems not to work for us. Exiting." sys.exit() def makeXmlFromPage(page): From 3760501f74bc8007b56a8d7c1e0581588131feb5 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 15:25:40 +0200 Subject: [PATCH 02/14] Add a couple comments --- dumpgenerator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dumpgenerator.py b/dumpgenerator.py index 4a87ded..59221bc 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -808,6 +808,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): print 'XML dump saved at...', xmlfilename def getXMLRevisions(config={}, session=None, allpages=False): + # FIXME: actually figure out the various strategies for each MediaWiki version apiurl = urlparse(config['api']) # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP? 
# https://github.com/WikiTeam/wikiteam/issues/358 @@ -889,6 +890,8 @@ def getXMLRevisions(config={}, session=None, allpages=False): if config['curonly']: # The raw XML export in the API gets a title and gives the latest revision for title in readTitles(config): + # TODO: as we're doing one page and revision at a time, + # we might as well use xml format and exportnowrap=1 exportparams = { 'action': 'query', 'titles': title, From f10adb71af60697d7c7c8860ae768b21c54840ba Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 15:27:10 +0200 Subject: [PATCH 03/14] Don't try to add revisions if the namespace has none Traceback (most recent call last): File "dumpgenerator.py", line 2362, in File "dumpgenerator.py", line 2354, in main resumePreviousDump(config=config, other=other) File "dumpgenerator.py", line 1921, in createNewDump getPageTitles(config=config, session=other['session']) File "dumpgenerator.py", line 755, in generateXMLDump for xml in getXMLRevisions(config=config, session=session): File "dumpgenerator.py", line 861, in getXMLRevisions revids.append(str(revision['revid'])) IndexError: list index out of range --- dumpgenerator.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 59221bc..08f6400 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -850,12 +850,14 @@ def getXMLRevisions(config={}, session=None, allpages=False): print("Trying to list the revisions and to export them one by one") # We only need the revision ID, all the rest will come from the raw export arvparams['arvprop'] = 'ids' + arvrequest = site.api(**arvparams) + # Skip the namespace if it's empty + if len(arvrequest['query']['allrevisions']) < 1: + continue # Repeat the arvrequest with new arvparams until done while True: # Reset revision IDs from the previous batch from arv revids = [] - # Get the new ones - arvrequest = site.api(**arvparams) for page in arvrequest['query']['allrevisions']: for revision in page['revisions']: revids.append(str(revision['revid'])) @@ -879,19 +881,22 @@ def getXMLRevisions(config={}, session=None, allpages=False): yield makeXmlPageFromRaw(xml) if 'continue' in arvrequest: + # Get the new ones arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] + arvrequest = site.api(**arvparams) else: # End of continuation. We are done with this namespace. break - except KeyError: + # TODO: check whether the KeyError was really for a missing arv API print "Warning. Could not use allrevisions. Wiki too old?" if config['curonly']: # The raw XML export in the API gets a title and gives the latest revision for title in readTitles(config): - # TODO: as we're doing one page and revision at a time, - # we might as well use xml format and exportnowrap=1 + # TODO: as we're doing one page and revision at a time, we might + # as well use xml format and exportnowrap=1 to use the string of, + # XML as is, but need to check how well the library handles it. exportparams = { 'action': 'query', 'titles': title, @@ -899,8 +904,13 @@ def getXMLRevisions(config={}, session=None, allpages=False): } exportrequest = site.api(**exportparams) xml = exportrequest['query']['export']['*'] + # Because we got the fancy XML from the JSON format, clean it: yield makeXmlPageFromRaw(xml) else: + # This is the closest to what we usually do with Special:Export: + # take one title at a time and try to get all revisions exported. 
+ # The XML needs to be made manually because the export=1 option + # refuses to return an arbitrary number of revisions (see above). for title in readTitles(config): pparams = { 'action': 'query', @@ -908,7 +918,6 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'prop': 'revisions', 'rvlimit': 'max', 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', - 'rawcontinue': 'yes' } prequest = site.api(**pparams) try: From 6b12e20a9d97417c13f00de77de9d8053b4a641e Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 15:46:05 +0200 Subject: [PATCH 04/14] Actually convert the titles query method to mwclient too --- dumpgenerator.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 08f6400..4054b10 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -892,7 +892,8 @@ def getXMLRevisions(config={}, session=None, allpages=False): # TODO: check whether the KeyError was really for a missing arv API print "Warning. Could not use allrevisions. Wiki too old?" if config['curonly']: - # The raw XML export in the API gets a title and gives the latest revision + # The raw XML export in the API gets a title and gives the latest revision. + # We could also use the allpages API as generator but let's be consistent. for title in readTitles(config): # TODO: as we're doing one page and revision at a time, we might # as well use xml format and exportnowrap=1 to use the string of, @@ -909,6 +910,8 @@ def getXMLRevisions(config={}, session=None, allpages=False): else: # This is the closest to what we usually do with Special:Export: # take one title at a time and try to get all revisions exported. + # It differs from the allrevisions method because it actually needs + # to be input the page titles; otherwise, the requests are similar. # The XML needs to be made manually because the export=1 option # refuses to return an arbitrary number of revisions (see above). for title in readTitles(config): @@ -920,21 +923,31 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', } prequest = site.api(**pparams) + # The array is called "pages" even if there's only one. + # TODO: we could actually batch titles a bit here if desired. How many? try: - results = prequest.query() - pages = results['query']['pages'] + pages = prequest['query']['pages'] except KeyError: raise PageMissingError(title, xml='') - for page in pages: - try: - xml = makeXmlFromPage(pages[page]) - except PageMissingError: - logerror( - config=config, - text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8')) - ) - continue - yield xml + # Be ready to iterate if there is continuation. + while True: + # Go through the data we got to build the XML. + for page in pages: + try: + xml = makeXmlFromPage(pages[page]) + except PageMissingError: + logerror( + config=config, + text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8')) + ) + continue + yield xml + + # Get next batch of revisions if there's more. + if 'continue' in prequest: + pparams['rvcontinue'] = prequest['rvcontinue'] + prequest = site.api(**pparams) + except mwclient.errors.MwClientError: print "This mwclient version seems not to work for us. Exiting." 
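The four patches above share one pattern: build a plain dict of API parameters, hand it to mwclient's raw site.api() call, and drive continuation by hand through the 'continue'/'arvcontinue' keys instead of relying on wikitools' queryGen(). The following is a minimal sketch of that loop, not part of the patches, under the same assumptions as patch 01 (Python 2, mwclient installed, api.php reachable over the default scheme); the wiki URL is a placeholder, and scheme= is only passed explicitly from patch 07 onwards:

    # Sketch only: list=allrevisions via mwclient's raw API call, with manual continuation.
    from urlparse import urlparse  # urllib.parse on Python 3

    import mwclient

    apiurl = urlparse("https://wiki.example.org/w/api.php")  # placeholder URL
    site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""))

    arvparams = {
        'action': 'query',
        'list': 'allrevisions',
        'arvnamespace': 0,   # one namespace at a time, as dumpgenerator.py does
        'arvlimit': 50,
        'arvprop': 'ids|timestamp|user|comment|content',
    }
    while True:
        arvrequest = site.api(**arvparams)
        for page in arvrequest['query']['allrevisions']:
            # Each entry carries the page title plus its batch of revisions;
            # dumpgenerator.py turns this JSON into <page> XML with makeXmlFromPage().
            print("%s: %d revisions in this batch" % (page['title'], len(page['revisions'])))
        if 'continue' in arvrequest:
            # Resume exactly where the previous batch stopped.
            arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
        else:
            break  # no continuation left: this namespace is done

Patch 04 applies the same structure, keyed on rvcontinue, to the per-title prop=revisions fallback; patch 12 later corrects the exact key the rvcontinue value is read from.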
From 0f35d03929883aee374f2d2049df56b4a368bb93 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 16:06:54 +0200 Subject: [PATCH 05/14] Remove rvlimit=max, fails in MediaWiki 1.16 For instance: "Exception Caught: Internal error in ApiResult::setElement: Attempting to add element revisions=50, existing value is 500" https://wiki.rabenthal.net/api.php?action=query&prop=revisions&titles=Hauptseite&rvprop=ids&rvlimit=max --- dumpgenerator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 4054b10..f628843 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -915,11 +915,12 @@ def getXMLRevisions(config={}, session=None, allpages=False): # The XML needs to be made manually because the export=1 option # refuses to return an arbitrary number of revisions (see above). for title in readTitles(config): + # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded: + # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}} pparams = { 'action': 'query', 'titles': title, 'prop': 'revisions', - 'rvlimit': 'max', 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', } prequest = site.api(**pparams) From 9ec6ce42d33d2dcaa12d59e0389467e839e30ace Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 16:42:22 +0200 Subject: [PATCH 06/14] Finish xmlrevisions option for older wikis * Actually proceed to the next page when no continuation. * Provide the same output as with the usual per-page export. Tested on a MediaWiki 1.16 wiki with success. --- dumpgenerator.py | 76 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 13 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index f628843..c6717df 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -755,6 +755,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): for xml in getXMLRevisions(config=config, session=session): numrevs = len(re.findall(r_timestamp, xml)) # Due to how generators work, it's expected this may be less + # TODO: get the page title and reuse the usual format "X title, y edits" print "%d more revisions exported" % numrevs xml = cleanXML(xml=xml) xmlfile.write(xml.encode('utf-8')) @@ -835,7 +836,18 @@ def getXMLRevisions(config={}, session=None, allpages=False): arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content' print("Trying to get wikitext from the allrevisions API and to build the XML") while True: - arvrequest = site.api(**arvparams) + try: + arvrequest = site.api(**arvparams) + except requests.exceptions.ReadTimeout as err: + # Hopefully temporary, just wait a bit and continue with the same request. + # No point putting a limit to retries, we'd need to abort everything. + # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient + # to use the retry adapter we use for our own requests session? 
+ print("ERROR: {}".format(str(err))) + print("Sleeping for 20 seconds") + time.sleep(20) + continue + for page in arvrequest['query']['allrevisions']: yield makeXmlFromPage(page) if 'continue' in arvrequest: @@ -851,6 +863,10 @@ def getXMLRevisions(config={}, session=None, allpages=False): # We only need the revision ID, all the rest will come from the raw export arvparams['arvprop'] = 'ids' arvrequest = site.api(**arvparams) + exportparams = { + 'action': 'query', + 'export': '1', + } # Skip the namespace if it's empty if len(arvrequest['query']['allrevisions']) < 1: continue @@ -862,14 +878,11 @@ def getXMLRevisions(config={}, session=None, allpages=False): for revision in page['revisions']: revids.append(str(revision['revid'])) print "%d more revisions listed, until %s" % (len(revids), revids[-1]) + # We can now get the XML for one revision at a time # FIXME: we can actually get them in batches as we used to # but need to figure out the continuation and avoid that the API # chooses to give us only the latest for each page - exportparams = { - 'action': 'query', - 'export': '1', - } for revid in revids: exportparams['revids'] = revid exportrequest = site.api(**exportparams) @@ -883,7 +896,16 @@ def getXMLRevisions(config={}, session=None, allpages=False): if 'continue' in arvrequest: # Get the new ones arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] - arvrequest = site.api(**arvparams) + try: + arvrequest = site.api(**arvparams) + except requests.exceptions.ReadTimeout as err: + # As above + print("ERROR: {}".format(str(err))) + print("Sleeping for 20 seconds") + time.sleep(20) + # But avoid rewriting the same revisions + arvrequest['query']['allrevisions'] = [] + continue else: # End of continuation. We are done with this namespace. break @@ -894,7 +916,11 @@ def getXMLRevisions(config={}, session=None, allpages=False): if config['curonly']: # The raw XML export in the API gets a title and gives the latest revision. # We could also use the allpages API as generator but let's be consistent. + print("Getting titles to export the latest revision for each") + c = 0 for title in readTitles(config): + # TODO: respect verbose flag, reuse output from getXMLPage + print(' {}'.format(title.strip())) # TODO: as we're doing one page and revision at a time, we might # as well use xml format and exportnowrap=1 to use the string of, # XML as is, but need to check how well the library handles it. @@ -905,6 +931,9 @@ def getXMLRevisions(config={}, session=None, allpages=False): } exportrequest = site.api(**exportparams) xml = exportrequest['query']['export']['*'] + c += 1 + if c % 10 == 0: + print('Downloaded {} pages'.format(c)) # Because we got the fancy XML from the JSON format, clean it: yield makeXmlPageFromRaw(xml) else: @@ -914,16 +943,23 @@ def getXMLRevisions(config={}, session=None, allpages=False): # to be input the page titles; otherwise, the requests are similar. # The XML needs to be made manually because the export=1 option # refuses to return an arbitrary number of revisions (see above). + print("Getting titles to export all the revisions of each") + c = 0 for title in readTitles(config): + print(' {}'.format(title.strip())) # Try and ask everything. 
At least on MediaWiki 1.16, uknown props are discarded: # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}} pparams = { 'action': 'query', 'titles': title, 'prop': 'revisions', + 'rvlimit': 50, 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', } prequest = site.api(**pparams) + c += 1 + if c % 10 == 0: + print('Downloaded {} pages'.format(c)) # The array is called "pages" even if there's only one. # TODO: we could actually batch titles a bit here if desired. How many? try: @@ -936,18 +972,21 @@ def getXMLRevisions(config={}, session=None, allpages=False): for page in pages: try: xml = makeXmlFromPage(pages[page]) + yield xml except PageMissingError: logerror( config=config, text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8')) ) continue - yield xml # Get next batch of revisions if there's more. if 'continue' in prequest: + print("Getting more revisions for page {}".format(title)) pparams['rvcontinue'] = prequest['rvcontinue'] prequest = site.api(**pparams) + else: + break except mwclient.errors.MwClientError: @@ -958,30 +997,41 @@ def makeXmlFromPage(page): """ Output an XML document as a string from a page as in the API JSON """ try: p = E.page( - E.title(page['title']), + E.title(to_unicode(page['title'])), E.ns(to_unicode(page['ns'])), E.id(to_unicode(page['pageid'])), ) for rev in page['revisions']: + # Older releases like MediaWiki 1.16 do not return all fields. + if 'userid' in rev: + userid = rev['userid'] + else: + userid = 0 + if 'size' in rev: + size = rev['size'] + else: + size = 0 revision = E.revision( E.id(to_unicode(rev['revid'])), E.parentid(to_unicode(rev['parentid'])), E.timestamp(rev['timestamp']), E.contributor( - E.id(to_unicode(rev['userid'])), + E.id(to_unicode(userid)), E.username(to_unicode(rev['user'])), ), - E.comment(rev['comment']), - E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])), + E.text(rev['*'], space="preserve", bytes=to_unicode(size)), ) + if 'comment' in rev: + revision.append(E.comment(to_unicode(rev['comment']))) if 'contentmodel' in rev: revision.append(E.model(rev['contentmodel'])) # The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia). if 'sha1' in rev: revision.append(E.sha1(rev['sha1'])) p.append(revision) - except KeyError: - raise PageMissingError(page['title'], '') + except KeyError as e: + print(e) + raise PageMissingError(page['title'], e) return etree.tostring(p, pretty_print=True) def readTitles(config={}, start=None): From f0436ee57cf92a1c3d303102797111d8400c7c43 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 17:59:03 +0200 Subject: [PATCH 07/14] Make mwclient respect the provided HTTP/HTTPS scheme Fixes https://github.com/WikiTeam/wikiteam/issues/358 --- dumpgenerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index c6717df..6aa2c9e 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -813,7 +813,7 @@ def getXMLRevisions(config={}, session=None, allpages=False): apiurl = urlparse(config['api']) # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP? 
# https://github.com/WikiTeam/wikiteam/issues/358 - site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", "")) + site = mwclient.Site(apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme) if not 'all' in config['namespaces']: namespaces = config['namespaces'] else: From becd01b2714a51cd25e2f2f06800c4df5e2ccfec Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 18:00:43 +0200 Subject: [PATCH 08/14] Use defined requests.exceptions.ConnectionError Fixes https://github.com/WikiTeam/wikiteam/issues/356 --- dumpgenerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 6aa2c9e..87eca91 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -273,7 +273,7 @@ def getPageTitlesAPI(config={}, session=None): try: r = session.get(url=config['api'], params=params, timeout=30) break - except ConnectionError as err: + except requests.exceptions.ConnectionError as err: print "Connection error: %s" % (str(err),) retryCount += 1 time.sleep(20) From 0b37b39923c7258ac044730a2920a3263bfbd17f Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 18:05:42 +0200 Subject: [PATCH 09/14] Define xml header as empty first so that it can fail graciously Fixes https://github.com/WikiTeam/wikiteam/issues/355 --- dumpgenerator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dumpgenerator.py b/dumpgenerator.py index 87eca91..1b863ed 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -463,6 +463,7 @@ def getXMLHeader(config={}, session=None): # xmlns:x.... randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ print config['api'] + xml = '' if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"): xml = None try: From 1645c1d83272b726017dfbdcf4a5a5516135e85b Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 18:18:26 +0200 Subject: [PATCH 10/14] More robust XML header fetch for getXMLHeader() Avoid UnboundLocalError: local variable 'xml' referenced before assignment If the page exists, its XML export is returned by the API; otherwise only the header that we were looking for. Fixes https://github.com/WikiTeam/wikiteam/issues/355 --- dumpgenerator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 1b863ed..07301bf 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -468,10 +468,12 @@ def getXMLHeader(config={}, session=None): xml = None try: print 'Getting the XML header from the API' - r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10) - xml = r.json()['query']['export']['*'] + # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.18 + r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10) + xml = r.text if not xml: - r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10) + # Do without a generator, use our usual trick of a random page title + r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&titles=' + randomtitle, timeout=10) xml = r.text except requests.exceptions.RetryError: pass From 92da7388b0301cb496526c027b624066295f5f61 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 18:33:34 +0200 Subject: [PATCH 11/14] Avoid asking allpages API if API not available So that it doesn't have to iterate among non-existing titles. 
Fixes https://github.com/WikiTeam/wikiteam/issues/348 --- dumpgenerator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 07301bf..2fc8374 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -213,7 +213,7 @@ def getNamespacesAPI(config={}, session=None): try: nsquery = result['query']['namespaces'] except KeyError: - print "Error: could not get namespaces from the API request" + print "Error: could not get namespaces from the API request." print "HTTP %d" % r.status_code print r.text return None @@ -1741,6 +1741,7 @@ def getParameters(params=[]): else: if index and not args.wiki: print 'API not available. Trying with index.php only.' + args.api = None else: print 'Error in API. Please, provide a correct path to API' sys.exit(1) From 8b5378f9910d433343aecb35c3f7501c2f465ddd Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 19:33:10 +0200 Subject: [PATCH 12/14] Fix query prop=revisions continuation in MediaWiki 1.22 This wiki has the old query-continue format but it's not exposes here. --- dumpgenerator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 2fc8374..58c55cf 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -984,10 +984,13 @@ def getXMLRevisions(config={}, session=None, allpages=False): continue # Get next batch of revisions if there's more. - if 'continue' in prequest: + if 'continue' in prequest.keys(): print("Getting more revisions for page {}".format(title)) - pparams['rvcontinue'] = prequest['rvcontinue'] + pparams['rvcontinue'] = prequest['continue']['rvcontinue'] prequest = site.api(**pparams) + # mwclient seems to rewrite query-continue + #if 'query-continue' in prequest.keys(): + # pparams['rvcontinue'] = prequest['query-continue']['revisions']['rvcontinue'] else: break From 49017e3f209db2e6a897ac19fc6ade92431fcab8 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 20:50:02 +0200 Subject: [PATCH 13/14] Catch HTTP Error 405 and switch from POST to GET for API requests Seen on http://wiki.ainigma.eu/index.php?title=Hlavn%C3%AD_strana: HTTPError: HTTP Error 405: Method Not Allowed --- dumpgenerator.py | 58 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 58c55cf..3ba9222 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -840,7 +840,12 @@ def getXMLRevisions(config={}, session=None, allpages=False): print("Trying to get wikitext from the allrevisions API and to build the XML") while True: try: - arvrequest = site.api(**arvparams) + arvrequest = site.api(http_method=config['http_method'], **arvparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + continue except requests.exceptions.ReadTimeout as err: # Hopefully temporary, just wait a bit and continue with the same request. # No point putting a limit to retries, we'd need to abort everything. 
@@ -865,7 +870,13 @@ def getXMLRevisions(config={}, session=None, allpages=False): print("Trying to list the revisions and to export them one by one") # We only need the revision ID, all the rest will come from the raw export arvparams['arvprop'] = 'ids' - arvrequest = site.api(**arvparams) + try: + arvrequest = site.api(http_method=config['http_method'], **arvparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + continue exportparams = { 'action': 'query', 'export': '1', @@ -888,7 +899,14 @@ def getXMLRevisions(config={}, session=None, allpages=False): # chooses to give us only the latest for each page for revid in revids: exportparams['revids'] = revid - exportrequest = site.api(**exportparams) + try: + exportrequest = site.api(http_method=config['http_method'], **exportparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + exportrequest = site.api(http_method=config['http_method'], **exportparams) + # This gives us a self-standing element # but we only need the inner : we can live with # duplication and non-ordering of page titles, but the @@ -900,7 +918,12 @@ def getXMLRevisions(config={}, session=None, allpages=False): # Get the new ones arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue'] try: - arvrequest = site.api(**arvparams) + arvrequest = site.api(http_method=config['http_method'], **arvparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + arvrequest = site.api(http_method=config['http_method'], **arvparams) except requests.exceptions.ReadTimeout as err: # As above print("ERROR: {}".format(str(err))) @@ -932,7 +955,14 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'titles': title, 'export': '1', } - exportrequest = site.api(**exportparams) + try: + exportrequest = site.api(http_method=config['http_method'], **exportparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + exportrequest = site.api(http_method=config['http_method'], **exportparams) + xml = exportrequest['query']['export']['*'] c += 1 if c % 10 == 0: @@ -959,7 +989,14 @@ def getXMLRevisions(config={}, session=None, allpages=False): 'rvlimit': 50, 'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content', } - prequest = site.api(**pparams) + try: + prequest = site.api(http_method=config['http_method'], **pparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + exportrequest = site.api(http_method=config['http_method'], **exportparams) + c += 1 if c % 10 == 0: print('Downloaded {} pages'.format(c)) @@ -987,7 +1024,13 @@ def getXMLRevisions(config={}, session=None, allpages=False): if 'continue' in prequest.keys(): print("Getting more revisions for page {}".format(title)) pparams['rvcontinue'] = prequest['continue']['rvcontinue'] - prequest = site.api(**pparams) 
+ try: + prequest = site.api(http_method=config['http_method'], **pparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 405 and config['http_method'] == "POST": + print("POST request to the API failed, retrying with GET") + config['http_method'] = "GET" + prequest = site.api(http_method=config['http_method'], **pparams) # mwclient seems to rewrite query-continue #if 'query-continue' in prequest.keys(): # pparams['rvcontinue'] = prequest['query-continue']['revisions']['rvcontinue'] @@ -1826,6 +1869,7 @@ def getParameters(params=[]): 'date': datetime.datetime.now().strftime('%Y%m%d'), 'api': api, 'failfast': args.failfast, + 'http_method': "POST", 'index': index, 'images': args.images, 'logs': False, From faf0e31b4e33067b80f292186a39c8da91dbab59 Mon Sep 17 00:00:00 2001 From: Federico Leva Date: Mon, 10 Feb 2020 21:19:01 +0200 Subject: [PATCH 14/14] Don't set apfrom in initial allpages request, use suggested continuation Helps with recent MediaWiki versions like 1.31 where variants of "!" can give a bad title error and the continuation wants apcontinue anyway. --- dumpgenerator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 3ba9222..b197fb6 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -257,14 +257,14 @@ def getPageTitlesAPI(config={}, session=None): c = 0 print ' Retrieving titles in the namespace %d' % (namespace) - apfrom = '!' + apfrom = '' while apfrom: sys.stderr.write('.') # progress params = { 'action': 'query', 'list': 'allpages', 'apnamespace': namespace, - 'apfrom': apfrom.encode('utf-8'), + 'apfrom': apfrom, 'format': 'json', 'aplimit': 500}
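Patch 13 repeats the same POST-to-GET fallback around every site.api() call in getXMLRevisions(). A hypothetical helper along these lines (not part of the patch series; the name apiCall is my own, while site.api(), http_method and config['http_method'] are exactly what the patch already uses) would centralize that logic so each call site stays a one-liner:

    # Hypothetical wrapper, sketching how patch 13's repeated HTTP 405 handling
    # could be factored out; it mirrors the calls the patch already makes.
    import requests

    def apiCall(site, config, **params):
        """Call the MediaWiki API via mwclient, falling back from POST to GET on HTTP 405."""
        try:
            return site.api(http_method=config['http_method'], **params)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 405 and config['http_method'] == "POST":
                print("POST request to the API failed, retrying with GET")
                config['http_method'] = "GET"
                return site.api(http_method=config['http_method'], **params)
            raise

Each try/except block added in patch 13 could then collapse to something like arvrequest = apiCall(site, config, **arvparams), while still remembering the switch to GET for the rest of the run, as the patch does by mutating config['http_method']. A single wrapper would also avoid slips like the one in patch 13's prop=revisions branch, where the GET retry appears to re-issue exportparams into exportrequest instead of pparams into prequest.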