# Patch against dumpgenerator.py with two hunks.
#
# Hunk 1 (inside getXMLRevisions): the direct
# `yield makeXmlFromPage(pages[page])` is replaced by a try/except. On
# PageMissingError the page is reported through logerror(config=..., text=...)
# and skipped with `continue`; otherwise the generated XML is yielded.
#
# Hunk 2 (makeXmlFromPage): the element construction is wrapped in
# `try: ... except KeyError:` and a KeyError is converted into
# PageMissingError(page['title'], ''). The hunk also adds the closing
# parenthesis that was missing from
# `revision.append(E.model(rev['contentmodel'])` on the removed side.
#
# NOTE(review): the log message in hunk 1 formats `title.decode('utf-8')`, but
# the loop variable in the visible context is `page` and nothing in this hunk
# assigns `title` -- confirm it is bound in the enclosing function, otherwise
# the handler raises NameError instead of logging.
# NOTE(review): the KeyError handler in hunk 2 evaluates page['title'] again;
# if 'title' is the key that was missing, that lookup raises a second KeyError
# from inside the handler and PageMissingError is never raised.
# NOTE(review): the try body covers every E.* / to_unicode call, so a KeyError
# raised inside one of those helpers would presumably also be reported as a
# missing page -- TODO confirm this is intended.
# NOTE(review): in this copy the patch's line breaks are collapsed into spaces;
# presumably they must be restored before the patch can be applied.
diff --git a/dumpgenerator.py b/dumpgenerator.py index 6eac8f5..6b70dcd 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -871,7 +871,15 @@ def getXMLRevisions(config={}, session=None, allpages=False): for result in results: pages = result['query']['pages'] for page in pages: - yield makeXmlFromPage(pages[page]) + try: + xml = makeXmlFromPage(pages[page]) + except PageMissingError: + logerror( + config=config, + text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8')) + ) + continue + yield xml except wikitools.api.APIError: print "This wikitools version seems not to work for us. Exiting." @@ -879,28 +887,31 @@ def getXMLRevisions(config={}, session=None, allpages=False): def makeXmlFromPage(page): """ Output an XML document as a string from a page as in the API JSON """ - p = E.page( - E.title(page['title']), - E.ns(to_unicode(page['ns'])), - E.id(to_unicode(page['pageid'])), - ) - for rev in page['revisions']: - revision = E.revision( - E.id(to_unicode(rev['revid'])), - E.timestamp(rev['timestamp']), - E.contributor( - E.id(to_unicode(rev['userid'])), - E.username(to_unicode(rev['user'])), - ), - E.comment(rev['comment']), - E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])), + try: + p = E.page( + E.title(page['title']), + E.ns(to_unicode(page['ns'])), + E.id(to_unicode(page['pageid'])), ) - if 'contentmodel' in rev: - revision.append(E.model(rev['contentmodel']) - # The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia). 
- if 'sha1' in rev: - revision.append(E.sha1(rev['sha1'])) - p.append(revision) + for rev in page['revisions']: + revision = E.revision( + E.id(to_unicode(rev['revid'])), + E.timestamp(rev['timestamp']), + E.contributor( + E.id(to_unicode(rev['userid'])), + E.username(to_unicode(rev['user'])), + ), + E.comment(rev['comment']), + E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])), + ) + if 'contentmodel' in rev: + revision.append(E.model(rev['contentmodel'])) + # The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia). + if 'sha1' in rev: + revision.append(E.sha1(rev['sha1'])) + p.append(revision) + except KeyError: + raise PageMissingError(page['title'], '') return etree.tostring(p, pretty_print=True) def readTitles(config={}, start=None):