diff --git a/dumpgenerator.py b/dumpgenerator.py
index 8a55e0f..1c99c24 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -45,6 +45,8 @@ import urllib
__VERSION__ = '0.3.0-alpha' # major, minor, micro: semver.org
+class PageMissingError(Exception):
+ pass
def getVersion():
return(__VERSION__)
@@ -387,10 +389,10 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
- xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session)
- header = xml.split('</mediawiki>')[0]
- if not xml:
+ try:
+ xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
+ header = xml.split('</mediawiki>')[0]
+ except PageMissingError:
print 'XML export on this wiki is broken, quitting.'
sys.exit()
return header
@@ -399,12 +401,7 @@ def getXMLHeader(config={}, session=None):
def getXMLFileDesc(config={}, title='', session=None):
""" Get XML for image description page """
config['curonly'] = 1 # tricky to get only the most recent desc
- return getXMLPage(
- config=config,
- title=title,
- verbose=False,
- session=session
- )
+ return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
def getUserAgent():
@@ -510,10 +507,18 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
params['templates'] = 1
xml = getXMLPageCore(params=params, config=config, session=session)
+ if not xml:
+ raise PageMissingError
+
+ yield xml.split("</page>")[0]
# if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
# else, warning about Special:Export truncating large page histories
r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+
+ numberofedits = 0
+ numberofedits += len(re.findall(r_timestamp, xml))
+
# search for timestamps in xml to avoid analysing empty pages like
# Special:Allpages and the random one
if not config['curonly'] and re.search(r_timestamp, xml):
@@ -546,26 +551,27 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
"""
# offset is OK in this wiki, merge with the previous chunk
# of this page history and continue
- xml = xml.split(
- '</page>')[0] + '  <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+ xml2 = xml2.split("</page>")[0]
+ yield '  <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+ xml = xml2
+ numberofedits += len(re.findall(r_timestamp, xml))
else:
params['offset'] = '' # no more edits in this page history
+ yield "</page>\n"
if verbose:
- numberofedits = len(re.findall(r_timestamp, xml))
if (numberofedits == 1):
- print ' %s, 1 edit' % (title.encode('utf-8'))
+ print ' %s, 1 edit' % (title.encode('utf-8'))
else:
- print ' %s, %d edits' % (title.encode('utf-8'), numberofedits)
-
- return xml
+ print ' %s, %d edits' % (title.encode('utf-8'), numberofedits)
def cleanXML(xml=''):
""" Trim redundant info """
# do not touch XML codification, leave AS IS
- if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml):
+ if re.search(r'</siteinfo>\n', xml):
xml = xml.split('</siteinfo>\n')[1]
+ if re.search(r'</mediawiki>', xml):
xml = xml.split('</mediawiki>')[0]
return xml
@@ -627,9 +633,11 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
- xml = getXMLPage(config=config, title=title, session=session)
- xml = cleanXML(xml=xml)
- if not xml:
+ try:
+ for xml in getXMLPage(config=config, title=title, session=session):
+ xml = cleanXML(xml=xml)
+ xmlfile.write(xml.encode('utf-8'))
+ except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
@@ -639,7 +647,6 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
- xmlfile.write(xml.encode('utf-8'))
c += 1
xmlfile.write(footer)
xmlfile.close()