From 8a1cf47a3bfe86f1bd3b15b77dde06a9d50d7aa7 Mon Sep 17 00:00:00 2001 From: emijrp Date: Tue, 5 Apr 2011 23:00:33 +0000 Subject: [PATCH] git-svn-id: https://wikiteam.googlecode.com/svn/trunk@7 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 --- dumpgenerator.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 5ce3baa..5e39893 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -130,21 +130,22 @@ def getXML(domain='', title='', curonly=False): f = urllib2.urlopen(req) xml = f.read() - #if complete history, check if this page history has > 1000 edits, if so, retrieve all using offset - if not curonly: - xml2 = xml - while len(re.findall(r'', xml2)) == limit: - #try to retrieve more, although perhaps it is exact 1000 edits - params['offset'] = re.findall(r'([^<]+)', xml2)[-1] + #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available + #else, warning about Special:Export truncating large page histories + r_timestamp = r'([^<]+)' + if not curonly and re.search(r_timestamp, xml): # to avoid empty pages: Special:Allpages and the random one + params['offset'] = re.findall(r_timestamp, xml)[-1] + while params['offset'] == re.findall(r_timestamp, xml)[-1]: data = urllib.urlencode(params) req2 = urllib2.Request(url=domain, data=data, headers=headers) f2 = urllib2.urlopen(req2) xml2 = f2.read() - if re.findall(r'([^<]+)', xml2)[-1] == params['offset']: - print 'ATTENTION: This wiki does not allow some parameters in Special:Export, so, pages with large histories can be truncated' + if re.findall(r_timestamp, xml2)[-1] == params['offset']: + print 'ATTENTION: This wiki does not allow some parameters in Special:Export, so, pages with large histories may be truncated' break - xml = xml.split('')[0]+xml2.split('\n')[1] - print title, len(xml2), re.findall('[^<]+', xml2) + else: + xml = xml.split('')[0]+xml2.split('\n')[1] + print title, len(xml2), re.findall(r_timestamp, xml2) return xml def cleanXML(xml=''):