@@ -130,21 +130,22 @@ def getXML(domain='', title='', curonly=False):
     f = urllib2.urlopen(req)
     xml = f.read()
 
-    #if complete history, check if this page history has > 1000 edits, if so, retrieve all using offset
-    if not curonly:
-        xml2 = xml
-        while len(re.findall(r'<revision>', xml2)) == limit:
-            #try to retrieve more, although perhaps it is exact 1000 edits
-            params['offset'] = re.findall(r'<timestamp>([^<]+)</timestamp>', xml2)[-1]
+    #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
+    #else, warning about Special:Export truncating large page histories
+    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
+    if not curonly and re.search(r_timestamp, xml): # to avoid empty pages: Special:Allpages and the random one
+        params['offset'] = re.findall(r_timestamp, xml)[-1]
+        while params['offset'] == re.findall(r_timestamp, xml)[-1]:
             data = urllib.urlencode(params)
             req2 = urllib2.Request(url=domain, data=data, headers=headers)
             f2 = urllib2.urlopen(req2)
             xml2 = f2.read()
-            if re.findall(r'<timestamp>([^<]+)</timestamp>', xml2)[-1] == params['offset']:
-                print 'ATTENTION: This wiki does not allow some parameters in Special:Export, so, pages with large histories can be truncated'
+            if re.findall(r_timestamp, xml2)[-1] == params['offset']:
+                print 'ATTENTION: This wiki does not allow some parameters in Special:Export, so, pages with large histories may be truncated'
                 break
-            xml = xml.split('</page>')[0]+xml2.split('<page>\n')[1]
-            print title, len(xml2), re.findall('<timestamp>[^<]+</timestamp>', xml2)
+            else:
+                xml = xml.split('</page>')[0]+xml2.split('<page>\n')[1]
+                print title, len(xml2), re.findall(r_timestamp, xml2)
     return xml
 
 def cleanXML(xml=''):