Fix getPageTitlesScraper

Using the API and the Special:Allpages scraper should result in the same number of titles.
Fix the detection of the next subpages on Special:Allpages.
Change the max depth to 100 and implement an anti-loop check (could fail on non-Western wikis).
pull/394/head
Nicolas SAPA 4 years ago
parent 1048bc3275
commit b289f86243
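
The message describes two safeguards for the Special:Allpages scraper: a larger cap on how many continuation subpages are followed, and a check that stops when the same 'from' marker comes back twice in a row. Below is a minimal sketch of that pattern; the function name, the fetch_next_from callable and the sample data are hypothetical stand-ins, not code from dumpgenerator.py.

# Hypothetical sketch: follow Special:Allpages continuation markers,
# stop after a fixed depth, and stop early when the same 'from' marker
# repeats (i.e. the wiki keeps pointing back to the same subpage).
def scrape_allpages(fetch_next_from, start='', max_depth=50):
    collected = []
    oldfr = None
    fr = start
    depth = 0
    while depth < max_depth:
        currfr = fetch_next_from(fr)  # next 'from' marker, or None when exhausted
        if currfr is None or currfr == oldfr:
            # No further subpages, or we are looping: exit instead of retrying forever.
            break
        collected.append(currfr)
        oldfr = currfr
        fr = currfr
        depth += 1
    return collected

# Fake continuation chain that starts repeating after three subpages:
chain = {'': 'Aardvark', 'Aardvark': 'Badger', 'Badger': 'Cattle', 'Cattle': 'Cattle'}
print(scrape_allpages(chain.get))  # ['Aardvark', 'Badger', 'Cattle']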

@@ -299,9 +299,10 @@ def getPageTitlesScraper(config={}, session=None):
         else:
             pass # perhaps no subpages
 
-        # 3 is the current deep of English Wikipedia for Special:Allpages
-        deep = 3
+        # Should be enough subpages on Special:Allpages
+        deep = 50
         c = 0
+        oldfr = ''
         checked_suballpages = []
         rawacum = raw
         while r_suballpages and re.search(r_suballpages, raw) and c < deep:
@@ -309,6 +310,11 @@
             m = re.compile(r_suballpages).finditer(raw)
             for i in m:
                 fr = i.group('from')
+                currfr = fr
+
+                if oldfr == currfr:
+                    # We are looping, exit the loop
+                    pass
 
                 if r_suballpages == r_suballpages1:
                     to = i.group('to')
@@ -329,19 +335,23 @@
                     url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                         config['index'], name, namespace)
 
                 if name not in checked_suballpages:
                     # to avoid reload dupe subpages links
                     checked_suballpages.append(name)
                     delay(config=config, session=session)
-                    r2 = session.get(url=url, timeout=10)
-                    raw2 = r2.text
-                    raw2 = cleanHTML(raw2)
-                    rawacum += raw2 # merge it after removed junk
-                    print ' Reading', name, len(raw2), 'bytes', \
-                        len(re.findall(r_suballpages, raw2)), 'subpages', \
-                        len(re.findall(r_title, raw2)), 'pages'
+                    r = session.get(url=url, timeout=10)
+                    #print 'Fetching URL: ', url
+                    raw = r.text
+                    raw = cleanHTML(raw)
+                    rawacum += raw # merge it after removed junk
+                    print ' Reading', name, len(raw), 'bytes', \
+                        len(re.findall(r_suballpages, raw)), 'subpages', \
+                        len(re.findall(r_title, raw)), 'pages'
 
             delay(config=config, session=session)
+            oldfr = currfr
             c += 1
 
         c = 0
