fix: infinite loop on page title scraper

port from https://github.com/WikiTeam/wikiteam/pull/439
11 months ago · 283611dda4
parent 5791c56975
commit 283611dda4
1 changed file with 7 additions and 7 deletions
--- a/wikiteam3/dumpgenerator/api/page_titles.py
+++ b/wikiteam3/dumpgenerator/api/page_titles.py
@@ -57,11 +57,11 @@ def getPageTitlesScraper(config: Config=None, session=None):
        raw = r.text
        raw = cleanHTML(raw)

-        r_title = 'title="(?P<title>[^>]+)">'
+        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ""
-        r_suballpages1 = '&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
-        r_suballpages2 = 'Special:Allpages/(?P<from>[^>]+)">'
-        r_suballpages3 = '&amp;from=(?P<from>[^>]+)" title="[^>]+">'
+        r_suballpages1 = r'&amp;from=(?P<from>[^>"]+)&amp;to=(?P<to>[^>"]+)">'
+        r_suballpages2 = r'Special:Allpages/(?P<from>[^>"]+)">'
+        r_suballpages3 = r'&amp;from=(?P<from>[^>"]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
@@ -71,7 +71,7 @@ def getPageTitlesScraper(config: Config=None, session=None):
        else:
            pass  # perhaps no subpages

-        # Should be enought subpages on Special:Allpages
+        # Should be enough subpages on Special:Allpages
        deep = 50
        c = 0
        oldfr = ""
@@ -98,8 +98,8 @@ def getPageTitlesScraper(config: Config=None, session=None):
                        fr,
                        to,
                    )  # do not put urllib.parse.quote in fr or to
-                # fix, esta regexp no carga bien todas? o falla el r_title en
-                # este tipo de subpag? (wikiindex)
+                # fix, this regexp doesn't properly save everything? or does r_title fail on this
+                # type of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &amp;namespace=\d, sometimes happens
                    fr = fr.split("&amp;namespace=")[0]