From 4a5eef97da9235b171b25e9de63b162145cbdaa7 Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Fri, 28 Aug 2020 06:09:20 +0200
Subject: [PATCH 1/8] Update the default user-agent

A ModSecurity rule blocks the old UA, so switch to the current Firefox 78 UA.
---
 dumpgenerator.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 3193fe2..455d462 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -497,8 +497,9 @@ def getUserAgent():
     """ Return a cool user-agent to hide Python user-agent """
     useragents = [
         # firefox
-        'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
-        'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+        #'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
+        #'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+        'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'
     ]
     return useragents[0]


From 7675b0d17c96ff1a24644dc2bfe9a26e913348d3 Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Fri, 28 Aug 2020 06:12:56 +0200
Subject: [PATCH 2/8] Add exception handler for
 requests.exceptions.ReadTimeout in getXMLPageCore()

Treat a ReadTimeout the same as a ConnectionError (log the error & retry).
---
 dumpgenerator.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 455d462..1274e38 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -575,6 +575,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         except requests.exceptions.ConnectionError as e:
             print '    Connection error: %s'%(str(e[0]))
             xml = ''
+        except requests.exceptions.ReadTimeout as e:
+            print '    Read timeout: %s'%(str(e[0]))
+            xml = ''
         c += 1

     return xml

From eacaf08b2ff4420d9291c983cd30e321100b4fd9 Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Fri, 28 Aug 2020 06:17:47 +0200
Subject: [PATCH 3/8] Try to fix a broken HTTP to HTTPS redirect in
 generateImageDump()

Some wikis fail to do the HTTP to HTTPS redirect correctly, so try it
ourselves.
---
 dumpgenerator.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 1274e38..390386f 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1475,7 +1475,29 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
                 print 'Filename is too long, truncating. Now it is:', filename2
             filename3 = u'%s/%s' % (imagepath, filename2)
             imagefile = open(filename3, 'wb')
-            r = requests.get(url=url)
+
+            r = session.head(url=url, allow_redirects=True)
+            original_url_redirected = len(r.history) > 0
+
+            if original_url_redirected:
+                #print 'Site is redirecting us to: ', r.url
+                original_url = url
+                url = r.url
+
+            r = session.get(url=url, allow_redirects=False)
+
+            # Try to fix a broken HTTP to HTTPS redirect
+            if r.status_code == 404 and original_url_redirected:
+                if original_url.split("://")[0] == "http" and url.split("://")[0] == "https":
+                    url = 'https://' + original_url.split("://")[1]
+                    #print 'Maybe a broken http to https redirect, trying ', url
+                    r = session.get(url=url, allow_redirects=False)
+
+            if r.status_code == 404:
+                logerror(
+                    config=config,
+                    text=u'File %s at URL %s is missing' % (filename2,url))
+
             imagefile.write(r.content)
             imagefile.close()
             # saving description if any

From e4b43927b9c46f4f127c0db9f5c62a981e43d7d0 Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Fri, 28 Aug 2020 06:42:22 +0200
Subject: [PATCH 4/8] Fixup description grab in generateImageDump

getXMLPage() yields on "</page>" so xmlfiledesc cannot contain
"</mediawiki>". Change the search to "</page>" and inject "</mediawiki>"
if it is missing to fix up the XML.
---
 dumpgenerator.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 390386f..1c103bb 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1520,9 +1520,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
             f = open('%s/%s.desc' % (imagepath, filename2), 'w')
             # Banner featuring SG1, SGA, SGU teams
-            if not re.search(r'</mediawiki>', xmlfiledesc):
+            if not re.search(r'</page>', xmlfiledesc):
                 # failure when retrieving desc? then save it as empty .desc
                 xmlfiledesc = ''
+
+            # Fixup the XML
+            if xmlfiledesc != '' and not re.search(r'</mediawiki>', xmlfiledesc):
+                xmlfiledesc += '</mediawiki>'
+
             f.write(xmlfiledesc.encode('utf-8'))
             f.close()
             delay(config=config, session=session)

From 320115fe5a7aac20bacd8710d689aa8425174d82 Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Fri, 28 Aug 2020 07:01:27 +0200
Subject: [PATCH 5/8] Try to fix CI by using current URL for archiveteam.org

In commit 966df37c54d2a12aaa603b7ea37a371fdea0a4d0, emijrp changed
http://archiveteam.org/ to https://www.archiveteam.org/

Today, https://archiveteam.org/index.php?title=Special:Version shows a
canonical URL of https://archiveteam.org/

So try to fix the CI by doing a s/www.archiveteam.org/archiveteam.org/g
---
 testing/test_dumpgenerator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/testing/test_dumpgenerator.py b/testing/test_dumpgenerator.py
index cb23661..08e6762 100644
--- a/testing/test_dumpgenerator.py
+++ b/testing/test_dumpgenerator.py
@@ -62,7 +62,7 @@ class TestDumpgenerator(unittest.TestCase):
         tests = [
             # Alone wikis
             #['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
-            ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
+            ['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
             #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],

             # Editthis wikifarm
@@ -146,7 +146,7 @@ class TestDumpgenerator(unittest.TestCase):
         print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
         tests = [
             # Alone wikis
-            ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'April Fools\' Day'],
+            ['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'April Fools\' Day'],
             #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],

             # Test old allpages API behaviour
@@ -273,7 +273,7 @@ class TestDumpgenerator(unittest.TestCase):
         print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73
         tests = [
             # Alone wikis
-            ['https://www.archiveteam.org', 'https://www.archiveteam.org/api.php', 'https://www.archiveteam.org/index.php'],
+            ['https://archiveteam.org', 'https://archiveteam.org/api.php', 'https://archiveteam.org/index.php'],
             #['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],

             # Editthis wikifarm

From 1048bc32755d8eb5fd3d7a6f0c4918bfe4c265ef Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Fri, 28 Aug 2020 07:07:55 +0200
Subject: [PATCH 6/8] skilledtests.com doesn't host a MediaWiki anymore

http://skilledtests.com/wiki/ redirects to https://simcast.com, something
'Powered by Microsoft News'.
---
 testing/test_dumpgenerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/testing/test_dumpgenerator.py b/testing/test_dumpgenerator.py
index 08e6762..4c771f2 100644
--- a/testing/test_dumpgenerator.py
+++ b/testing/test_dumpgenerator.py
@@ -206,7 +206,7 @@ class TestDumpgenerator(unittest.TestCase):
         tests = [
             ['https://www.dokuwiki.org', 'DokuWiki'],
             #['http://wiki.openwrt.org', 'DokuWiki'],
-            ['http://skilledtests.com/wiki/', 'MediaWiki'],
+            #['http://skilledtests.com/wiki/', 'MediaWiki'],
             #['http://moinmo.in', 'MoinMoin'],
             ['https://wiki.debian.org', 'MoinMoin'],
             ['http://twiki.org/cgi-bin/view/', 'TWiki'],

From b289f86243c67aebc8e714c043d7cc2c17ad2ca7 Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Fri, 28 Aug 2020 08:25:33 +0200
Subject: [PATCH 7/8] Fix getPageTitlesScraper

Using the API and the Special:Allpages scraper should result in the same
number of titles.

Fix the detection of the next subpages on Special:Allpages. Change the max
depth to 50 and implement an anti-loop check (could fail on non-Western
wikis).
---
 dumpgenerator.py | 29 ++++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 1c103bb..536cd0e 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -299,9 +299,10 @@ def getPageTitlesScraper(config={}, session=None):
         else:
             pass  # perhaps no subpages

-        # 3 is the current deep of English Wikipedia for Special:Allpages
-        deep = 3
+        # Should be enough subpages on Special:Allpages
+        deep = 50
         c = 0
+        oldfr = ''
         checked_suballpages = []
         rawacum = raw
         while r_suballpages and re.search(r_suballpages, raw) and c < deep:
@@ -308,7 +309,13 @@ def getPageTitlesScraper(config={}, session=None):
             # load sub-Allpages
             m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')
+                currfr = fr
+
+                if oldfr == currfr:
+                    # We are looping, force the outer loop to exit
+                    c = deep
+                    break

                 if r_suballpages == r_suballpages1:
                     to = i.group('to')
@@ -327,16 +334,20 @@ def getPageTitlesScraper(config={}, session=None):
                     url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                         config['index'], name, namespace)
+
+
                 if name not in checked_suballpages:
                     # to avoid reload dupe subpages links
                     checked_suballpages.append(name)
                     delay(config=config, session=session)
-                    r2 = session.get(url=url, timeout=10)
-                    raw2 = r2.text
-                    raw2 = cleanHTML(raw2)
-                    rawacum += raw2  # merge it after removed junk
-                    print '    Reading', name, len(raw2), 'bytes', \
-                        len(re.findall(r_suballpages, raw2)), 'subpages', \
-                        len(re.findall(r_title, raw2)), 'pages'
+                    r = session.get(url=url, timeout=10)
+                    #print 'Fetching URL: ', url
+                    raw = r.text
+                    raw = cleanHTML(raw)
+                    rawacum += raw  # merge it after removed junk
+                    print '    Reading', name, len(raw), 'bytes', \
+                        len(re.findall(r_suballpages, raw)), 'subpages', \
+                        len(re.findall(r_title, raw)), 'pages'

                 delay(config=config, session=session)
+                oldfr = currfr
             c += 1
         c = 0

From 5986467b12a9eef5731142fadd98ac6ea67b3b85 Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Fri, 28 Aug 2020 08:46:51 +0200
Subject: [PATCH 8/8] Cleanup of link rot

Lots of wikis in test_dumpgenerator.py don't exist anymore. Remove them
from the CI.
---
 testing/test_dumpgenerator.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/testing/test_dumpgenerator.py b/testing/test_dumpgenerator.py
index 4c771f2..7dd2101 100644
--- a/testing/test_dumpgenerator.py
+++ b/testing/test_dumpgenerator.py
@@ -219,9 +219,9 @@ class TestDumpgenerator(unittest.TestCase):
             ['http://www.wasteflake.com/', 'TikiWiki'],
             ['http://foswiki.org/', 'FosWiki'],
             ['http://www.w3c.br/Home/WebHome', 'FosWiki'],
-            ['http://mojomojo.org/', 'MojoMojo'],
-            ['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
-            ['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
+            #['http://mojomojo.org/', 'MojoMojo'],
+            #['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
+            #['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
             #['https://web.archive.org/web/20080517021020id_/http://berlin.xwiki.com/xwiki/bin/view/Main/WebHome', 'XWiki'],
             ['http://www.xwiki.org/xwiki/bin/view/Main/WebHome', 'XWiki'],
             ['https://confluence.atlassian.com/', 'Confluence'],
@@ -229,32 +229,32 @@ class TestDumpgenerator(unittest.TestCase):
             ['https://confluence.sakaiproject.org/', 'Confluence'],
             #['http://demo.bananadance.org/', 'Banana Dance'],
             ['http://wagn.org/', 'Wagn'],
-            ['http://wiki.ace-mod.net/', 'Wagn'],
+            #['http://wiki.ace-mod.net/', 'Wagn'],
             #['https://success.mindtouch.com/', 'MindTouch'],
             #['https://jspwiki.apache.org/', 'JSPWiki'],
             ['http://www.ihear.com/FreeCLAS/', 'JSPWiki'],
             ['http://www.wikkawiki.org/HomePage', 'WikkaWiki'],
-            ['http://puppylinux.org/wikka/', 'WikkaWiki'],
-            ['http://cs.netsville.com/wiki/wikka.php', 'WikkaWiki'],
+            #['http://puppylinux.org/wikka/', 'WikkaWiki'],
+            ['https://www.cybersphere.net/', 'MediaWiki'],
             #['http://web.archive.org/web/20060717202033id_/http://www.comawiki.org/CoMa.php?CoMa=startseite', 'CoMaWiki'],
             ['http://bootbook.de/CoMa.php', 'CoMaWiki'],
             #['http://wikini.net/wakka.php', 'WikiNi'],
             ['http://wiki.raydium.org/wiki/', 'WikiNi'],
-            ['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
-            ['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
+            #['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
+            #['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
             ['http://www.sw4me.com/wiki/', 'WackoWiki'],
-            ['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
+            #['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
             ['http://kw.pm.org/wiki/index.cgi', 'Kwiki'],
             ['http://wiki.wubi.org/index.cgi', 'Kwiki'],
             #['http://perl.bristolbath.org/index.cgi', 'Kwiki'],
-            ['http://www.anwiki.com/', 'Anwiki'],
-            ['http://www.anw.fr/', 'Anwiki'],
+            #['http://www.anwiki.com/', 'Anwiki'],
+            #['http://www.anw.fr/', 'Anwiki'],
             ['http://www.aneuch.org/', 'Aneuch'],
             ['http://doc.myunixhost.com/', 'Aneuch'],
             ['http://www.bitweaver.org/wiki/index.php', 'bitweaver'],
             ['http://wiki.e-shell.org/Home', 'Zwiki'],
             ['http://leo.zwiki.org/', 'Zwiki'],
-            ['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
+            #['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
             ['http://darksouls.wikidot.com/', 'Wikidot'],
             ['http://www.wikifoundrycentral.com/', 'Wetpaint'],
             ['http://wiki.openid.net/', 'PBworks'],
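
Note for reviewers: the redirect handling added in PATCH 3/8 condenses to the
standalone sketch below (Python 2, like dumpgenerator.py). The fetch_image
name and its arguments are illustrative only and not part of the patch:

    import requests

    def fetch_image(session, url):
        # Follow redirects once to discover the final location.
        r = session.head(url=url, allow_redirects=True)
        redirected = len(r.history) > 0
        if redirected:
            original_url = url
            url = r.url

        r = session.get(url=url, allow_redirects=False)

        # Some wikis send an HTTP to HTTPS redirect that lands on a 404;
        # retry the original path over plain HTTPS before giving up.
        if r.status_code == 404 and redirected:
            if original_url.split("://")[0] == "http" and url.split("://")[0] == "https":
                url = 'https://' + original_url.split("://")[1]
                r = session.get(url=url, allow_redirects=False)
        return r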
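
Likewise, the description fix-up in PATCH 4/8 amounts to the following sketch
(fixup_desc is a hypothetical helper, shown only to make the two regex checks
explicit):

    import re

    def fixup_desc(xmlfiledesc):
        # getXMLPage() yields on "</page>", so a successfully fetched
        # description never contains "</mediawiki>"; a missing "</page>"
        # means the fetch failed and the .desc is saved empty.
        if not re.search(r'</page>', xmlfiledesc):
            return ''
        # Re-append the root closing tag so the .desc is well-formed XML.
        if not re.search(r'</mediawiki>', xmlfiledesc):
            xmlfiledesc += '</mediawiki>'
        return xmlfiledesc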
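
Finally, the anti-loop check in PATCH 7/8 behaves like this reduced sketch
(scrape_subpages, fetch and the r_suballpages argument are illustrative; the
real code lives in getPageTitlesScraper):

    import re

    def scrape_subpages(fetch, raw, r_suballpages, deep=50):
        # fetch(fr) must return the cleaned HTML of the next
        # Special:Allpages subpage starting at title "fr".
        rawacum = raw
        checked = []
        oldfr = ''
        c = 0
        while re.search(r_suballpages, raw) and c < deep:
            for i in re.finditer(r_suballpages, raw):
                fr = i.group('from')
                if oldfr == fr:
                    # Same "from" as the previous link: the wiki is
                    # pointing back at itself, so force the outer loop out.
                    c = deep
                    break
                if fr not in checked:
                    checked.append(fr)
                    raw = fetch(fr)
                    rawacum += raw
                oldfr = fr
            c += 1
        return rawacum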