From d0f7f83ee525851238289af99ce1ad708bbbe520 Mon Sep 17 00:00:00 2001 From: emijrp Date: Fri, 10 Jul 2015 09:03:16 +0200 Subject: [PATCH] reading from IA --- not-archived.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/not-archived.py b/not-archived.py index 93eee71..c79f660 100644 --- a/not-archived.py +++ b/not-archived.py @@ -4,7 +4,7 @@ # not-archived.py List of not archived wikis, using WikiApiary data # NOTE: unreliable! https://github.com/WikiApiary/WikiApiary/issues/130 # -# Copyright (C) 2014 WikiTeam developers +# Copyright (C) 2015 WikiTeam developers # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or @@ -28,9 +28,10 @@ def getdomain(wiki): return domain def main(): - doneurl = 'https://raw.githubusercontent.com/WikiTeam/wikiteam/master/batchdownload/taskforce/mediawikis_done_2014.txt' + doneurl = 'https://archive.org/advancedsearch.php?q=collection%3A%28wikiteam%29+AND+originalurl%3A[%22http%22+TO+null]&fl[]=description&sort[]=&sort[]=&sort[]=&rows=100000&page=1&output=json&callback=callback' f = urllib.urlopen(doneurl) - donewikis = [getdomain(wiki) for wiki in f.read().splitlines()] + wikiurls = re.findall(ur'(?im)<a href=\\"([^>]+?)\\" rel=\\"nofollow\\">[^<]+? dumped with', f.read()) + donewikis = [getdomain(wikiurl) for wikiurl in wikiurls] #print 'Loaded %d done wikis' % len(donewikis) offset = 0 @@ -50,6 +51,5 @@ def main(): break offset += limit - if __name__ == '__main__': main()