From 9c5c55342da57cec1b69bbce30026b9691332b94 Mon Sep 17 00:00:00 2001 From: Liu Date: Tue, 12 Apr 2022 20:18:03 -0400 Subject: [PATCH] Update miraheze.org spider and remove duplicates --- listsofwikis/mediawiki/miraheze-spider.py | 44 ++-- listsofwikis/mediawiki/miraheze.org | 268 ++-------------------- 2 files changed, 44 insertions(+), 268 deletions(-) diff --git a/listsofwikis/mediawiki/miraheze-spider.py b/listsofwikis/mediawiki/miraheze-spider.py index f8d6019..a1400ba 100644 --- a/listsofwikis/mediawiki/miraheze-spider.py +++ b/listsofwikis/mediawiki/miraheze-spider.py @@ -1,35 +1,53 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (C) 2014-2017 WikiTeam developers +# Copyright (C) 2022 Simon Liu # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program. If not, see . import re +import time import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin + +def nextpage(soup): + try: + soup.find('span', text='Next page').parent['href'] + return True + except: + return False def main(): headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0', } - - url = 'https://meta.miraheze.org/wiki/Special:SiteMatrix' - r = requests.get(url, headers=headers) - raw = r.text - m = re.findall(ur'()?[^<]+', raw) - m.sort() - for i in m: - print 'https://' + i[1] + '/w/api.php' - + + req = requests.get('https://meta.miraheze.org/wiki/Special:WikiDiscover') + soup = BeautifulSoup(req.content, features='lxml') + wikis = re.findall(r']+?)\">', req.text) + + while nextpage(soup): + time.sleep(0.3) + req = requests.get(urljoin('https://meta.miraheze.org', soup.find('span', text='Next page').parent['href'])) + soup = BeautifulSoup(req.content, features='lxml') + wikis.extend(re.findall(r']+?)\">', req.text)) + + wikis = list(set(wikis)) + wikis.sort() + with open('miraheze.org', 'w') as f: + for wiki in wikis: + f.write(urljoin(wiki, 'w/api.php') + '\n') + if __name__ == '__main__': main() diff --git a/listsofwikis/mediawiki/miraheze.org b/listsofwikis/mediawiki/miraheze.org index 255266b..e0f4201 100644 --- a/listsofwikis/mediawiki/miraheze.org +++ b/listsofwikis/mediawiki/miraheze.org @@ -221,6 +221,7 @@ https://alternatetudorhistory.miraheze.org/w/api.php https://alternateworlds.miraheze.org/w/api.php https://alternativehistorypolandball.miraheze.org/w/api.php https://alternativereload.miraheze.org/w/api.php +https://alternativesc.miraheze.org/w/api.php https://alternatiwow.miraheze.org/w/api.php https://althistory.miraheze.org/w/api.php https://altirlfiction.miraheze.org/w/api.php @@ -692,6 +693,7 @@ https://brno.miraheze.org/w/api.php https://brochuresecu.miraheze.org/w/api.php https://brokenempires.miraheze.org/w/api.php https://bronzewright.miraheze.org/w/api.php +https://brookvalegroby.miraheze.org/w/api.php https://broternal.miraheze.org/w/api.php https://brucenet.miraheze.org/w/api.php https://bruschettopedia.miraheze.org/w/api.php @@ -860,6 +862,7 @@ https://chaos.miraheze.org/w/api.php https://chaosawakens.miraheze.org/w/api.php https://chaoshan.miraheze.org/w/api.php https://chaosmagick.miraheze.org/w/api.php +https://characterclasses.miraheze.org/w/api.php https://charactercompendium.miraheze.org/w/api.php https://characterpedia.miraheze.org/w/api.php https://charliethelegend.miraheze.org/w/api.php @@ -1196,6 +1199,7 @@ https://davidslist.miraheze.org/w/api.php https://daviejones.miraheze.org/w/api.php https://davshiq.miraheze.org/w/api.php https://dawera.miraheze.org/w/api.php +https://dawnoftheredyears.miraheze.org/w/api.php https://dawnpowers.miraheze.org/w/api.php https://daybreak.miraheze.org/w/api.php https://dbanimation.miraheze.org/w/api.php @@ -1285,6 +1289,7 @@ https://diitabikimarronmuseum.miraheze.org/w/api.php https://dimascenter.miraheze.org/w/api.php https://dimensional.miraheze.org/w/api.php https://dingedb.miraheze.org/w/api.php +https://diocesi.miraheze.org/w/api.php https://discontinuedcandy.miraheze.org/w/api.php https://discord.miraheze.org/w/api.php https://discordearthvision.miraheze.org/w/api.php @@ -1437,6 +1442,7 @@ https://ecoepi.miraheze.org/w/api.php https://ecole.science/w/api.php https://ecrosogames.miraheze.org/w/api.php https://ecsus.miraheze.org/w/api.php +https://ecyclopedia.miraheze.org/w/api.php https://ecype.miraheze.org/w/api.php https://edddcord.miraheze.org/w/api.php https://eddsworld.miraheze.org/w/api.php @@ -2293,7 +2299,7 @@ https://ideasng.miraheze.org/w/api.php https://idiomas.miraheze.org/w/api.php https://idiotentruppdnd.miraheze.org/w/api.php https://idiotpaedia.miraheze.org/w/api.php -https://idleon.info/w/api.php +https://idleon.miraheze.org/w/api.php https://idolish7.miraheze.org/w/api.php https://idwiki.miraheze.org/w/api.php https://ieeesp.miraheze.org/w/api.php @@ -2484,6 +2490,7 @@ https://jawp2ch.miraheze.org/w/api.php https://jawp5ch.miraheze.org/w/api.php https://jawpcross5ch.miraheze.org/w/api.php https://jawptest.miraheze.org/w/api.php +https://jaybirdmusicproject.miraheze.org/w/api.php https://jaydec02.miraheze.org/w/api.php https://jayeysheadcanon.miraheze.org/w/api.php https://jayuvandal.miraheze.org/w/api.php @@ -2706,6 +2713,7 @@ https://lanantm.miraheze.org/w/api.php https://lancer.miraheze.org/w/api.php https://landedifaerun.miraheze.org/w/api.php https://landev.miraheze.org/w/api.php +https://landofennui.miraheze.org/w/api.php https://landofliberos.miraheze.org/w/api.php https://landsofbjortolfia.miraheze.org/w/api.php https://landwikialliance.miraheze.org/w/api.php @@ -2956,6 +2964,7 @@ https://makropedia.miraheze.org/w/api.php https://mal.miraheze.org/w/api.php https://malaysia.miraheze.org/w/api.php https://malaysianschools.miraheze.org/w/api.php +https://malcolmverse.miraheze.org/w/api.php https://maldon.miraheze.org/w/api.php https://malesshoeloss.miraheze.org/w/api.php https://malleoandweegeefanon.miraheze.org/w/api.php @@ -3174,260 +3183,6 @@ https://miracraft.miraheze.org/w/api.php https://miraculous.miraheze.org/w/api.php https://mirae.miraheze.org/w/api.php https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php -https://miraheze.org/w/api.php https://mirahezium.miraheze.org/w/api.php https://mirapedia.miraheze.org/w/api.php https://mirinano.miraheze.org/w/api.php @@ -3775,6 +3530,7 @@ https://numtot.miraheze.org/w/api.php https://nusquwiki.miraheze.org/w/api.php https://nutatnik.miraheze.org/w/api.php https://nutscript.miraheze.org/w/api.php +https://nux.miraheze.org/w/api.php https://nuzloquest.miraheze.org/w/api.php https://nvc.miraheze.org/w/api.php https://nwkodaly.miraheze.org/w/api.php @@ -3918,6 +3674,7 @@ https://ourum.miraheze.org/w/api.php https://outlaws.miraheze.org/w/api.php https://outsideofthebox.miraheze.org/w/api.php https://ovenbreak.miraheze.org/w/api.php +https://overemployment.miraheze.org/w/api.php https://overon.miraheze.org/w/api.php https://overthegame.miraheze.org/w/api.php https://overworld.miraheze.org/w/api.php @@ -5455,6 +5212,7 @@ https://tot.wiki/w/api.php https://totaldramagame.miraheze.org/w/api.php https://totallyaccuraterealworld.miraheze.org/w/api.php https://touhou.miraheze.org/w/api.php +https://touhourebirth.miraheze.org/w/api.php https://tounae.miraheze.org/w/api.php https://tower.miraheze.org/w/api.php https://toweroffantasy.miraheze.org/w/api.php