First version of the Wikispaces spider

pull/287/head
emijrp 8 years ago
parent 82e84d59ad
commit 01ccacd138

@ -0,0 +1,121 @@
AliciaWaters,1
Arenoosh,1
BambuNatural,1
BlackSheepInn,0
BlancaRobleda,1
BleachTeach,1
Chase.Pereira,1
Dan.Paleczny,1
Dancombs98,1
Deborah.McLaren,1
DeborahMcLaren,0
Diegoc93,1
EVbusinessteacher,1
Eurapart,0
JessieRS,1
JessikaTate,1
JohnNobilski,1
Joserios11,1
Jrios885,1
JuliaSanabria,1
Justin.Dabill,1
KevinGough11,1
KevinMPA,1
Lduncan107,1
LilTlaloc,2
MBKlein,1
MFierros,1
MIGUEOAX,1
Marlenehrenberg,2
MayraVazquez1,1
Melissa63,2
Moy1976,1
MrPalmer67,1
RondaGreen,1
Ruukel,0
SamanthaElizabeth,2
ScottOsterholt1,1
TylerZybach-DeBoer,1
WINTAwiki,1
Xixim,1
abehl,1
albabcn,1
alex.villca,1
andydrumm,0
annafoster21,1
annagmoore,1
annaspenceley,1
aseremomax,1
ashleyrownd123,1
astronomyteacher,5
avillicana687,1
ayuukchacha,1
ayuukoax,1
becari,1
becaricampusqroaxaca,0
biancagchan,1
bicicletaspedromartinez,1
bugambilias,1
businesscoordinator,1
bwaters23,2
camatchitral,0
carriehurtado,1
celinabalasoto,1
chacorunner,1
charlotten22,1
charolains,1
chrismilnes,1
chtopete,3
consultoriaindigenaoaxaca,1
cristinamartinez8,0
despacharte,1
dvgovteacher,1
dvgovteacher1,1
ecabanilla,1
edgarbartolo,1
edgarraygoza95,1
englishcoordinator,1
envia,1
fcummings294,1
florencio,1
geoffb1,1
georginatrout,1
gerhardbuttner,2
gregshirley,1
hermantyler,1
insitu1,1
institutoamigosdelsol,1
jamigo55,1
jgonzalez631,1
joannazemla,1
joshdkirby,1
justinrieger22,1
katabel,1
krestow,2
lasmariposas,1
liliacoronel,1
lindaramirez3,1
louisebranch,1
ltimrott,1
lulaa,1
mariamcclain,1
matthewmucha24,1
nutti,1
oaxdave,1
oddyeti,1
ojoqtv,1
patwilson2,1
planeta,0
raylorscheider,1
raymondkuntz,1
respontour,0
salliegrayson,1
sandraluz2,1
sergiolazomendoza,1
sherrilivingston,1
susanbeanaycock,1
thistourismweek,1
timeunlimited,1
turismooaxaca,1
victoria.alahuzos,1
willcorning,1

@ -0,0 +1,11 @@
astronomylinks,35
drkrestow,1
dvaceacademy,16
dvapphysics,1
dvsra,1
enviabusiness,4
enviaenglish,8
gccastronomy,1
martiangovernment,2
oaxaca,31
planeta,35

@ -0,0 +1,147 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2016 wikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import re
import time
import urllib2
def loadUsers():
    """Load the username -> wiki-count map from users.txt.

    Returns a dict mapping each username to its count *as a string*:
    counts may be the placeholder '?' for users not yet scanned.
    """
    users = {}
    # 'with' guarantees the file is closed even if a line fails to parse
    with open('users.txt', 'r') as f:
        for line in f.read().strip().splitlines():
            # split on the LAST comma only, so a username containing a
            # comma still parses (the count is always the final field)
            username, numwikis = line.rsplit(',', 1)
            users[username] = numwikis
    return users
def loadWikis():
    """Load the wikiname -> user-count map from wikis.txt.

    Returns a dict mapping each wiki name to its count *as a string*:
    counts may be the placeholder '?' for wikis not yet scanned.
    """
    wikis = {}
    # 'with' guarantees the file is closed even if a line fails to parse
    with open('wikis.txt', 'r') as f:
        for line in f.read().strip().splitlines():
            # split on the LAST comma only; the count is the final field
            wikiname, numusers = line.rsplit(',', 1)
            wikis[wikiname] = numusers
    return wikis
def saveUsers(users):
    """Write the username -> wiki-count map to users.txt.

    Rows are 'username,count', sorted, UTF-8 encoded, no trailing
    newline; any existing file is overwritten.
    """
    import io  # local import keeps the top-of-file import block untouched
    rows = sorted(u'%s,%s' % (name, count) for name, count in users.items())
    # io.open writes unicode + encoding on both Python 2 and 3, replacing
    # the py2-only open('w') + manual .encode('utf-8') pattern; 'with'
    # guarantees the handle is flushed and closed
    with io.open('users.txt', 'w', encoding='utf-8') as f:
        f.write(u'\n'.join(rows))
def saveWikis(wikis):
    """Write the wikiname -> user-count map to wikis.txt.

    Rows are 'wikiname,count', sorted, UTF-8 encoded, no trailing
    newline; any existing file is overwritten.
    """
    import io  # local import keeps the top-of-file import block untouched
    rows = sorted(u'%s,%s' % (name, count) for name, count in wikis.items())
    # io.open writes unicode + encoding on both Python 2 and 3, replacing
    # the py2-only open('w') + manual .encode('utf-8') pattern
    with io.open('wikis.txt', 'w', encoding='utf-8') as f:
        f.write(u'\n'.join(rows))
def getUsers(wiki):
wikiurl = 'https://%s.wikispaces.com/wiki/members?utable=WikiTableMemberList&ut_csv=1' % (wiki)
try:
wikireq = urllib2.Request(wikiurl, headers={ 'User-Agent': 'Mozilla/5.0' })
wikicsv = urllib2.urlopen(wikireq)
reader = csv.reader(wikicsv, delimiter=',', quotechar='"')
headers = next(reader, None)
usersfound = {}
for row in reader:
usersfound[row[0]] = u'?'
return usersfound
except:
print 'Error reading', wikiurl
return {}
def getWikis(user):
wikiurl = 'https://www.wikispaces.com/user/view/%s' % (user)
try:
wikireq = urllib2.Request(wikiurl, headers={ 'User-Agent': 'Mozilla/5.0' })
html = urllib2.urlopen(wikireq).read()
if 'Wikis: ' in html:
html = html.split('Wikis: ')[1].split('</div>')[0]
wikisfound = {}
for x in re.findall(ur'<a href="https://([^>]+).wikispaces.com/">', html):
wikisfound[x] = u'?'
return wikisfound
return {}
except:
print 'Error reading', wikiurl
return {}
def main():
    """Crawl Wikispaces breadth-wise: pass 1 scans unscanned wikis for
    member usernames, pass 2 scans unscanned users' profiles for wiki
    names, persisting users.txt / wikis.txt after every network scan so
    an interrupted run loses at most one wiki/user of progress. A count
    of '?' marks an entry not yet scanned."""
    users = loadUsers()
    wikis = loadWikis()
    usersc = len(users)   # baseline counts for the end-of-run summary
    wikisc = len(wikis)
    print 'Loading files'
    print 'Loaded', usersc, 'users'
    print 'Loaded', wikisc, 'wikis'
    # find more users
    print 'Scanning wikis for more users'
    # NOTE(review): .items() snapshots the dict (Python 2 returns a list),
    # so reassigning users/wikis inside the loop does not disturb iteration
    for wiki, numusers in wikis.items():
        if numusers != '?': #we have scanned this wiki before, skipping
            continue
        print 'Scanning https://%s.wikispaces.com for users' % (wiki)
        users2 = getUsers(wiki)
        wikis[wiki] = len(users2)  # mark this wiki as scanned
        c = 0
        for x2, y2 in users2.items():
            if x2 not in users.keys():
                users[x2] = u'?'  # queue the new user for pass 2
                c += 1
        print 'Found %s new users' % (c)
        if c > 0:
            saveUsers(users)
            users = loadUsers()
        saveWikis(wikis)
        time.sleep(1)  # throttle between HTTP requests
        wikis = loadWikis()
    # find more wikis
    print 'Scanning users for more wikis'
    for user, numwikis in users.items():
        if numwikis != '?': #we have scanned this user before, skipping
            continue
        print 'Scanning https://www.wikispaces.com/user/view/%s for wikis' % (user)
        wikis2 = getWikis(user)
        users[user] = len(wikis2)  # mark this user as scanned
        c = 0
        for x2, y2 in wikis2.items():
            if x2 not in wikis.keys():
                wikis[x2] = u'?'  # queue the new wiki for a future run
                c += 1
        print 'Found %s new wikis' % (c)
        if c > 0:
            saveWikis(wikis)
            wikis = loadWikis()
        saveUsers(users)
        time.sleep(1)  # throttle between HTTP requests
        users = loadUsers()
    print '\nSummary:'
    print 'Found', len(users)-usersc, 'new users'
    print 'Found', len(wikis)-wikisc, 'new wikis'

if __name__ == '__main__':
    main()
Loading…
Cancel
Save