mirror of
https://github.com/WikiTeam/wikiteam
synced 2024-11-12 07:12:41 +00:00
first version of wikispaces spider
This commit is contained in:
parent
82e84d59ad
commit
01ccacd138
121
listsofwikis/wikispaces/users.txt
Normal file
121
listsofwikis/wikispaces/users.txt
Normal file
@ -0,0 +1,121 @@
|
||||
AliciaWaters,1
|
||||
Arenoosh,1
|
||||
BambuNatural,1
|
||||
BlackSheepInn,0
|
||||
BlancaRobleda,1
|
||||
BleachTeach,1
|
||||
Chase.Pereira,1
|
||||
Dan.Paleczny,1
|
||||
Dancombs98,1
|
||||
Deborah.McLaren,1
|
||||
DeborahMcLaren,0
|
||||
Diegoc93,1
|
||||
EVbusinessteacher,1
|
||||
Eurapart,0
|
||||
JessieRS,1
|
||||
JessikaTate,1
|
||||
JohnNobilski,1
|
||||
Joserios11,1
|
||||
Jrios885,1
|
||||
JuliaSanabria,1
|
||||
Justin.Dabill,1
|
||||
KevinGough11,1
|
||||
KevinMPA,1
|
||||
Lduncan107,1
|
||||
LilTlaloc,2
|
||||
MBKlein,1
|
||||
MFierros,1
|
||||
MIGUEOAX,1
|
||||
Marlenehrenberg,2
|
||||
MayraVazquez1,1
|
||||
Melissa63,2
|
||||
Moy1976,1
|
||||
MrPalmer67,1
|
||||
RondaGreen,1
|
||||
Ruukel,0
|
||||
SamanthaElizabeth,2
|
||||
ScottOsterholt1,1
|
||||
TylerZybach-DeBoer,1
|
||||
WINTAwiki,1
|
||||
Xixim,1
|
||||
abehl,1
|
||||
albabcn,1
|
||||
alex.villca,1
|
||||
andydrumm,0
|
||||
annafoster21,1
|
||||
annagmoore,1
|
||||
annaspenceley,1
|
||||
aseremomax,1
|
||||
ashleyrownd123,1
|
||||
astronomyteacher,5
|
||||
avillicana687,1
|
||||
ayuukchacha,1
|
||||
ayuukoax,1
|
||||
becari,1
|
||||
becaricampusqroaxaca,0
|
||||
biancagchan,1
|
||||
bicicletaspedromartinez,1
|
||||
bugambilias,1
|
||||
businesscoordinator,1
|
||||
bwaters23,2
|
||||
camatchitral,0
|
||||
carriehurtado,1
|
||||
celinabalasoto,1
|
||||
chacorunner,1
|
||||
charlotten22,1
|
||||
charolains,1
|
||||
chrismilnes,1
|
||||
chtopete,3
|
||||
consultoriaindigenaoaxaca,1
|
||||
cristinamartinez8,0
|
||||
despacharte,1
|
||||
dvgovteacher,1
|
||||
dvgovteacher1,1
|
||||
ecabanilla,1
|
||||
edgarbartolo,1
|
||||
edgarraygoza95,1
|
||||
englishcoordinator,1
|
||||
envia,1
|
||||
fcummings294,1
|
||||
florencio,1
|
||||
geoffb1,1
|
||||
georginatrout,1
|
||||
gerhardbuttner,2
|
||||
gregshirley,1
|
||||
hermantyler,1
|
||||
insitu1,1
|
||||
institutoamigosdelsol,1
|
||||
jamigo55,1
|
||||
jgonzalez631,1
|
||||
joannazemla,1
|
||||
joshdkirby,1
|
||||
justinrieger22,1
|
||||
katabel,1
|
||||
krestow,2
|
||||
lasmariposas,1
|
||||
liliacoronel,1
|
||||
lindaramirez3,1
|
||||
louisebranch,1
|
||||
ltimrott,1
|
||||
lulaa,1
|
||||
mariamcclain,1
|
||||
matthewmucha24,1
|
||||
nutti,1
|
||||
oaxdave,1
|
||||
oddyeti,1
|
||||
ojoqtv,1
|
||||
patwilson2,1
|
||||
planeta,0
|
||||
raylorscheider,1
|
||||
raymondkuntz,1
|
||||
respontour,0
|
||||
salliegrayson,1
|
||||
sandraluz2,1
|
||||
sergiolazomendoza,1
|
||||
sherrilivingston,1
|
||||
susanbeanaycock,1
|
||||
thistourismweek,1
|
||||
timeunlimited,1
|
||||
turismooaxaca,1
|
||||
victoria.alahuzos,1
|
||||
willcorning,1
|
11
listsofwikis/wikispaces/wikis.txt
Normal file
11
listsofwikis/wikispaces/wikis.txt
Normal file
@ -0,0 +1,11 @@
|
||||
astronomylinks,35
|
||||
drkrestow,1
|
||||
dvaceacademy,16
|
||||
dvapphysics,1
|
||||
dvsra,1
|
||||
enviabusiness,4
|
||||
enviaenglish,8
|
||||
gccastronomy,1
|
||||
martiangovernment,2
|
||||
oaxaca,31
|
||||
planeta,35
|
147
listsofwikis/wikispaces/wikispaces-spider.py
Normal file
147
listsofwikis/wikispaces/wikispaces-spider.py
Normal file
@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright (C) 2016 wikiTeam
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import csv
|
||||
import re
|
||||
import time
|
||||
import urllib2
|
||||
|
||||
def loadUsers():
    """Load the known users from ``users.txt`` in the current directory.

    Each line has the form ``username,numwikis``. ``numwikis`` is kept as
    the raw string read from the file (``'?'`` is the marker main() uses
    for a user that has not been scanned yet).

    Returns:
        dict mapping username -> numwikis (both strings).

    Raises:
        IOError/OSError: if ``users.txt`` cannot be opened.
    """
    users = {}
    # The context manager closes the file even if reading fails.
    with open('users.txt', 'r') as f:
        for line in f.read().strip().splitlines():
            fields = line.split(',')
            if len(fields) < 2:
                # Blank or malformed line: skip it rather than crash with
                # an IndexError and lose the whole run.
                continue
            users[fields[0]] = fields[1]
    return users
|
||||
|
||||
def loadWikis():
    """Load the known wikis from ``wikis.txt`` in the current directory.

    Each line has the form ``wikiname,numusers``. ``numusers`` is kept as
    the raw string read from the file (``'?'`` is the marker main() uses
    for a wiki that has not been scanned yet).

    Returns:
        dict mapping wikiname -> numusers (both strings).

    Raises:
        IOError/OSError: if ``wikis.txt`` cannot be opened.
    """
    wikis = {}
    # The context manager closes the file even if reading fails.
    with open('wikis.txt', 'r') as f:
        for line in f.read().strip().splitlines():
            fields = line.split(',')
            if len(fields) < 2:
                # Blank or malformed line: skip it rather than crash with
                # an IndexError and lose the whole run.
                continue
            wikis[fields[0]] = fields[1]
    return wikis
|
||||
|
||||
def saveUsers(users):
    """Write ``users`` to ``users.txt`` as sorted ``username,numwikis`` lines.

    Args:
        users: dict mapping username -> wiki count. Counts may be ints or
            strings (``'?'`` for "not scanned yet"); names may be byte
            strings or text.

    The lines are joined with ``\\n`` (no trailing newline) and stored as
    UTF-8.
    """
    def as_text(value):
        # Byte strings are decoded explicitly as UTF-8; relying on the
        # implicit ASCII decode fails on any non-ASCII name.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return u'%s' % (value,)

    lines = sorted(
        u'%s,%s' % (as_text(name), as_text(count))
        for name, count in users.items()
    )
    # Build the full payload BEFORE opening the file: opening with 'w'
    # truncates it, so a formatting error must not be able to wipe the list.
    payload = u'\n'.join(lines).encode('utf-8')
    with open('users.txt', 'wb') as f:
        f.write(payload)
|
||||
|
||||
def saveWikis(wikis):
    """Write ``wikis`` to ``wikis.txt`` as sorted ``wikiname,numusers`` lines.

    Args:
        wikis: dict mapping wiki name -> user count. Counts may be ints or
            strings (``'?'`` for "not scanned yet"); names may be byte
            strings or text.

    The lines are joined with ``\\n`` (no trailing newline) and stored as
    UTF-8.
    """
    def as_text(value):
        # Byte strings are decoded explicitly as UTF-8; relying on the
        # implicit ASCII decode fails on any non-ASCII name.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return u'%s' % (value,)

    lines = sorted(
        u'%s,%s' % (as_text(name), as_text(count))
        for name, count in wikis.items()
    )
    # Build the full payload BEFORE opening the file: opening with 'w'
    # truncates it, so a formatting error must not be able to wipe the list.
    payload = u'\n'.join(lines).encode('utf-8')
    with open('wikis.txt', 'wb') as f:
        f.write(payload)
|
||||
|
||||
def getUsers(wiki):
    """Fetch the member list of a wiki and return the usernames found.

    Downloads the CSV export of the wiki's members page and takes the
    first column of every row after the first one (the first row is
    discarded as the header).

    Args:
        wiki: wiki subdomain name, interpolated into
            ``https://<wiki>.wikispaces.com/...``.

    Returns:
        dict mapping each username found -> ``u'?'`` (not scanned yet).
        An empty dict is returned, after printing a message, when the
        request or the parsing fails.
    """
    wikiurl = 'https://%s.wikispaces.com/wiki/members?utable=WikiTableMemberList&ut_csv=1' % (wiki)
    try:
        wikireq = urllib2.Request(wikiurl, headers={'User-Agent': 'Mozilla/5.0'})
        wikicsv = urllib2.urlopen(wikireq)
        try:
            reader = csv.reader(wikicsv, delimiter=',', quotechar='"')
            next(reader, None)  # discard the header row
            # Empty rows are skipped so a stray blank line does not throw
            # away every user already parsed.
            return dict((row[0], u'?') for row in reader if row)
        finally:
            # Always release the connection, even when parsing fails.
            wikicsv.close()
    except Exception:
        # Best-effort fetch: report and carry on. `Exception` (not a bare
        # except) lets Ctrl-C / SystemExit stop the spider.
        print('Error reading %s' % wikiurl)
        return {}
|
||||
|
||||
def getWikis(user):
    """Fetch a user's profile page and return the wikis linked from it.

    Only the part of the page between the text ``'Wikis: '`` and the next
    ``</div>`` is searched for ``https://<name>.wikispaces.com/`` links.

    Args:
        user: username, interpolated into
            ``https://www.wikispaces.com/user/view/<user>``.

    Returns:
        dict mapping each wiki name found -> ``u'?'`` (not scanned yet).
        An empty dict is returned when the page has no ``'Wikis: '``
        section, or (after printing a message) when the request fails.
    """
    wikiurl = 'https://www.wikispaces.com/user/view/%s' % (user)
    try:
        wikireq = urllib2.Request(wikiurl, headers={'User-Agent': 'Mozilla/5.0'})
        response = urllib2.urlopen(wikireq)
        try:
            html = response.read()
        finally:
            # Always release the connection.
            response.close()
        if 'Wikis: ' not in html:
            return {}
        section = html.split('Wikis: ')[1].split('</div>')[0]
        # Dots are escaped so they only match the literal '.' of the domain.
        names = re.findall(r'<a href="https://([^>]+)\.wikispaces\.com/">', section)
        return dict.fromkeys(names, u'?')
    except Exception:
        # Best-effort fetch: report and carry on. `Exception` (not a bare
        # except) lets Ctrl-C / SystemExit stop the spider.
        print('Error reading %s' % wikiurl)
        return {}
|
||||
|
||||
def main():
    """Run one spider pass: wikis -> new users, then users -> new wikis.

    Entries whose stored count is ``'?'`` have not been scanned yet. Each
    scanned entry has its count replaced by the number of results found,
    so an entry is scanned at most once across runs. Note that a failed
    or empty fetch therefore stores 0 and is not retried. Progress is
    written back to ``users.txt`` / ``wikis.txt`` after every request,
    and there is a one-second pause between requests.
    """
    print('Loading files')
    users = loadUsers()
    wikis = loadWikis()

    usersc = len(users)
    wikisc = len(wikis)
    print('Loaded %s users' % usersc)
    print('Loaded %s wikis' % wikisc)

    # find more users
    print('Scanning wikis for more users')
    # Iterate over a snapshot: `wikis` is updated and `users` is rebound
    # inside the loop.
    for wiki, numusers in list(wikis.items()):
        if numusers != '?':  # we have scanned this wiki before, skipping
            continue
        print('Scanning https://%s.wikispaces.com for users' % (wiki))
        users2 = getUsers(wiki)
        wikis[wiki] = len(users2)
        c = 0
        for x2 in users2:
            if x2 not in users:  # direct dict membership, no key list
                users[x2] = u'?'
                c += 1
        print('Found %s new users' % (c))
        if c > 0:
            saveUsers(users)
            # Reload so every count is a string again, as read from disk.
            users = loadUsers()
        saveWikis(wikis)
        time.sleep(1)
    wikis = loadWikis()

    # find more wikis
    print('Scanning users for more wikis')
    for user, numwikis in list(users.items()):
        if numwikis != '?':  # we have scanned this user before, skipping
            continue
        print('Scanning https://www.wikispaces.com/user/view/%s for wikis' % (user))
        wikis2 = getWikis(user)
        users[user] = len(wikis2)
        c = 0
        for x2 in wikis2:
            if x2 not in wikis:
                wikis[x2] = u'?'
                c += 1
        print('Found %s new wikis' % (c))
        if c > 0:
            saveWikis(wikis)
            wikis = loadWikis()
        saveUsers(users)
        time.sleep(1)
    users = loadUsers()

    print('\nSummary:')
    print('Found %s new users' % (len(users) - usersc))
    print('Found %s new wikis' % (len(wikis) - wikisc))
|
||||
|
||||
# Start the spider only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user