First version of the Wikispaces spider

pull/287/head
emijrp 8 years ago
parent 82e84d59ad
commit 01ccacd138

@ -0,0 +1,121 @@
AliciaWaters,1
Arenoosh,1
BambuNatural,1
BlackSheepInn,0
BlancaRobleda,1
BleachTeach,1
Chase.Pereira,1
Dan.Paleczny,1
Dancombs98,1
Deborah.McLaren,1
DeborahMcLaren,0
Diegoc93,1
EVbusinessteacher,1
Eurapart,0
JessieRS,1
JessikaTate,1
JohnNobilski,1
Joserios11,1
Jrios885,1
JuliaSanabria,1
Justin.Dabill,1
KevinGough11,1
KevinMPA,1
Lduncan107,1
LilTlaloc,2
MBKlein,1
MFierros,1
MIGUEOAX,1
Marlenehrenberg,2
MayraVazquez1,1
Melissa63,2
Moy1976,1
MrPalmer67,1
RondaGreen,1
Ruukel,0
SamanthaElizabeth,2
ScottOsterholt1,1
TylerZybach-DeBoer,1
WINTAwiki,1
Xixim,1
abehl,1
albabcn,1
alex.villca,1
andydrumm,0
annafoster21,1
annagmoore,1
annaspenceley,1
aseremomax,1
ashleyrownd123,1
astronomyteacher,5
avillicana687,1
ayuukchacha,1
ayuukoax,1
becari,1
becaricampusqroaxaca,0
biancagchan,1
bicicletaspedromartinez,1
bugambilias,1
businesscoordinator,1
bwaters23,2
camatchitral,0
carriehurtado,1
celinabalasoto,1
chacorunner,1
charlotten22,1
charolains,1
chrismilnes,1
chtopete,3
consultoriaindigenaoaxaca,1
cristinamartinez8,0
despacharte,1
dvgovteacher,1
dvgovteacher1,1
ecabanilla,1
edgarbartolo,1
edgarraygoza95,1
englishcoordinator,1
envia,1
fcummings294,1
florencio,1
geoffb1,1
georginatrout,1
gerhardbuttner,2
gregshirley,1
hermantyler,1
insitu1,1
institutoamigosdelsol,1
jamigo55,1
jgonzalez631,1
joannazemla,1
joshdkirby,1
justinrieger22,1
katabel,1
krestow,2
lasmariposas,1
liliacoronel,1
lindaramirez3,1
louisebranch,1
ltimrott,1
lulaa,1
mariamcclain,1
matthewmucha24,1
nutti,1
oaxdave,1
oddyeti,1
ojoqtv,1
patwilson2,1
planeta,0
raylorscheider,1
raymondkuntz,1
respontour,0
salliegrayson,1
sandraluz2,1
sergiolazomendoza,1
sherrilivingston,1
susanbeanaycock,1
thistourismweek,1
timeunlimited,1
turismooaxaca,1
victoria.alahuzos,1
willcorning,1

@ -0,0 +1,11 @@
astronomylinks,35
drkrestow,1
dvaceacademy,16
dvapphysics,1
dvsra,1
enviabusiness,4
enviaenglish,8
gccastronomy,1
martiangovernment,2
oaxaca,31
planeta,35

@ -0,0 +1,147 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2016 wikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import re
import time
import urllib2
def loadUsers():
    """Load the username -> wiki-count map from users.txt.

    Returns a dict mapping each username to its count *as a string*:
    counts may be the placeholder '?' for users not yet scanned.
    """
    users = {}
    # 'with' guarantees the file is closed even if a line fails to parse
    with open('users.txt', 'r') as f:
        for line in f.read().strip().splitlines():
            # split on the LAST comma only, so a username containing a
            # comma still parses (the count is always the final field)
            username, numwikis = line.rsplit(',', 1)
            users[username] = numwikis
    return users
def loadWikis():
    """Load the wikiname -> user-count map from wikis.txt.

    Returns a dict mapping each wiki name to its count *as a string*:
    counts may be the placeholder '?' for wikis not yet scanned.
    """
    wikis = {}
    # 'with' guarantees the file is closed even if a line fails to parse
    with open('wikis.txt', 'r') as f:
        for line in f.read().strip().splitlines():
            # split on the LAST comma only; the count is the final field
            wikiname, numusers = line.rsplit(',', 1)
            wikis[wikiname] = numusers
    return wikis
def saveUsers(users):
    """Write the username -> wiki-count map to users.txt.

    Rows are 'username,count', sorted, UTF-8 encoded, no trailing
    newline; any existing file is overwritten.
    """
    import io  # local import keeps the top-of-file import block untouched
    rows = sorted(u'%s,%s' % (name, count) for name, count in users.items())
    # io.open writes unicode + encoding on both Python 2 and 3, replacing
    # the py2-only open('w') + manual .encode('utf-8') pattern; 'with'
    # guarantees the handle is flushed and closed
    with io.open('users.txt', 'w', encoding='utf-8') as f:
        f.write(u'\n'.join(rows))
def saveWikis(wikis):
    """Write the wikiname -> user-count map to wikis.txt.

    Rows are 'wikiname,count', sorted, UTF-8 encoded, no trailing
    newline; any existing file is overwritten.
    """
    import io  # local import keeps the top-of-file import block untouched
    rows = sorted(u'%s,%s' % (name, count) for name, count in wikis.items())
    # io.open writes unicode + encoding on both Python 2 and 3, replacing
    # the py2-only open('w') + manual .encode('utf-8') pattern
    with io.open('wikis.txt', 'w', encoding='utf-8') as f:
        f.write(u'\n'.join(rows))
def getUsers(wiki):
wikiurl = 'https://%s.wikispaces.com/wiki/members?utable=WikiTableMemberList&ut_csv=1' % (wiki)
try:
wikireq = urllib2.Request(wikiurl, headers={ 'User-Agent': 'Mozilla/5.0' })
wikicsv = urllib2.urlopen(wikireq)
reader = csv.reader(wikicsv, delimiter=',', quotechar='"')
headers = next(reader, None)
usersfound = {}
for row in reader:
usersfound[row[0]] = u'?'
return usersfound
except:
print 'Error reading', wikiurl
return {}
def getWikis(user):
wikiurl = 'https://www.wikispaces.com/user/view/%s' % (user)
try:
wikireq = urllib2.Request(wikiurl, headers={ 'User-Agent': 'Mozilla/5.0' })
html = urllib2.urlopen(wikireq).read()
if 'Wikis: ' in html:
html = html.split('Wikis: ')[1].split('</div>')[0]
wikisfound = {}
for x in re.findall(ur'<a href="https://([^>]+).wikispaces.com/">', html):
wikisfound[x] = u'?'
return wikisfound
return {}
except:
print 'Error reading', wikiurl
return {}
def main():
    """Crawl Wikispaces breadth-wise: pass 1 scans unscanned wikis for
    member usernames, pass 2 scans unscanned users' profiles for wiki
    names, persisting users.txt / wikis.txt after every network scan so
    an interrupted run loses at most one wiki/user of progress. A count
    of '?' marks an entry not yet scanned."""
    users = loadUsers()
    wikis = loadWikis()
    usersc = len(users)   # baseline counts for the end-of-run summary
    wikisc = len(wikis)
    print 'Loading files'
    print 'Loaded', usersc, 'users'
    print 'Loaded', wikisc, 'wikis'
    # find more users
    print 'Scanning wikis for more users'
    # NOTE(review): .items() snapshots the dict (Python 2 returns a list),
    # so reassigning users/wikis inside the loop does not disturb iteration
    for wiki, numusers in wikis.items():
        if numusers != '?': #we have scanned this wiki before, skipping
            continue
        print 'Scanning https://%s.wikispaces.com for users' % (wiki)
        users2 = getUsers(wiki)
        wikis[wiki] = len(users2)  # mark this wiki as scanned
        c = 0
        for x2, y2 in users2.items():
            if x2 not in users.keys():
                users[x2] = u'?'  # queue the new user for pass 2
                c += 1
        print 'Found %s new users' % (c)
        if c > 0:
            saveUsers(users)
            users = loadUsers()
        saveWikis(wikis)
        time.sleep(1)  # throttle between HTTP requests
        wikis = loadWikis()
    # find more wikis
    print 'Scanning users for more wikis'
    for user, numwikis in users.items():
        if numwikis != '?': #we have scanned this user before, skipping
            continue
        print 'Scanning https://www.wikispaces.com/user/view/%s for wikis' % (user)
        wikis2 = getWikis(user)
        users[user] = len(wikis2)  # mark this user as scanned
        c = 0
        for x2, y2 in wikis2.items():
            if x2 not in wikis.keys():
                wikis[x2] = u'?'  # queue the new wiki for a future run
                c += 1
        print 'Found %s new wikis' % (c)
        if c > 0:
            saveWikis(wikis)
            wikis = loadWikis()
        saveUsers(users)
        time.sleep(1)  # throttle between HTTP requests
        users = loadUsers()
    print '\nSummary:'
    print 'Found', len(users)-usersc, 'new users'
    print 'Found', len(wikis)-wikisc, 'new wikis'

if __name__ == '__main__':
    main()
Loading…
Cancel
Save