2
0
mirror of https://github.com/WikiTeam/wikiteam synced 2024-11-12 07:12:41 +00:00

duckduckgo spider

This commit is contained in:
emijrp 2018-05-09 13:41:13 +02:00
parent 83158d4506
commit 7280c89b3b
3 changed files with 9196 additions and 34 deletions

View File

@ -37,39 +37,50 @@ def main():
wikis.sort()
print('Loaded %d wikis from file' % (len(wikis)))
for word in words:
print('Word', word)
word_ = re.sub(' ', '+', word)
url = ''
r = random.randint(0, 3)
if r == 0:
url = 'https://duckduckgo.com/html/?q=%s%%20site:wikispaces.com' % (word_)
elif r == 1:
url = 'https://duckduckgo.com/html/?q=%s%%20wikispaces.com' % (word_)
elif r == 2:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(1000, 2000))
elif r == 3:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (random.randint(1000, 2000), word_)
else:
url = 'https://duckduckgo.com/html/?q=%s%%20site:wikispaces.com' % (word_)
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
sys.exit()
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
for i in range(1, 100):
random.shuffle(words)
for word in words:
print('Word', word)
word_ = re.sub(' ', '+', word)
url = ''
r = random.randint(0, 10)
if r == 0:
url = 'https://duckduckgo.com/html/?q=%s%%20site:wikispaces.com' % (word_)
elif r == 1:
url = 'https://duckduckgo.com/html/?q=%s%%20wikispaces.com' % (word_)
elif r == 2:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
elif r == 3:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (random.randint(100, 3000), word_)
else:
url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20wikispaces.com' % (word_, random.randint(100, 3000))
print('URL search', url)
try:
html = urllib.request.urlopen(url).read().decode('utf-8')
except:
print('Search error')
sys.exit()
html = urllib.parse.unquote(html)
m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
for wiki in m:
wiki = 'https://' + wiki
if not wiki in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open('wikispaces-duckduckgo.txt', 'w') as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r'https://www\.', 'https://', wiki)
if not wiki in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
print(wiki)
with open('wikispaces-duckduckgo.txt', 'w') as f:
wikis.sort()
f.write('\n'.join(wikis))
time.sleep(random.randint(5,15))
f.write('\n'.join(wikis))
print('%d wikis found' % (len(wikis)))
sleep = random.randint(5,20)
print('Sleeping %d seconds' % (sleep))
time.sleep(sleep)
if __name__ == '__main__':
main()

File diff suppressed because it is too large. (Load Diff)

View File

@ -104,5 +104,111 @@ mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria