Mirror of https://github.com/WikiTeam/wikiteam, synced 2024-11-12 07:12:41 +00:00

Merge branch 'master' of https://github.com/WikiTeam/wikiteam

commit ebc02a3b45
listsofwikis/mediawiki/mediawikis_2018-alive.txt (new file, 9651 lines)
    File diff suppressed because it is too large
@@ -55,16 +55,16 @@ def main():
     print dbname
     first = dbname[0]
     # There are one-letter dbnames; the second letter is replaced by an underscore
-    # http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.gz
+    # http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.7z
     try:
         second = dbname[1]
     except:
         second = '_'
     base = 'http://s3.amazonaws.com/wikia_xml_dumps/' + first + '/' \
         + first + second + '/' + dbname
-    full = base + '_pages_full.xml.gz'
+    full = base + '_pages_full.xml.7z'
     print full
-    current = base + '_pages_current.xml.gz'
+    current = base + '_pages_current.xml.7z'
     images = base + '_images.tar'
     try:
         #subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -88,6 +88,7 @@ def getWikis(user):
     return {}

 def main():
+    sleep = 0.2
     users = loadUsers()
     wikis = loadWikis()

@@ -115,7 +116,7 @@ def main():
         saveUsers(users)
         users = loadUsers()
         saveWikis(wikis)
-        time.sleep(1)
+        time.sleep(sleep)
         wikis = loadWikis()

     # find more wikis
@@ -136,7 +137,7 @@ def main():
         saveWikis(wikis)
         wikis = loadWikis()
         saveUsers(users)
-        time.sleep(1)
+        time.sleep(sleep)
         users = loadUsers()

     print '\nSummary:'
listsofwikis/wikispaces/wikispaces00 (new file, 1000 lines)
    File diff suppressed because it is too large
listsofwikis/wikispaces/wikispaces01 (new file, 1000 lines)
    File diff suppressed because it is too large
listsofwikis/wikispaces/wikispaces02 (new file, 1000 lines)
    File diff suppressed because it is too large
listsofwikis/wikispaces/wikispaces03 (new file, 1000 lines)
    File diff suppressed because it is too large
listsofwikis/wikispaces/wikispaces04 (new file, 1000 lines)
    File diff suppressed because it is too large
wikispaces.py (new file, 314 lines)
@@ -0,0 +1,314 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com

import csv
import datetime
import os
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request

# Requirements:
# zip command (apt-get install zip)
# ia command (pip install internetarchive, and configured properly)

def saveURL(wikidomain='', url='', filename='', path='', overwrite=False):
    filename2 = '%s/%s' % (wikidomain, filename)
    if path:
        filename2 = '%s/%s/%s' % (wikidomain, path, filename)
    if os.path.exists(filename2):
        if not overwrite:
            print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
            return
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlretrieve(url, filename2)
    except:
        sleep = 10 # seconds
        maxsleep = 30
        while sleep <= maxsleep:
            try:
                print('Error while retrieving: %s' % (url))
                print('Retry in %s seconds...' % (sleep))
                time.sleep(sleep)
                urllib.request.urlretrieve(url, filename2)
                return
            except:
                sleep = sleep * 2
        print('Download failed')

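# A minimal sketch of the retry schedule used above (illustrative only, not
# part of the script): with sleep starting at 10 and maxsleep at 30, retries
# happen after 10 s and 20 s, then the loop exits and the download fails.
sleep, maxsleep = 10, 30
schedule = []
while sleep <= maxsleep:
    schedule.append(sleep)
    sleep *= 2
print(schedule)  # [10, 20]
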
def undoHTMLEntities(text=''):
    """ Undo some HTML codes """

    # i guess only < > & " ' need conversion
    # http://www.w3schools.com/html/html_entities.asp
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    text = re.sub('&#039;', '\'', text)

    return text

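# A possible simplification, not what this commit ships: Python 3's standard
# library already decodes all named and numeric entities in one call.
import html

def undoHTMLEntities2(text=''):
    """ Same idea as above, via the stdlib """
    return html.unescape(text)  # e.g. '&lt;b&gt;' -> '<b>'
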
def convertHTML2Wikitext(wikidomain='', filename='', path=''):
    wikitext = ''
    wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
    if not os.path.exists(wikitextfile):
        print('Error retrieving wikitext, the page is probably a redirect')
        return
    with open(wikitextfile, 'r') as f:
        wikitext = f.read()
    with open(wikitextfile, 'w') as f:
        m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
        if m:
            try:
                wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
                wikitext = undoHTMLEntities(text=wikitext)
            except:
                pass
        f.write(wikitext)

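# A worked example of the extraction above, on a hypothetical /page/code/
# response (the sample HTML is illustrative, not taken from Wikispaces):
sample = ('<div class="WikispacesContent WikispacesBs3">\n<pre>'
          "'''Bold''' text with &lt;nowiki&gt; markup</pre>")
m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', sample)
wikitext = sample.split(m[0])[1].split('</pre>')[0].strip()
print(undoHTMLEntities(text=wikitext))  # '''Bold''' text with <nowiki> markup
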
def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
    pagenameplus = re.sub(' ', '+', pagename)
    pagename_ = urllib.parse.quote(pagename)

    # page current revision (html & wikitext)
    pageurl = '%s/%s' % (wikiurl, pagename_)
    filename = '%s.html' % (pagenameplus)
    print('Downloading page: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
    pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
    filename2 = '%s.wikitext' % (pagenameplus)
    print('Downloading page: %s' % (filename2))
    saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
    convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')

    # csv with page history
    csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
    csvfilename = '%s.history.csv' % (pagenameplus)
    print('Downloading page: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)

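# To make the three requests above concrete: for the page name 'My Page' on a
# hypothetical wiki, the URLs resolve as follows (illustrative values only):
wikiurl = 'https://example.wikispaces.com'
pagename_ = urllib.parse.quote('My Page')            # 'My%20Page'
print('%s/%s' % (wikiurl, pagename_))                # page HTML
print('%s/page/code/%s' % (wikiurl, pagename_))      # raw wikitext
print('%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1'
      % (wikiurl, pagename_))                        # page history as CSV
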
def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
    filenameplus = re.sub(' ', '+', filename)
    filename_ = urllib.parse.quote(filename)

    # file full resolution
    fileurl = '%s/file/view/%s' % (wikiurl, filename_)
    filename = filenameplus
    print('Downloading file: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)

    # csv with file history
    csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
    csvfilename = '%s.history.csv' % (filenameplus)
    print('Downloading file: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)

def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading Pages and Files from %s' % (wikiurl))
    # csv with all pages and files
    csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
    saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
    # download every page and file
    totallines = 0
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
        totallines = len(f.read().splitlines()) - 1
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
        filesc = 0
        pagesc = 0
        print('This wiki has %d pages and files' % (totallines))
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in rows:
            if row[0] == 'file':
                filesc += 1
                filename = row[1]
                downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
            elif row[0] == 'page':
                pagesc += 1
                pagename = row[1]
                downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
            if (filesc + pagesc) % 10 == 0:
                print('  Progress: %d of %d' % ((filesc + pagesc), totallines))
    print('  Progress: %d of %d' % ((filesc + pagesc), totallines))
    print('Downloaded %d pages' % (pagesc))
    print('Downloaded %d files' % (filesc))

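# The dispatch above only inspects the first two CSV columns. A hypothetical
# two-row export (real exports may carry extra columns) illustrates it:
import io
for row in csv.reader(io.StringIO('page,Main Page\nfile,logo.png')):
    print(row[0], '->', row[1])  # page -> Main Page, then file -> logo.png
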
def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading sitemap.xml')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)

def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading index.html')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)

def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
    index = '%s/index.html' % (wikidomain)
    if os.path.exists(index):
        with open(index, 'r') as f:
            m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', f.read())
            if m:
                logourl = m[0]
                logofilename = logourl.split('/')[-1]
                print('Downloading logo')
                saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
                return logofilename
    return ''

def printhelp():
    helptext = """This script downloads (and uploads) WikiSpaces wikis.

Parameters available:

--upload: upload compressed file with downloaded wiki
--admin: add item to WikiTeam collection (if you are an admin in that collection)
--overwrite: download again even if files exist locally
--overwrite-ia: upload again to Internet Archive even if item exists there
--help: prints this help text

Examples:

python3 wikispaces.py https://mywiki.wikispaces.com
   It downloads that wiki

python3 wikispaces.py wikis.txt
   It downloads a list of wikis (file format is a URL per line)

python3 wikispaces.py https://mywiki.wikispaces.com --upload
   It downloads that wiki, compresses it and uploads it to the Internet Archive
"""
    print(helptext)
    sys.exit()

def main():
    upload = False
    isadmin = False
    overwrite = False
    overwriteia = False
    if len(sys.argv) < 2:
        printhelp()
    param = sys.argv[1]
    if not param:
        printhelp()
    if len(sys.argv) > 2:
        if '--upload' in sys.argv:
            upload = True
        if '--admin' in sys.argv:
            isadmin = True
        if '--overwrite' in sys.argv:
            overwrite = True
        if '--overwrite-ia' in sys.argv:
            overwriteia = True
        if '--help' in sys.argv:
            printhelp()

    wikilist = []
    if '://' in param:
        wikilist.append(param.rstrip('/'))
    else:
        with open(param, 'r') as f:
            wikilist = f.read().strip().splitlines()
            wikilist2 = []
            for wiki in wikilist:
                wikilist2.append(wiki.rstrip('/'))
            wikilist = wikilist2

    for wikiurl in wikilist:
        wikidomain = wikiurl.split('://')[1].split('/')[0]
        print('\n')
        print('#'*40, '\n Downloading:', wikiurl)
        print('#'*40, '\n')

        if upload and not overwriteia:
            itemid = 'wiki-%s' % (wikidomain)
            try:
                iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
                if not re.findall(r'Item cannot be found', iahtml):
                    if not overwriteia:
                        print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
                        print('You can find it in https://archive.org/details/%s' % (itemid))
                        continue
            except:
                pass

        dirfiles = '%s/files' % (wikidomain)
        if not os.path.exists(dirfiles):
            print('Creating directory %s' % (dirfiles))
            os.makedirs(dirfiles)
        dirpages = '%s/pages' % (wikidomain)
        if not os.path.exists(dirpages):
            print('Creating directory %s' % (dirpages))
            os.makedirs(dirpages)
        downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
        sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
        downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
        downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
        logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)

        if upload:
            itemid = 'wiki-%s' % (wikidomain)
            print('\nCompressing dump...')
            wikidir = wikidomain
            os.chdir(wikidir)
            print('Changed directory to', os.getcwd())
            wikizip = '%s.zip' % (wikidomain)
            subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
            os.chdir('..')
            print('Changed directory to', os.getcwd())

            print('\nUploading to Internet Archive...')
            indexfilename = '%s/index.html' % (wikidir)
            if not os.path.exists(indexfilename):
                print('\nError: dump incomplete, skipping upload\n')
                continue
            f = open(indexfilename, 'r')
            indexhtml = f.read()
            f.close()
            wikititle = ''
            try:
                wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
            except:
                wikititle = wikidomain
            if not wikititle:
                wikititle = wikidomain
            itemtitle = 'Wiki - %s' % wikititle
            itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
            itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
            itemoriginalurl = wikiurl
            itemlicenseurl = ''
            # assumes index.html contains a WikiLicense div; raises IndexError otherwise
            m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
            if m:
                itemlicenseurl = m[0]
            if not itemlicenseurl:
                itemtags.append('unknowncopyright')
            itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
            itemcollection = isadmin and 'wikiteam' or 'opensource'
            itemlang = 'Unknown'
            itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
            itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''
            subprocess.call('ia' + ' upload %s %s %s --metadata="mediatype:web" --metadata="collection:%s" --metadata="title:%s" --metadata="description:%s" --metadata="language:%s" --metadata="last-updated-date:%s" --metadata="originalurl:%s" %s %s' % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemoriginalurl, itemlicenseurl and '--metadata="licenseurl:%s"' % (itemlicenseurl) or '', itemtags_), shell=True)
            print('You can find it in https://archive.org/details/%s' % (itemid))

if __name__ == "__main__":
    main()
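Since the script already depends on the internetarchive package for the ia command, the shell-out in main() could equally use the package's Python API. A minimal sketch with a hypothetical item id, file name, and metadata; not what this commit does:

from internetarchive import upload

responses = upload('wiki-example.wikispaces.com',  # hypothetical item id
                   files=['wiki-example.wikispaces.com.zip'],
                   metadata={'mediatype': 'web',
                             'collection': 'opensource',
                             'title': 'Wiki - example',
                             'originalurl': 'https://example.wikispaces.com'})
print(responses[0].status_code)  # 200 on success

This assumes ia configure has already stored credentials, as the requirements comment at the top of the file notes.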