#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com
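#
# Usage (see main() below): run this script with a Wikispaces wiki URL or with a
# text file containing one wiki URL per line, e.g.
#   https://yourwiki.wikispaces.com  or  mylistofwikis.txt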

import csv
import os
import re
import sys
import time
import urllib.parse
import urllib.request

def saveURL(wikidomain='', url='', filename='', path=''):
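    """Download url and save it as wikidomain/[path/]filename, retrying with growing delays on errors."""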
    filename2 = '%s/%s' % (wikidomain, filename)
    if path:
        filename2 = '%s/%s/%s' % (wikidomain, path, filename)
    #print(wikidomain)
    #print(url)
    #print(filename2)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlretrieve(url, filename2)
    except:
        sleep = 10 # seconds
        maxsleep = 60
        while sleep <= maxsleep:
            try:
                print('Error while retrieving: %s' % (url))
                print('Retry in %s seconds...' % (sleep))
                time.sleep(sleep)
                urllib.request.urlretrieve(url, filename2)
                break
            except:
                sleep = sleep * 2

def undoHTMLEntities(text=''):
    """ Undo some HTML codes """

    # i guess only < > & " ' need conversion
    # http://www.w3schools.com/html/html_entities.asp
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    text = re.sub('&#039;', '\'', text)

    return text

def convertHTML2Wikitext(wikidomain='', filename='', path=''):
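    """Extract the wikitext from the <pre> block of a downloaded Wikispaces page and overwrite the file with it."""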
    wikitext = ''
    wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
    if not os.path.exists(wikitextfile):
        print('Error retrieving wikitext; the page is probably a redirect')
        return
    with open(wikitextfile, 'r') as f:
        wikitext = f.read()
    with open(wikitextfile, 'w') as f:
        m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
        if m:
            try:
                wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
                wikitext = undoHTMLEntities(text=wikitext)
            except:
                wikitext = ''
                print('Error extracting wikitext.')
        else:
            wikitext = ''
            print('Error extracting wikitext.')
        f.write(wikitext)

def downloadPage(wikidomain='', wikiurl='', pagename=''):
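    """Download the current HTML and wikitext revisions of a page, plus its history CSV, into wikidomain/pages/."""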
    pagenameplus = re.sub(' ', '+', pagename)
    pagename_ = urllib.parse.quote(pagename)

    #page current revision (html & wikitext)
    pageurl = '%s/%s' % (wikiurl, pagename_)
    filename = '%s.html' % (pagenameplus)
    saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages')
    pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
    filename2 = '%s.wikitext' % (pagenameplus)
    saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages')
    convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')

    #csv with page history
    csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
    csvfilename = '%s.history.csv' % (pagenameplus)
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages')

def downloadFile(wikidomain='', wikiurl='', filename=''):
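    """Download a file at full resolution, plus its history CSV, into wikidomain/files/."""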
    filenameplus = re.sub(' ', '+', filename)
    filename_ = urllib.parse.quote(filename)

    #file full resolution
    fileurl = '%s/file/view/%s' % (wikiurl, filename_)
    filename = filenameplus
    saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files')

    #csv with file history
    csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
    csvfilename = '%s.history.csv' % (filenameplus)
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files')

def downloadPagesAndFiles(wikidomain='', wikiurl=''):
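    """Fetch the pages-and-files.csv listing for the wiki and download every page and file it lists."""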
    print('Downloading Pages and Files from %s' % (wikiurl))
    #csv all pages and files
    csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
    saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
    #download every page and file
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
        filesc = 0
        pagesc = 0
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in rows:
            if row[0] == 'file':
                filesc += 1
                filename = row[1]
                print('Downloading file: %s' % (filename))
                downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename)
            elif row[0] == 'page':
                pagesc += 1
                pagename = row[1]
                print('Downloading page: %s' % (pagename))
                downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename)
    print('Downloaded %d pages' % (pagesc))
    print('Downloaded %d files' % (filesc))

def downloadSitemap(wikidomain='', wikiurl=''):
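    """Save the given sitemap URL as wikidomain/sitemap.xml."""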
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='')

def downloadMainPage(wikidomain='', wikiurl=''):
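    """Save the wiki front page as wikidomain/index.html."""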
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='')

def main():
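    """Parse the command line (a wiki URL or a file with one URL per line) and dump each wiki."""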
    if len(sys.argv) < 2:
        print('Please provide a wikispaces wiki URL or a filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
        sys.exit()
    param = sys.argv[1]
    if not param:
        print('Please provide a wikispaces wiki URL or a filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
        sys.exit()

    wikilist = []
    if '://' in param:
        wikilist.append(param.rstrip('/'))
    else:
        with open(param, 'r') as f:
            wikilist = f.read().strip().splitlines()
            wikilist2 = []
            for wiki in wikilist:
                wikilist2.append(wiki.rstrip('/'))
            wikilist = wikilist2

    for wikiurl in wikilist:
        wikidomain = wikiurl.split('://')[1].split('/')[0]
        print('#'*40,'\n Analyzing:', wikiurl)
        print('#'*40,'\n')
        print('Creating directories for %s' % (wikidomain))
        if not os.path.exists('%s/files' % (wikidomain)):
            os.makedirs('%s/files' % (wikidomain))
        if not os.path.exists('%s/pages' % (wikidomain)):
            os.makedirs('%s/pages' % (wikidomain))
        downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl)
        sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
        downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl)
        downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl)

if __name__ == "__main__":
    main()