#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
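
import csv
import os
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request

# NOTE: This copy of the script lost its imports and several helper
# definitions to extraction damage. The two sketches below are hedged
# reconstructions inferred from how the surviving code calls them
# (saveURL(wikidomain=, url=, filename=, path=, overwrite=) and
# undoHTMLEntities(text=)); they are minimal stand-ins, not necessarily
# the original WikiTeam implementations.

def undoHTMLEntities(text=''):
    # Replace the most common HTML entities with their literal characters.
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    text = re.sub('&#039;', "'", text)
    return text

def saveURL(wikidomain='', url='', filename='', path='', overwrite=False):
    # Download url into wikidomain/path/filename, skipping files that already
    # exist on disk unless overwrite is True.
    filename2 = '%s/%s' % (wikidomain, filename)
    if path:
        filename2 = '%s/%s/%s' % (wikidomain, path, filename)
    if os.path.exists(filename2) and not overwrite:
        print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
        return
    try:
        urllib.request.urlretrieve(url, filename2)
    except Exception:
        # Simple single retry after a pause; the original likely handled
        # errors more elaborately.
        time.sleep(10)
        try:
            urllib.request.urlretrieve(url, filename2)
        except Exception:
            print('Error while retrieving: %s' % (url))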

def convertHTML2Wikitext(wikidomain='', filename='', path=''):
    # NOTE: The head of this function (everything before the re.findall call)
    # was lost in extraction; the signature and file handling are
    # reconstructed from the call site in downloadPage() below.
    wikitext = ''
    wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
    if not os.path.exists(wikitextfile):
        print('Error retrieving wikitext, page is probably a redirect')
        return
    with open(wikitextfile, 'r') as f:
        wikitext = f.read()
    with open(wikitextfile, 'w') as f:
        # The HTML tags inside this regex and the closing split marker were
        # eaten by extraction; reconstructed to match the <pre> block that
        # wraps the wikitext in the Wikispaces HTML export.
        m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
        if m:
            try:
                wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
                wikitext = undoHTMLEntities(text=wikitext)
            except:
                pass
        f.write(wikitext)

def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
    pagenameplus = re.sub(' ', '+', pagename)
    pagename_ = urllib.parse.quote(pagename)

    # Page current revision (html & wikitext)
    pageurl = '%s/%s' % (wikiurl, pagename_)
    filename = '%s.html' % (pagenameplus)
    print('Downloading page: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
    pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
    filename2 = '%s.wikitext' % (pagenameplus)
    print('Downloading page: %s' % (filename2))
    saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
    convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')

    # CSV with page history
    csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
    csvfilename = '%s.history.csv' % (pagenameplus)
    print('Downloading page: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)

def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
    filenameplus = re.sub(' ', '+', filename)
    filename_ = urllib.parse.quote(filename)

    # File, full resolution
    fileurl = '%s/file/view/%s' % (wikiurl, filename_)
    filename = filenameplus
    print('Downloading file: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)

    # CSV with file history
    csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
    csvfilename = '%s.history.csv' % (filenameplus)
    print('Downloading file: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)

def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading Pages and Files from %s' % (wikiurl))
    # CSV listing all pages and files
    csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
    saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
    # Download every page and file
    totallines = 0
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
        totallines = len(f.read().splitlines()) - 1
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
        filesc = 0
        pagesc = 0
        print('This wiki has %d pages and files' % (totallines))
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in rows:
            if row[0] == 'file':
                filesc += 1
                filename = row[1]
                downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
            elif row[0] == 'page':
                pagesc += 1
                pagename = row[1]
                downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
            if (filesc + pagesc) % 10 == 0:
                print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
        print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
    print('Downloaded %d pages' % (pagesc))
    print('Downloaded %d files' % (filesc))

def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading sitemap.xml')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)

def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading index.html')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)

def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
    index = '%s/index.html' % (wikidomain)
    if os.path.exists(index):
        with open(index, 'r') as f:
            # The <img src="..."> half of this regex was eaten by extraction;
            # reconstructed so the logo URL is captured from the index page.
            m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', f.read())
            if m:
                logourl = m[0]
                logofilename = logourl.split('/')[-1]
                print('Downloading logo')
                saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
                return logofilename
    return ''

# NOTE: The span between the logo regex above and the argv checks below
# (the help text and the opening of main()) was lost in extraction. The
# lines down to "if len(sys.argv) > 2:" are a reconstruction inferred from
# the surviving call sites and flag names.

def printhelp():
    print('''Usage: python3 wikispaces.py <wiki URL or file with one wiki URL per line>

Options:
--upload        Compress the dump and upload it to the Internet Archive
--admin         Add the item to the WikiTeam collection (admins only)
--overwrite     Re-download files even if they exist locally
--overwrite-ia  Upload again even if the item exists on the Internet Archive
--help          Print this help text''')
    sys.exit()

def main():
    upload = False
    isadmin = False
    overwrite = False
    overwriteia = False
    if len(sys.argv) < 2:
        printhelp()
    param = sys.argv[1]
    if len(sys.argv) > 2:
        if '--upload' in sys.argv:
            upload = True
        if '--admin' in sys.argv:
            isadmin = True
        if '--overwrite' in sys.argv:
            overwrite = True
        if '--overwrite-ia' in sys.argv:
            overwriteia = True
        if '--help' in sys.argv:
            printhelp()

    wikilist = []
    if '://' in param:
        wikilist.append(param.rstrip('/'))
    else:
        with open(param, 'r') as f:
            wikilist = f.read().strip().splitlines()
        wikilist2 = []
        for wiki in wikilist:
            wikilist2.append(wiki.rstrip('/'))
        wikilist = wikilist2

    for wikiurl in wikilist:
        wikidomain = wikiurl.split('://')[1].split('/')[0]
        print('\n')
        print('#'*40, '\n Downloading:', wikiurl)
        print('#'*40, '\n')

        # Skip wikis that are already archived, unless forced
        if upload and not overwriteia:
            itemid = 'wiki-%s' % (wikidomain)
            try:
                iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
                if not re.findall(r'Item cannot be found', iahtml):
                    print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
                    print('You can find it in https://archive.org/details/%s' % (itemid))
                    continue
            except:
                pass

        dirfiles = '%s/files' % (wikidomain)
        if not os.path.exists(dirfiles):
            print('Creating directory %s' % (dirfiles))
            os.makedirs(dirfiles)
        dirpages = '%s/pages' % (wikidomain)
        if not os.path.exists(dirpages):
            print('Creating directory %s' % (dirpages))
            os.makedirs(dirpages)

        downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
        sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
        downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
        downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
        logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)

        if upload:
            itemid = 'wiki-%s' % (wikidomain)
            print('\nCompressing dump...')
            wikidir = wikidomain
            os.chdir(wikidir)
            print('Changed directory to', os.getcwd())
            wikizip = '%s.zip' % (wikidomain)
            subprocess.call('zip -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
            os.chdir('..')
            print('Changed directory to', os.getcwd())

            print('\nUploading to Internet Archive...')
            indexfilename = '%s/index.html' % (wikidir)
            if not os.path.exists(indexfilename):
                print('\nError dump incomplete, skipping upload\n')
                continue
            with open(indexfilename, 'r') as f:
                indexhtml = f.read()

            wikititle = ''
            try:
                wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
            except:
                wikititle = wikidomain
            if not wikititle:
                wikititle = wikidomain
            itemtitle = 'Wiki - %s' % wikititle
            # The HTML anchor markup in this description string was eaten by
            # extraction; restored so both format arguments are consumed.
            itemdesc = '<a href="%s">%s</a> dumped with WikiTeam tools.' % (wikiurl, wikititle)
            itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
            itemoriginalurl = wikiurl
            itemlicenseurl = ''
            # The regex pattern and split markers here were truncated in this
            # copy; this reconstruction scans the index page for a
            # rel="license" link.
            m = re.findall(r'<a rel="license" href="([^<>"]+?)"', indexhtml)
            if m:
                itemlicenseurl = m[0]
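            # --- End of the surviving text. What follows is a hedged sketch
            # of the truncated upload step, using the `internetarchive` Python
            # library. The keys.txt convention, the metadata fields, and the
            # collection choice are assumptions carried over from other
            # WikiTeam uploaders, not the original lines of this script. ---
            import internetarchive

            # Assumption: IA S3 credentials stored one per line in keys.txt
            # (access key first, secret key second).
            with open('keys.txt', 'r') as f:
                accesskey, secretkey = [l.strip() for l in f.readlines()[:2]]
            md = {
                'mediatype': 'web',
                'collection': 'wikiteam' if isadmin else 'opensource',
                'title': itemtitle,
                'description': itemdesc,
                'originalurl': itemoriginalurl,
                'subject': '; '.join(itemtags),
            }
            if itemlicenseurl:
                md['licenseurl'] = itemlicenseurl
            item = internetarchive.get_item(itemid)
            item.upload(wikizip, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
            print('You can find it in https://archive.org/details/%s' % (itemid))

if __name__ == '__main__':
    main()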