#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com

import csv
import datetime
import os
import random
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request
#from internetarchive import get_item

# Requirements:
# zip command (apt-get install zip)
# ia command (pip install internetarchive, and configured properly)

"""
# You need a file with access and secret keys, in two different lines
iakeysfilename = '%s/.iakeys' % (os.path.expanduser('~'))
if os.path.exists(iakeysfilename):
    accesskey = open(iakeysfilename, 'r').readlines()[0].strip()
    secretkey = open(iakeysfilename, 'r').readlines()[1].strip()
else:
    print('Error, no %s file with S3 keys for Internet Archive account' % (iakeysfilename))
    sys.exit()
"""

def saveURL(wikidomain='', url='', filename='', path='', overwrite=False, iteration=1):
    """ Download url and save it as wikidomain/path/filename, retrying on errors """
    filename2 = '%s/%s' % (wikidomain, filename)
    if path:
        filename2 = '%s/%s/%s' % (wikidomain, path, filename)
    if os.path.exists(filename2):
        if not overwrite:
            print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
            return
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlretrieve(url, filename2)
    except:
        # retry with an increasing delay, capped at maxsleep
        sleep = 10 # seconds
        maxsleep = 30
        while sleep <= maxsleep:
            try:
                print('Error while retrieving: %s' % (url))
                print('Retry in %s seconds...' % (sleep))
                time.sleep(sleep)
                urllib.request.urlretrieve(url, filename2)
                return
            except:
                sleep = sleep * 2
        print('Download failed')

    #sometimes wikispaces returns invalid data, re-download in those cases
    #only 'pages'. 'files' binaries are a pain to open and check
    if (os.path.exists(filename2) and 'pages' in path) or \
        (os.path.exists(filename2) and path == '' and filename2.split('.')[-1] in ['xml', 'html', 'csv']):
        sleep2 = 60 * iteration
        raw = ''
        try:
            with open(filename2, 'r', encoding='utf-8') as f:
                raw = f.read()
        except:
            with open(filename2, 'r', encoding='latin-1') as f:
                raw = f.read()
        if re.findall(r'(?im)<title>TES and THE Status</title>', raw):
            print('Warning: invalid content. Waiting %d seconds and re-downloading' % (sleep2))
            time.sleep(sleep2)
            saveURL(wikidomain=wikidomain, url=url, filename=filename, path=path, overwrite=overwrite, iteration=iteration+1)
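# Example (hypothetical wiki and page names): a call like
#   saveURL(wikidomain='mywiki.wikispaces.com',
#           url='https://mywiki.wikispaces.com/Some+Page',
#           filename='Some+Page.html', path='pages')
# writes the response to mywiki.wikispaces.com/pages/Some+Page.html, retrying
# with increasing delays and re-downloading if Wikispaces returns an error page.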
def undoHTMLEntities(text=''):
    """ Undo some HTML codes """

    # i guess only < > & " ' need conversion
    # http://www.w3schools.com/html/html_entities.asp
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    text = re.sub('&#039;', '\'', text)

    return text

def convertHTML2Wikitext(wikidomain='', filename='', path=''):
    """ Extract the raw wikitext from the HTML returned by the /page/code/ view """
    wikitext = ''
    wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
    if not os.path.exists(wikitextfile):
        print('Error retrieving wikitext, page is a redirect probably')
        return
    with open(wikitextfile, 'r') as f:
        wikitext = f.read()
    with open(wikitextfile, 'w') as f:
        m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
        if m:
            try:
                wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
                wikitext = undoHTMLEntities(text=wikitext)
            except:
                pass
        f.write(wikitext)

def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
    """ Download a page: current HTML, raw wikitext and history CSV """
    pagenameplus = re.sub(' ', '+', pagename)
    pagename_ = urllib.parse.quote(pagename)

    #page current revision (html & wikitext)
    pageurl = '%s/%s' % (wikiurl, pagename_)
    filename = '%s.html' % (pagenameplus)
    print('Downloading page: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
    pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
    filename2 = '%s.wikitext' % (pagenameplus)
    print('Downloading page: %s' % (filename2))
    saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
    convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')

    #csv with page history
    csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
    csvfilename = '%s.history.csv' % (pagenameplus)
    print('Downloading page: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)

def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
    """ Download a file at full resolution and its history CSV """
    filenameplus = re.sub(' ', '+', filename)
    filename_ = urllib.parse.quote(filename)

    #file full resolution
    fileurl = '%s/file/view/%s' % (wikiurl, filename_)
    filename = filenameplus
    print('Downloading file: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)

    #csv with file history
    csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
    csvfilename = '%s.history.csv' % (filenameplus)
    print('Downloading file: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)

def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
    """ Download every page and file listed in the wiki content CSV """
    print('Downloading Pages and Files from %s' % (wikiurl))
    #csv with all pages and files
    csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
    saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
    #download every page and file
    totallines = 0
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
        totallines = len(f.read().splitlines()) - 1
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
        filesc = 0
        pagesc = 0
        print('This wiki has %d pages and files' % (totallines))
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in rows:
            if row[0] == 'file':
                filesc += 1
                filename = row[1]
                downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
            elif row[0] == 'page':
                pagesc += 1
                pagename = row[1]
                downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
            if (filesc + pagesc) % 10 == 0:
                print('  Progress: %d of %d' % ((filesc + pagesc), totallines))
    print('  Progress: %d of %d' % ((filesc + pagesc), totallines))
    print('Downloaded %d pages' % (pagesc))
    print('Downloaded %d files' % (filesc))

def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading sitemap.xml')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)

def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading index.html')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)
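# Resulting dump layout per wiki, as produced by the helpers above
# (illustrative, for a hypothetical mywiki.wikispaces.com):
#
#   mywiki.wikispaces.com/
#       index.html                  main page (downloadMainPage)
#       sitemap.xml                 sitemap (downloadSitemap)
#       pages-and-files.csv         content listing (downloadPagesAndFiles)
#       pages/<Page>.html           current revision HTML
#       pages/<Page>.wikitext       raw wikitext
#       pages/<Page>.history.csv    page history
#       files/<File>                file at full resolution
#       files/<File>.history.csv    file history
#       <logo file>                 wiki logo (downloadLogo, below)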
def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
    """ Download the wiki logo referenced from index.html and return its filename """
    index = '%s/index.html' % (wikidomain)
    if os.path.exists(index):
        raw = ''
        try:
            with open(index, 'r', encoding='utf-8') as f:
                raw = f.read()
        except:
            with open(index, 'r', encoding='latin-1') as f:
                raw = f.read()
        m = re.findall(r'class="WikiLogo WikiElement"><img src="([^"]+?)"', raw)
        if m:
            logourl = m[0]
            logofilename = logourl.split('/')[-1]
            print('Downloading logo')
            saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
            return logofilename
    return ''
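# main() below calls printhelp() and duckduckgo(). Minimal sketches of both
# follow; the exact usage text, search queries and loop bounds are assumptions:
# printhelp() prints the command-line options handled in main() and exits, and
# duckduckgo() yields candidate *.wikispaces.com base URLs found through
# DuckDuckGo's HTML search endpoint.

def printhelp():
    """ Print usage and exit """
    helptext = """This script downloads (and optionally uploads) Wikispaces wikis.

Usage: python3 wikispaces.py <wikiurl|wikilist.txt|duckduckgo> [options]

Options:
--upload        upload compressed dump to Internet Archive
--admin         add item to the WikiTeam collection (collection admins only)
--overwrite     download again even if files exist locally
--overwrite-ia  upload again even if the item exists on Internet Archive
--help          print this help text
"""
    print(helptext)
    sys.exit()

def duckduckgo():
    """ Yield wikispaces.com wiki URLs discovered via DuckDuckGo HTML search """
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)
    wikis = []
    for _ in range(100000):
        # random two-number query to get varied result pages
        url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikispaces.com' % (random.randint(100, 5000), random.randint(1000, 9999))
        print('URL search', url)
        try:
            html = urllib.request.urlopen(url).read().decode('utf-8')
        except:
            print('Search error')
            time.sleep(30)
            continue
        html = urllib.parse.unquote(html)
        for wiki in re.findall(r'://([^/]+?\.wikispaces\.com)', html):
            wiki = 'https://' + re.sub(r'^www\.', '', wiki)
            if wiki not in wikis:
                wikis.append(wiki)
                yield wiki
        sleep = random.randint(5, 20)
        print('Sleeping %d seconds' % (sleep))
        time.sleep(sleep)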
def main():
    upload = False
    isadmin = False
    overwrite = False
    overwriteia = False
    if len(sys.argv) < 2:
        printhelp()
    param = sys.argv[1]
    if not param:
        printhelp()
    if len(sys.argv) > 2:
        if '--upload' in sys.argv:
            upload = True
        if '--admin' in sys.argv:
            isadmin = True
        if '--overwrite' in sys.argv:
            overwrite = True
        if '--overwrite-ia' in sys.argv:
            overwriteia = True
        if '--help' in sys.argv:
            printhelp()

    wikilist = []
    if '://' in param:
        wikilist.append(param.rstrip('/'))
    elif param.lower() == 'duckduckgo':
        wikilist = duckduckgo()
        #for wiki in wikilist:
        #    print(wiki)
    else:
        with open(param, 'r') as f:
            wikilist = f.read().strip().splitlines()
            wikilist2 = []
            for wiki in wikilist:
                wikilist2.append(wiki.rstrip('/'))
            wikilist = wikilist2

    for wikiurl in wikilist:
        wikidomain = wikiurl.split('://')[1].split('/')[0]
        print('\n')
        print('#'*40, '\n Downloading:', wikiurl)
        print('#'*40, '\n')

        if upload and not overwriteia:
            itemid = 'wiki-%s' % (wikidomain)
            try:
                iahtml = ''
                try:
                    iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
                except:
                    time.sleep(10)
                    iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
                if iahtml and not re.findall(r'(?im)Item cannot be found', iahtml):
                    if not overwriteia:
                        print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
                        print('You can find it in https://archive.org/details/%s' % (itemid))
                        time.sleep(1)
                        continue
            except:
                pass

        dirfiles = '%s/files' % (wikidomain)
        if not os.path.exists(dirfiles):
            print('Creating directory %s' % (dirfiles))
            os.makedirs(dirfiles)
        dirpages = '%s/pages' % (wikidomain)
        if not os.path.exists(dirpages):
            print('Creating directory %s' % (dirpages))
            os.makedirs(dirpages)

        sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
        downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
        if not os.path.exists('%s/sitemap.xml' % (wikidomain)):
            print('Error, wiki was probably deleted. Skipping wiki...')
            continue
        else:
            sitemapraw = ''
            try:
                with open('%s/sitemap.xml' % (wikidomain), encoding='utf-8') as g:
                    sitemapraw = g.read()
            except:
                with open('%s/sitemap.xml' % (wikidomain), encoding='latin-1') as g:
                    sitemapraw = g.read()
            if re.search(r'(?im)<h1>This wiki has been deactivated</h1>', sitemapraw):
                print('Error, wiki was deactivated. Skipping wiki...')
                continue

        downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
        if not os.path.exists('%s/index.html' % (wikidomain)):
            print('Error, wiki was probably deleted or expired. Skipping wiki...')
            continue
        else:
            indexraw = ''
            try:
                with open('%s/index.html' % (wikidomain), encoding='utf-8') as g:
                    indexraw = g.read()
            except:
                with open('%s/index.html' % (wikidomain), encoding='latin-1') as g:
                    indexraw = g.read()
            if re.search(r'(?im)<h1>Subscription Expired</h1>', indexraw):
                print('Error, wiki subscription expired. Skipping wiki...')
                continue

        downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
        logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)

        if upload:
            itemid = 'wiki-%s' % (wikidomain)
            print('\nCompressing dump...')
            wikidir = wikidomain
            os.chdir(wikidir)
            print('Changed directory to', os.getcwd())
            wikizip = '%s.zip' % (wikidomain)
            subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
            os.chdir('..')
            print('Changed directory to', os.getcwd())

            print('\nUploading to Internet Archive...')
            indexfilename = '%s/index.html' % (wikidir)
            if not os.path.exists(indexfilename):
                print('\nError dump incomplete, skipping upload\n')
                continue
            indexhtml = ''
            try:
                with open(indexfilename, 'r', encoding='utf-8') as f:
                    indexhtml = f.read()
            except:
                with open(indexfilename, 'r', encoding='latin-1') as f:
                    indexhtml = f.read()

            wikititle = ''
            try:
                wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
            except:
                wikititle = wikidomain
            if not wikititle:
                wikititle = wikidomain
            wikititle = wikititle.replace("\\'", " ")
            wikititle = wikititle.replace('\\"', " ")
            itemtitle = 'Wiki - %s' % wikititle
            itemdesc = '<a href="%s">%s</a> dumped with <a href="https://github.com/WikiTeam/wikiteam" rel="nofollow">WikiTeam</a> tools.' % (wikiurl, wikititle)
            itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
            itemoriginalurl = wikiurl
            itemlicenseurl = ''
            m = ''
            try:
                m = re.findall(r'<a rel="license" href="([^"]+?)">', indexhtml.split('