From 2fe1c0b6b2c2c45d22c8a62a566ca32560dcedf1 Mon Sep 17 00:00:00 2001 From: emijrp Date: Sun, 6 May 2018 13:19:21 +0200 Subject: [PATCH] uploader included --- wikispaces.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/wikispaces.py b/wikispaces.py index 2b41532..07703e3 100644 --- a/wikispaces.py +++ b/wikispaces.py @@ -19,12 +19,18 @@ # Documentation for developers: http://wikiteam.readthedocs.com import csv +import datetime import os import re +import subprocess import sys import time import urllib.request +# Requirements: +# zip command (apt-get install zip) +# ia command (pip install internetarchive) + def saveURL(wikidomain='', url='', filename='', path=''): filename2 = '%s/%s' % (wikidomain, filename) if path: @@ -142,12 +148,29 @@ def downloadPagesAndFiles(wikidomain='', wikiurl=''): print('Downloaded %d files' % (filesc)) def downloadSitemap(wikidomain='', wikiurl=''): + print('Downloading sitemap.xml') saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='') def downloadMainPage(wikidomain='', wikiurl=''): + print('Downloading index.html') saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='') +def downloadLogo(wikidomain='', wikiurl=''): + index = '%s/index.html' % (wikidomain) + if os.path.exists(index): + with open(index, 'r') as f: + m = re.findall(r'class="WikiLogo WikiElement"> 2: + if '--upload' in sys.argv: + upload = True + if '--admin' in sys.argv: + isadmin = True wikilist = [] if '://' in param: @@ -169,17 +197,65 @@ def main(): for wikiurl in wikilist: wikidomain = wikiurl.split('://')[1].split('/')[0] - print('#'*40,'\n Analyzing:', wikiurl) + print('#'*40,'\n Downloading:', wikiurl) print('#'*40,'\n') - print('Creating directories for %s' % (wikidomain)) - if not os.path.exists('%s/files' % (wikidomain)): - os.makedirs('%s/files' % (wikidomain)) - if not os.path.exists('%s/pages' % (wikidomain)): - os.makedirs('%s/pages' % (wikidomain)) + dirfiles = '%s/files' % (wikidomain) + if not os.path.exists(dirfiles): + print('Creating directory %s' % (dirfiles)) + os.makedirs(dirfiles) + dirpages = '%s/pages' % (wikidomain) + if not os.path.exists(dirpages): + print('Creating directory %s' % (dirpages)) + os.makedirs(dirpages) downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl) sitemapurl = 'https://%s/sitemap.xml' % (wikidomain) downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl) downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl) + logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl) + + if upload: + print('\nCompressing dump...') + wikidir = wikidomain + os.chdir(wikidir) + print('Changed directory to', os.getcwd()) + wikizip = '%s.zip' % (wikidomain) + subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True) + os.chdir('..') + print('Changed directory to', os.getcwd()) + + print('\nUploading to Internet Archive...') + indexfilename = '%s/index.html' % (wikidir) + if not os.path.exists(indexfilename): + print('\nError dump incomplete, skipping upload\n') + continue + f = open(indexfilename, 'r') + indexhtml = f.read() + f.close() + itemid = 'wiki-%s' % (wikidomain) + wikititle = '' + try: + wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip() + except: + wikititle = wikidomain + if not wikititle: + wikititle = wikidomain + itemtitle = 'Wiki - %s' % wikititle + itemdesc = '%s dumped with WikiTeam tools.' % (wikiurl, wikititle) + itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain] + itemoriginalurl = wikiurl + itemlicenseurl = '' + m = re.findall(r'', indexhtml.split('