#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
# Documentation for developers: http://wikiteam.readthedocs.com

import csv
import datetime
import os
import random
import re
import subprocess
import sys
import time
import urllib.parse
import urllib.request
#from internetarchive import get_item

# Requirements:
# zip command (apt-get install zip)
# ia command (pip install internetarchive, and configured properly)
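#
# A minimal setup sketch (an assumption about your environment, not part of
# the original requirements; exact prompts may vary by internetarchive version):
#   pip install internetarchive
#   ia configure   # asks for your archive.org credentials and stores them for the ia command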

"""
# You need a file with access and secret keys, in two different lines
iakeysfilename = '%s/.iakeys' % (os.path.expanduser('~'))
if os.path.exists(iakeysfilename):
    accesskey = open(iakeysfilename, 'r').readlines()[0].strip()
    secretkey = open(iakeysfilename, 'r').readlines()[1].strip()
else:
    print('Error, no %s file with S3 keys for Internet Archive account' % (iakeysfilename))
    sys.exit()
"""

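# Hypothetical ~/.iakeys layout assumed by the commented-out block above
# (two lines: access key first, secret key second; values are placeholders):
#   EXAMPLEACCESSKEY
#   EXAMPLESECRETKEY
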
def saveURL(wikidomain='', url='', filename='', path='', overwrite=False, iteration=1):
    filename2 = '%s/%s' % (wikidomain, filename)
    if path:
        filename2 = '%s/%s/%s' % (wikidomain, path, filename)
    if os.path.exists(filename2):
        if not overwrite:
            print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
            return
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlretrieve(url, filename2)
    except:
        sleep = 10  # seconds
        maxsleep = 30
        while sleep <= maxsleep:
            try:
                print('Error while retrieving: %s' % (url))
                print('Retry in %s seconds...' % (sleep))
                time.sleep(sleep)
                urllib.request.urlretrieve(url, filename2)
                return
            except:
                sleep = sleep * 2
        print('Download failed')

    # Sometimes wikispaces returns invalid data; re-download in those cases.
    # Only for 'pages'; 'files' binaries are a pain to open and check.
    if (os.path.exists(filename2) and 'pages' in path) or \
       (os.path.exists(filename2) and path == '' and filename2.split('.')[-1] in ['xml', 'html', 'csv']):
        sleep2 = 60 * iteration
        raw = ''
        try:
            with open(filename2, 'r', encoding='utf-8') as f:
                raw = f.read()
        except:
            with open(filename2, 'r', encoding='latin-1') as f:
                raw = f.read()
        if re.findall(r'(?im)<title>TES and THE Status</title>', raw):
            print('Warning: invalid content. Waiting %d seconds and re-downloading' % (sleep2))
            time.sleep(sleep2)
            saveURL(wikidomain=wikidomain, url=url, filename=filename, path=path, overwrite=overwrite, iteration=iteration+1)

def undoHTMLEntities(text=''):
    """ Undo some HTML codes """

    # i guess only < > & " ' need conversion
    # http://www.w3schools.com/html/html_entities.asp
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    text = re.sub('&#039;', '\'', text)

    return text

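# For example, undoHTMLEntities(text='&lt;b&gt;&quot;hi&quot;&lt;/b&gt;') returns
# '<b>"hi"</b>' (hypothetical input, just to illustrate the substitutions above).
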
def convertHTML2Wikitext(wikidomain='', filename='', path=''):
    wikitext = ''
    wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
    if not os.path.exists(wikitextfile):
        print('Error retrieving wikitext, the page is probably a redirect')
        return
    with open(wikitextfile, 'r') as f:
        wikitext = f.read()
    with open(wikitextfile, 'w') as f:
        m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
        if m:
            try:
                wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
                wikitext = undoHTMLEntities(text=wikitext)
            except:
                pass
        f.write(wikitext)

def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
    pagenameplus = re.sub(' ', '+', pagename)
    pagename_ = urllib.parse.quote(pagename)

    # Page current revision (html & wikitext)
    pageurl = '%s/%s' % (wikiurl, pagename_)
    filename = '%s.html' % (pagenameplus)
    print('Downloading page: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
    pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
    filename2 = '%s.wikitext' % (pagenameplus)
    print('Downloading page: %s' % (filename2))
    saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
    convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')

    # CSV with page history
    csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
    csvfilename = '%s.history.csv' % (pagenameplus)
    print('Downloading page: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)

def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
    filenameplus = re.sub(' ', '+', filename)
    filename_ = urllib.parse.quote(filename)

    # File at full resolution
    fileurl = '%s/file/view/%s' % (wikiurl, filename_)
    filename = filenameplus
    print('Downloading file: %s' % (filename))
    saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)

    # CSV with file history
    csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
    csvfilename = '%s.history.csv' % (filenameplus)
    print('Downloading file: %s' % (csvfilename))
    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)

def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading Pages and Files from %s' % (wikiurl))
    # CSV with all pages and files
    csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
    saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
    # Download every page and file
    totallines = 0
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
        totallines = len(f.read().splitlines()) - 1
    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
        filesc = 0
        pagesc = 0
        print('This wiki has %d pages and files' % (totallines))
        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in rows:
            if row[0] == 'file':
                filesc += 1
                filename = row[1]
                downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
            elif row[0] == 'page':
                pagesc += 1
                pagename = row[1]
                downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
            if (filesc + pagesc) % 10 == 0:
                print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
        print(' Progress: %d of %d' % ((filesc + pagesc), totallines))
    print('Downloaded %d pages' % (pagesc))
    print('Downloaded %d files' % (filesc))

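# For reference, the pages-and-files.csv rows consumed above look roughly like
# this (hypothetical titles; only the first two columns are read here):
#   page,home
#   file,logo.png
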
def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading sitemap.xml')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)

def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
    print('Downloading index.html')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)

def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
    index = '%s/index.html' % (wikidomain)
    if os.path.exists(index):
        raw = ''
        try:
            with open(index, 'r', encoding='utf-8') as f:
                raw = f.read()
        except:
            with open(index, 'r', encoding='latin-1') as f:
                raw = f.read()
        m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', raw)
        if m:
            logourl = m[0]
            logofilename = logourl.split('/')[-1]
            print('Downloading logo')
            saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
            return logofilename
    return ''

def printhelp():
    helptext = """This script downloads (and uploads) WikiSpaces wikis.

Parameters available:

--upload: upload compressed file with downloaded wiki
--admin: add item to WikiTeam collection (if you are an admin in that collection)
--overwrite: download again even if files exist locally
--overwrite-ia: upload again to Internet Archive even if item exists there
--help: prints this help text

Examples:

python3 wikispaces.py https://mywiki.wikispaces.com
   It downloads that wiki

python3 wikispaces.py wikis.txt
   It downloads a list of wikis (file format is a URL per line)

python3 wikispaces.py https://mywiki.wikispaces.com --upload
   It downloads that wiki, compresses it and uploads it to Internet Archive
"""
    print(helptext)
    sys.exit()

def duckduckgo():
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)

    wikis = []
    ignorewikis = [
        'https://wikispaces.com',
        'https://www.wikispaces.com',
        'https://wikispaces.net',
        'https://www.wikispaces.net',
    ]
    for i in range(1, 100000):
        url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikispaces.com' % (random.randint(100, 5000), random.randint(1000, 9999))
        print('URL search', url)
        try:
            html = urllib.request.urlopen(url).read().decode('utf-8')
        except:
            print('Search error')
            time.sleep(30)
            continue
        html = urllib.parse.unquote(html)
        m = re.findall(r'://([^/]+?\.wikispaces\.com)', html)
        for wiki in m:
            wiki = 'https://' + wiki
            wiki = re.sub(r'https://www\.', 'https://', wiki)
            if wiki not in wikis and wiki not in ignorewikis:
                wikis.append(wiki)
                yield wiki
        sleep = random.randint(5, 20)
        print('Sleeping %d seconds' % (sleep))
        time.sleep(sleep)

def main():
    upload = False
    isadmin = False
    overwrite = False
    overwriteia = False

    if len(sys.argv) < 2:
        printhelp()
    param = sys.argv[1]
    if not param:
        printhelp()
    if len(sys.argv) > 2:
        if '--upload' in sys.argv:
            upload = True
        if '--admin' in sys.argv:
            isadmin = True
        if '--overwrite' in sys.argv:
            overwrite = True
        if '--overwrite-ia' in sys.argv:
            overwriteia = True
        if '--help' in sys.argv:
            printhelp()

    wikilist = []
    if '://' in param:
        wikilist.append(param.rstrip('/'))
    elif param.lower() == 'duckduckgo':
        wikilist = duckduckgo()
        #for wiki in wikilist:
        #    print(wiki)
    else:
        with open(param, 'r') as f:
            wikilist = f.read().strip().splitlines()
            wikilist2 = []
            for wiki in wikilist:
                wikilist2.append(wiki.rstrip('/'))
            wikilist = wikilist2

    for wikiurl in wikilist:
        wikidomain = wikiurl.split('://')[1].split('/')[0]
        print('\n')
        print('#'*40, '\n Downloading:', wikiurl)
        print('#'*40, '\n')

        if upload and not overwriteia:
            itemid = 'wiki-%s' % (wikidomain)
            try:
                iahtml = ''
                try:
                    iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
                except:
                    time.sleep(10)
                    iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
                if iahtml and not re.findall(r'(?im)Item cannot be found', iahtml):
                    if not overwriteia:
                        print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
                        print('You can find it in https://archive.org/details/%s' % (itemid))
                        time.sleep(1)
                        continue
            except:
                pass

        dirfiles = '%s/files' % (wikidomain)
        if not os.path.exists(dirfiles):
            print('Creating directory %s' % (dirfiles))
            os.makedirs(dirfiles)
        dirpages = '%s/pages' % (wikidomain)
        if not os.path.exists(dirpages):
            print('Creating directory %s' % (dirpages))
            os.makedirs(dirpages)
        sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)

        downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
        if not os.path.exists('%s/sitemap.xml' % (wikidomain)):
            print('Error, wiki was probably deleted. Skipping wiki...')
            continue
        else:
            sitemapraw = ''
            try:
                with open('%s/sitemap.xml' % (wikidomain), encoding='utf-8') as g:
                    sitemapraw = g.read()
            except:
                with open('%s/sitemap.xml' % (wikidomain), encoding='latin-1') as g:
                    sitemapraw = g.read()
            if re.search(r'(?im)<h1>This wiki has been deactivated</h1>', sitemapraw):
                print('Error, wiki was deactivated. Skipping wiki...')
                continue

        downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
        if not os.path.exists('%s/index.html' % (wikidomain)):
            print('Error, wiki was probably deleted or expired. Skipping wiki...')
            continue
        else:
            indexraw = ''
            try:
                with open('%s/index.html' % (wikidomain), encoding='utf-8') as g:
                    indexraw = g.read()
            except:
                with open('%s/index.html' % (wikidomain), encoding='latin-1') as g:
                    indexraw = g.read()
            if re.search(r'(?im)<h1>Subscription Expired</h1>', indexraw):
                print('Error, wiki subscription expired. Skipping wiki...')
                continue

        downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
        logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)

        if upload:
            itemid = 'wiki-%s' % (wikidomain)
            print('\nCompressing dump...')
            wikidir = wikidomain
            os.chdir(wikidir)
            print('Changed directory to', os.getcwd())
            wikizip = '%s.zip' % (wikidomain)
            subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
            os.chdir('..')
            print('Changed directory to', os.getcwd())

            print('\nUploading to Internet Archive...')
            indexfilename = '%s/index.html' % (wikidir)
            if not os.path.exists(indexfilename):
                print('\nError dump incomplete, skipping upload\n')
                continue
            indexhtml = ''
            try:
                with open(indexfilename, 'r', encoding='utf-8') as f:
                    indexhtml = f.read()
            except:
                with open(indexfilename, 'r', encoding='latin-1') as f:
                    indexhtml = f.read()

            wikititle = ''
            try:
                wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
            except:
                wikititle = wikidomain
            if not wikititle:
                wikititle = wikidomain
            wikititle = wikititle.replace("\\'", " ")
            wikititle = wikititle.replace('\\"', " ")
            itemtitle = 'Wiki - %s' % wikititle
            itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
            itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
            itemoriginalurl = wikiurl
            itemlicenseurl = ''
            m = ''
            try:
                m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
            except:
                m = ''
            if m:
                itemlicenseurl = m[0]
            if not itemlicenseurl:
                itemtags.append('unknowncopyright')
            itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
            itemcollection = isadmin and 'wikiteam' or 'opensource'
            itemlang = 'Unknown'
            itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
            itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''
            callplain = "ia upload %s %s %s --metadata='mediatype:web' --metadata='collection:%s' --metadata='title:%s' --metadata='description:%s' --metadata='language:%s' --metadata='last-updated-date:%s' --metadata='originalurl:%s' %s %s" % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemoriginalurl, itemlicenseurl and "--metadata='licenseurl:%s'" % (itemlicenseurl) or '', itemtags_)
            print(callplain)
            subprocess.call(callplain, shell=True)

            """
            md = {
                'mediatype': 'web',
                'collection': itemcollection,
                'title': itemtitle,
                'description': itemdesc,
                'language': itemlang,
                'last-updated-date': itemdate,
                'subject': '; '.join(itemtags),
                'licenseurl': itemlicenseurl,
                'originalurl': itemoriginalurl,
            }
            item = get_item(itemid)
            item.upload(wikizip, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
            item.modify_metadata(md)
            if itemlogo:
                item.upload(itemlogo, access_key=accesskey, secret_key=secretkey, verbose=True)
            """

            print('You can find it in https://archive.org/details/%s' % (itemid))
            os.remove(wikizip)


if __name__ == "__main__":
    main()