mirror of
https://github.com/WikiTeam/wikiteam
synced 2024-11-12 07:12:41 +00:00
uploader included
This commit is contained in:
parent
254486af06
commit
2fe1c0b6b2
@ -19,12 +19,18 @@
|
|||||||
# Documentation for developers: http://wikiteam.readthedocs.com
|
# Documentation for developers: http://wikiteam.readthedocs.com
|
||||||
|
|
||||||
import csv
|
import csv
|
||||||
|
import datetime
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
|
||||||
|
# Requirements:
|
||||||
|
# zip command (apt-get install zip)
|
||||||
|
# ia command (pip install internetarchive)
|
||||||
|
|
||||||
def saveURL(wikidomain='', url='', filename='', path=''):
|
def saveURL(wikidomain='', url='', filename='', path=''):
|
||||||
filename2 = '%s/%s' % (wikidomain, filename)
|
filename2 = '%s/%s' % (wikidomain, filename)
|
||||||
if path:
|
if path:
|
||||||
@ -142,12 +148,29 @@ def downloadPagesAndFiles(wikidomain='', wikiurl=''):
|
|||||||
print('Downloaded %d files' % (filesc))
|
print('Downloaded %d files' % (filesc))
|
||||||
|
|
||||||
def downloadSitemap(wikidomain='', wikiurl=''):
    """Save the wiki's sitemap as ``sitemap.xml`` via saveURL.

    Args:
        wikidomain: Passed through to saveURL as ``wikidomain``.
        wikiurl: URL of the sitemap to fetch; passed to saveURL as ``url``.
    """
    print('Downloading sitemap.xml')
    request = {
        'wikidomain': wikidomain,
        'url': wikiurl,
        'filename': 'sitemap.xml',
        'path': '',
    }
    saveURL(**request)
|
||||||
|
|
||||||
def downloadMainPage(wikidomain='', wikiurl=''):
    """Save the wiki's main page as ``index.html`` via saveURL.

    Args:
        wikidomain: Passed through to saveURL as ``wikidomain``.
        wikiurl: URL of the main page to fetch; passed to saveURL as ``url``.
    """
    print('Downloading index.html')
    request = {
        'wikidomain': wikidomain,
        'url': wikiurl,
        'filename': 'index.html',
        'path': '',
    }
    saveURL(**request)
|
||||||
|
|
||||||
|
def downloadLogo(wikidomain='', wikiurl=''):
    """Download the wiki logo referenced by the already-saved main page.

    Reads ``<wikidomain>/index.html``, takes the first ``<img src>`` that
    directly follows the ``WikiLogo WikiElement`` class attribute, and hands
    that URL to saveURL.

    Args:
        wikidomain: Directory holding ``index.html``; also passed to saveURL.
        wikiurl: Not used here; kept so the signature matches the other
            download helpers.

    Returns:
        The logo file name (last ``/``-separated segment of its URL), or
        ``''`` when there is no index.html, no logo markup, or the URL has
        no file name component.
    """
    index = '%s/index.html' % (wikidomain)
    if not os.path.exists(index):
        return ''

    # The saved page is not guaranteed to match the locale's default
    # encoding; decode leniently so a stray byte cannot abort the dump.
    # Only ASCII markup is searched for, so replacement chars are harmless.
    with open(index, 'r', encoding='utf-8', errors='replace') as f:
        m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', f.read())
    if not m:
        return ''

    logourl = m[0]
    logofilename = logourl.split('/')[-1]
    if not logofilename:
        # URL ends in '/': there is no file name to save under, and callers
        # treat an empty return value as "no logo".
        return ''

    print('Downloading logo')
    saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='')
    return logofilename
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
upload = False
|
||||||
|
isadmin = False
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
print('Please, introduce a wikispaces wiki url or filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
|
print('Please, introduce a wikispaces wiki url or filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
@ -155,6 +178,11 @@ def main():
|
|||||||
if not param:
|
if not param:
|
||||||
print('Please, introduce a wikispaces wiki url or filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
|
print('Please, introduce a wikispaces wiki url or filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
if len(sys.argv) > 2:
|
||||||
|
if '--upload' in sys.argv:
|
||||||
|
upload = True
|
||||||
|
if '--admin' in sys.argv:
|
||||||
|
isadmin = True
|
||||||
|
|
||||||
wikilist = []
|
wikilist = []
|
||||||
if '://' in param:
|
if '://' in param:
|
||||||
@ -169,17 +197,65 @@ def main():
|
|||||||
|
|
||||||
for wikiurl in wikilist:
|
for wikiurl in wikilist:
|
||||||
wikidomain = wikiurl.split('://')[1].split('/')[0]
|
wikidomain = wikiurl.split('://')[1].split('/')[0]
|
||||||
print('#'*40,'\n Analyzing:', wikiurl)
|
print('#'*40,'\n Downloading:', wikiurl)
|
||||||
print('#'*40,'\n')
|
print('#'*40,'\n')
|
||||||
print('Creating directories for %s' % (wikidomain))
|
dirfiles = '%s/files' % (wikidomain)
|
||||||
if not os.path.exists('%s/files' % (wikidomain)):
|
if not os.path.exists(dirfiles):
|
||||||
os.makedirs('%s/files' % (wikidomain))
|
print('Creating directory %s' % (dirfiles))
|
||||||
if not os.path.exists('%s/pages' % (wikidomain)):
|
os.makedirs(dirfiles)
|
||||||
os.makedirs('%s/pages' % (wikidomain))
|
dirpages = '%s/pages' % (wikidomain)
|
||||||
|
if not os.path.exists(dirpages):
|
||||||
|
print('Creating directory %s' % (dirpages))
|
||||||
|
os.makedirs(dirpages)
|
||||||
downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl)
|
downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl)
|
||||||
sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
|
sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
|
||||||
downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl)
|
downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl)
|
||||||
downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl)
|
downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl)
|
||||||
|
logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl)
|
||||||
|
|
||||||
|
if upload:
|
||||||
|
print('\nCompressing dump...')
|
||||||
|
wikidir = wikidomain
|
||||||
|
os.chdir(wikidir)
|
||||||
|
print('Changed directory to', os.getcwd())
|
||||||
|
wikizip = '%s.zip' % (wikidomain)
|
||||||
|
subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
|
||||||
|
os.chdir('..')
|
||||||
|
print('Changed directory to', os.getcwd())
|
||||||
|
|
||||||
|
print('\nUploading to Internet Archive...')
|
||||||
|
indexfilename = '%s/index.html' % (wikidir)
|
||||||
|
if not os.path.exists(indexfilename):
|
||||||
|
print('\nError dump incomplete, skipping upload\n')
|
||||||
|
continue
|
||||||
|
f = open(indexfilename, 'r')
|
||||||
|
indexhtml = f.read()
|
||||||
|
f.close()
|
||||||
|
itemid = 'wiki-%s' % (wikidomain)
|
||||||
|
wikititle = ''
|
||||||
|
try:
|
||||||
|
wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
|
||||||
|
except:
|
||||||
|
wikititle = wikidomain
|
||||||
|
if not wikititle:
|
||||||
|
wikititle = wikidomain
|
||||||
|
itemtitle = 'Wiki - %s' % wikititle
|
||||||
|
itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
|
||||||
|
itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
|
||||||
|
itemoriginalurl = wikiurl
|
||||||
|
itemlicenseurl = ''
|
||||||
|
m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
|
||||||
|
if m:
|
||||||
|
itemlicenseurl = m[0]
|
||||||
|
if not itemlicenseurl:
|
||||||
|
itemtags.append('unknowncopyright')
|
||||||
|
itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
|
||||||
|
itemcollection = isadmin and 'wikiteam' or 'opensource'
|
||||||
|
itemlang = 'Unknown'
|
||||||
|
itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
|
||||||
|
itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''
|
||||||
|
subprocess.call('ia' + ' upload %s %s %s --metadata="mediatype:web" --metadata="collection:%s" --metadata="title:%s" --metadata="description:%s" --metadata="language:%s" --metadata="last-updated-date:%s" %s %s' % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemlicenseurl and '--metadata="licenseurl:%s"' % (itemlicenseurl) or '', itemtags_), shell=True)
|
||||||
|
print('You can find it in https://archive.org/details/%s' % (itemid))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
Loading…
Reference in New Issue
Block a user