Merge branch 'master' of https://github.com/WikiTeam/wikiteam

2024-11-12 07:12:41 +00:00 · 2018-05-07 19:54:42 +00:00 · 2018-05-07 19:54:42 +00:00 · ebc02a3b45
commit ebc02a3b45
parent a8cbb357ff 60a0ba2e54
11 changed files with 363357 additions and 254 deletions
--- a/listsofwikis/mediawiki/mediawikis_2018-alive.txt
+++ b/listsofwikis/mediawiki/mediawikis_2018-alive.txt
--- a/listsofwikis/mediawiki/wikia.py
+++ b/listsofwikis/mediawiki/wikia.py
@ -55,16 +55,16 @@ def main():
        print dbname
        first = dbname[0]
        # There are one-letter dbnames; the second letter is replaced by an underscore
-        # http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.gz
+        # http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.7z
        try:
            second = dbname[1]
        except:
            second = '_'
        base = 'http://s3.amazonaws.com/wikia_xml_dumps/' + first + '/' \
            + first + second + '/' + dbname
-        full = base + '_pages_full.xml.gz'
+        full = base + '_pages_full.xml.7z'
        print full
-        current = base + '_pages_current.xml.gz'
+        current = base + '_pages_current.xml.7z'
        images = base + '_images.tar'
        try:
            #subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
--- a/listsofwikis/wikispaces/users.txt
+++ b/listsofwikis/wikispaces/users.txt
--- a/listsofwikis/wikispaces/wikis.txt
+++ b/listsofwikis/wikispaces/wikis.txt
--- a/listsofwikis/wikispaces/wikispaces-spider.py
+++ b/listsofwikis/wikispaces/wikispaces-spider.py
@ -88,6 +88,7 @@ def getWikis(user):
        return {}

 def main():
+    sleep = 0.2
    users = loadUsers()
    wikis = loadWikis()
    
@ -115,7 +116,7 @@ def main():
            saveUsers(users)
            users = loadUsers()
        saveWikis(wikis)
-        time.sleep(1)
+        time.sleep(sleep)
    wikis = loadWikis()
    
    # find more wikis
@ -136,7 +137,7 @@ def main():
            saveWikis(wikis)
            wikis = loadWikis()
        saveUsers(users)
-        time.sleep(1)
+        time.sleep(sleep)
    users = loadUsers()
    
    print '\nSummary:'
--- a/listsofwikis/wikispaces/wikispaces00
+++ b/listsofwikis/wikispaces/wikispaces00
--- a/listsofwikis/wikispaces/wikispaces01
+++ b/listsofwikis/wikispaces/wikispaces01
--- a/listsofwikis/wikispaces/wikispaces02
+++ b/listsofwikis/wikispaces/wikispaces02
--- a/listsofwikis/wikispaces/wikispaces03
+++ b/listsofwikis/wikispaces/wikispaces03
--- a/listsofwikis/wikispaces/wikispaces04
+++ b/listsofwikis/wikispaces/wikispaces04
--- a/wikispaces.py
+++ b/wikispaces.py
@ -0,0 +1,314 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2018 WikiTeam developers
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
+# Documentation for developers: http://wikiteam.readthedocs.com
+
+import csv
+import datetime
+import os
+import re
+import subprocess
+import sys
+import time
+import urllib.request
+
+# Requirements:
+# zip command (apt-get install zip)
+# ia command (pip install internetarchive, and configured properly)
+
+def saveURL(wikidomain='', url='', filename='', path='', overwrite=False):
+    filename2 = '%s/%s' % (wikidomain, filename)
+    if path:
+        filename2 = '%s/%s/%s' % (wikidomain, path, filename)
+    if os.path.exists(filename2):
+        if not overwrite:
+            print('Warning: file exists on disk. Skipping download. Force download with parameter --overwrite')
+            return
+    opener = urllib.request.build_opener()
+    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
+    urllib.request.install_opener(opener)
+    try:
+        urllib.request.urlretrieve(url, filename2)
+    except:
+        sleep = 10 # seconds
+        maxsleep = 30
+        while sleep <= maxsleep:
+            try:
+                print('Error while retrieving: %s' % (url))
+                print('Retry in %s seconds...' % (sleep))
+                time.sleep(sleep)
+                urllib.request.urlretrieve(url, filename2)
+                return
+            except:
+                sleep = sleep * 2
+        print('Download failed')
+
+def undoHTMLEntities(text=''):
+    """ Undo some HTML codes """
+
+    # i guess only < > & " ' need conversion
+    # http://www.w3schools.com/html/html_entities.asp
+    text = re.sub('&lt;', '<', text)
+    text = re.sub('&gt;', '>', text)
+    text = re.sub('&amp;', '&', text)
+    text = re.sub('&quot;', '"', text)
+    text = re.sub('&#039;', '\'', text)
+
+    return text
+
+def convertHTML2Wikitext(wikidomain='', filename='', path=''):
+    wikitext = ''
+    wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
+    if not os.path.exists(wikitextfile):
+        print('Error retrieving wikitext, page is a redirect probably')
+        return
+    with open(wikitextfile, 'r') as f:
+        wikitext = f.read()
+    with open(wikitextfile, 'w') as f:
+        m = re.findall(r'(?im)<div class="WikispacesContent WikispacesBs3">\s*<pre>', wikitext)
+        if m:
+            try:
+                wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
+                wikitext = undoHTMLEntities(text=wikitext)
+            except:
+                pass
+        f.write(wikitext)
+
+def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
+    pagenameplus = re.sub(' ', '+', pagename)
+    pagename_ = urllib.parse.quote(pagename)
+    
+    #page current revision (html & wikitext)
+    pageurl = '%s/%s' % (wikiurl, pagename_)
+    filename = '%s.html' % (pagenameplus)
+    print('Downloading page: %s' % (filename))
+    saveURL(wikidomain=wikidomain, url=pageurl, filename=filename, path='pages', overwrite=overwrite)
+    pageurl2 = '%s/page/code/%s' % (wikiurl, pagename_)
+    filename2 = '%s.wikitext' % (pagenameplus)
+    print('Downloading page: %s' % (filename2))
+    saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
+    convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')
+    
+    #csv with page history
+    csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
+    csvfilename = '%s.history.csv' % (pagenameplus)
+    print('Downloading page: %s' % (csvfilename))
+    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='pages', overwrite=overwrite)
+
+def downloadFile(wikidomain='', wikiurl='', filename='', overwrite=False):
+    filenameplus = re.sub(' ', '+', filename)
+    filename_ = urllib.parse.quote(filename)
+    
+    #file full resolution
+    fileurl = '%s/file/view/%s' % (wikiurl, filename_)
+    filename = filenameplus
+    print('Downloading file: %s' % (filename))
+    saveURL(wikidomain=wikidomain, url=fileurl, filename=filename, path='files', overwrite=overwrite)
+    
+    #csv with file history
+    csvurl = '%s/file/detail/%s?utable=WikiTablePageList&ut_csv=1' % (wikiurl, filename_)
+    csvfilename = '%s.history.csv' % (filenameplus)
+    print('Downloading file: %s' % (csvfilename))
+    saveURL(wikidomain=wikidomain, url=csvurl, filename=csvfilename, path='files', overwrite=overwrite)
+
+def downloadPagesAndFiles(wikidomain='', wikiurl='', overwrite=False):
+    print('Downloading Pages and Files from %s' % (wikiurl))
+    #csv all pages and files
+    csvurl = '%s/space/content?utable=WikiTablePageList&ut_csv=1' % (wikiurl)
+    saveURL(wikidomain=wikidomain, url=csvurl, filename='pages-and-files.csv', path='')
+    #download every page and file
+    totallines = 0
+    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as f:
+        totallines = len(f.read().splitlines()) - 1
+    with open('%s/pages-and-files.csv' % (wikidomain), 'r') as csvfile:
+        filesc = 0
+        pagesc = 0
+        print('This wiki has %d pages and files' % (totallines))
+        rows = csv.reader(csvfile, delimiter=',', quotechar='"')
+        for row in rows:
+            if row[0] == 'file':
+                filesc += 1
+                filename = row[1]
+                downloadFile(wikidomain=wikidomain, wikiurl=wikiurl, filename=filename, overwrite=overwrite)
+            elif row[0] == 'page':
+                pagesc += 1
+                pagename = row[1]
+                downloadPage(wikidomain=wikidomain, wikiurl=wikiurl, pagename=pagename, overwrite=overwrite)
+            if (filesc + pagesc) % 10 == 0:
+                print('  Progress: %d of %d' % ((filesc + pagesc), totallines))
+        print('  Progress: %d of %d' % ((filesc + pagesc), totallines))
+    print('Downloaded %d pages' % (pagesc))
+    print('Downloaded %d files' % (filesc))
+
+def downloadSitemap(wikidomain='', wikiurl='', overwrite=False):
+    print('Downloading sitemap.xml')
+    saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='', overwrite=overwrite)
+
+def downloadMainPage(wikidomain='', wikiurl='', overwrite=False):
+    print('Downloading index.html')
+    saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='', overwrite=overwrite)
+
+def downloadLogo(wikidomain='', wikiurl='', overwrite=False):
+    index = '%s/index.html' % (wikidomain)
+    if os.path.exists(index):
+        with open(index, 'r') as f:
+            m = re.findall(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', f.read())
+            if m:
+                logourl = m[0]
+                logofilename = logourl.split('/')[-1]
+                print('Downloading logo')
+                saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='', overwrite=overwrite)
+                return logofilename
+    return ''
+
+def printhelp():
+    helptext = """This script downloads (and uploads) WikiSpaces wikis.
+
+Parameters available:
+
+--upload: upload compressed file with downloaded wiki
+--admin: add item to WikiTeam collection (if you are an admin in that collection)
+--overwrite: download again even if files exists locally
+--overwrite-ia: upload again to Internet Archive even if item exists there
+--help: prints this help text
+
+Examples:
+
+python3 wikispaces.py https://mywiki.wikispaces.com
+   It downloads that wiki
+
+python3 wikispaces.py wikis.txt
+   It downloads a list of wikis (file format is a URL per line)
+
+python3 wikispaces.py https://mywiki.wikispaces.com --upload
+   It downloads that wiki, compress it and uploading to Internet Archive
+"""
+    print(helptext)
+    sys.exit()
+
+def main():
+    upload = False
+    isadmin = False
+    overwrite = False
+    overwriteia = False
+    if len(sys.argv) < 2:
+        printhelp()
+    param = sys.argv[1]
+    if not param:
+        printhelp()
+    if len(sys.argv) > 2:
+        if '--upload' in sys.argv:
+            upload = True
+        if '--admin' in sys.argv:
+            isadmin = True
+        if '--overwrite' in sys.argv:
+            overwrite = True
+        if '--overwrite-ia' in sys.argv:
+            overwriteia = True
+        if '--help' in sys.argv:
+            printhelp()
+    
+    wikilist = []
+    if '://' in param:
+        wikilist.append(param.rstrip('/'))
+    else:
+        with open(param, 'r') as f:
+            wikilist = f.read().strip().splitlines()
+            wikilist2 = []
+            for wiki in wikilist:
+                wikilist2.append(wiki.rstrip('/'))
+            wikilist = wikilist2
+    
+    for wikiurl in wikilist:
+        wikidomain = wikiurl.split('://')[1].split('/')[0]
+        print('\n')
+        print('#'*40,'\n Downloading:', wikiurl)
+        print('#'*40,'\n')
+        
+        if upload and not overwriteia:
+            itemid = 'wiki-%s' % (wikidomain)
+            try:
+                iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
+                if not re.findall(r'Item cannot be found', iahtml):
+                    if not overwriteia:
+                        print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
+                        print('You can find it in https://archive.org/details/%s' % (itemid))
+                        continue
+            except:
+                pass
+        
+        dirfiles = '%s/files' % (wikidomain)
+        if not os.path.exists(dirfiles):
+            print('Creating directory %s' % (dirfiles))
+            os.makedirs(dirfiles)
+        dirpages = '%s/pages' % (wikidomain)
+        if not os.path.exists(dirpages):
+            print('Creating directory %s' % (dirpages))
+            os.makedirs(dirpages)
+        downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
+        sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
+        downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl, overwrite=overwrite)
+        downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
+        logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl, overwrite=overwrite)
+        
+        if upload:
+            itemid = 'wiki-%s' % (wikidomain)
+            print('\nCompressing dump...')
+            wikidir = wikidomain
+            os.chdir(wikidir)
+            print('Changed directory to', os.getcwd())
+            wikizip = '%s.zip' % (wikidomain)
+            subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
+            os.chdir('..')
+            print('Changed directory to', os.getcwd())
+            
+            print('\nUploading to Internet Archive...')
+            indexfilename = '%s/index.html' % (wikidir)
+            if not os.path.exists(indexfilename):
+                print('\nError dump incomplete, skipping upload\n')
+                continue
+            f = open(indexfilename, 'r')
+            indexhtml = f.read()
+            f.close()
+            wikititle = ''
+            try:
+                wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
+            except:
+                wikititle = wikidomain
+            if not wikititle:
+                wikititle = wikidomain
+            itemtitle = 'Wiki - %s' % wikititle
+            itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
+            itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
+            itemoriginalurl = wikiurl
+            itemlicenseurl = ''
+            m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
+            if m:
+                itemlicenseurl = m[0]
+            if not itemlicenseurl:
+                itemtags.append('unknowncopyright')
+            itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
+            itemcollection = isadmin and 'wikiteam' or 'opensource'
+            itemlang = 'Unknown'
+            itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
+            itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''                
+            subprocess.call('ia' + ' upload %s %s %s --metadata="mediatype:web" --metadata="collection:%s" --metadata="title:%s" --metadata="description:%s" --metadata="language:%s" --metadata="last-updated-date:%s" --metadata="originalurl:%s" %s %s' % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemoriginalurl, itemlicenseurl and '--metadata="licenseurl:%s"' % (itemlicenseurl) or '', itemtags_), shell=True)
+            print('You can find it in https://archive.org/details/%s' % (itemid))
+
+if __name__ == "__main__":
+    main()