#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# dumpgenerator.py A generator of dumps for wikis
# Copyright (C) 2011-2014 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    print 'Need BeautifulSoup for current version. In the future it should use regex for scraping.'

import HTMLParser
import urlparse
import requests
import os
import socket
import re
from datetime import datetime
import gzip
import time


def getTitles(url, ns=None):
    """Get titles given a doku.php URL and an (optional) namespace"""
    titles = []
    ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
    params = {'call': 'index'}
    if ns:
        params['idx'] = ns
    else:
        print 'Finding titles'
    ns = ns or ''
    depth = len(ns.split(':'))
    if ns:
        print '%sLooking in namespace %s' % (' ' * depth, ns)
    r = requests.post(ajax, params)
    if r.status_code != 200 or "AJAX call 'index' unknown!" in r.text:
        return getTitlesOld(url, ns=None)
    soup = BeautifulSoup(r.text)
    for a in soup.findAll('a', href=True):
        if a.has_key('title'):
            title = a['title']
        else:
            query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
            title = (query['idx' if 'idx' in query else 'id'])[0]
        if a['class'] == 'idx_dir':
            titles += getTitles(url, title)
        else:
            titles.append(title)
    time.sleep(1.5)
    print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')
    return titles
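
# Hypothetical example: getTitles('http://wiki.example.org/doku.php') would
# return a flat list of page ids such as ['start', 'wiki:syntax'], recursing
# into every namespace ('idx_dir') link found in the AJAX index.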


def getTitlesOld(url, ns=None, ancient=False):
    """Get titles using the doku.php?do=index"""

    titles = []
    params = {'do': 'index'}

    if ns:
        params['idx'] = ns
    ns = ns or ''
    depth = len(ns.split(':'))

    r = requests.get(url, params=params)
    soup = BeautifulSoup(r.text).findAll('ul', {'class': 'idx'})[0]
    attr = 'text' if ancient else 'title'

    if ns:
        print '%sSearching in namespace %s' % (' ' * depth, ns)

        def match(href):
            if not href:
                return False
            qs = urlparse.urlparse(href).query
            qs = urlparse.parse_qs(qs)
            return 'idx' in qs and qs['idx'][0] in (ns, ':' + ns)
        result = soup.findAll(
            'a', {
                'class': 'idx_dir', 'href': match})[0].findAllPrevious('li')[0].findAll(
            'a', {
                'href': lambda x: x and not match(x)})
    else:
        print 'Finding titles (?do=index)'
        result = soup.findAll('a')

    for a in result:
        query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        if a['class'] == 'idx_dir':
            titles += getTitlesOld(url, query['idx'][0])
        else:
            titles.append(query['id'][0])

    print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')

    return titles
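
# getTitlesOld() is only reached from getTitles() when the wiki rejects the
# AJAX 'index' call (very old DokuWiki releases); it scrapes doku.php?do=index
# instead.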


def getSourceExport(url, title, rev=''):
    """Export the raw source of a page (at a given revision)"""

    r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'export_raw'})
    return r.text


def getSourceEdit(url, title, rev=''):
    """Export the raw source of a page by scraping the edit box content. Yuck."""

    r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'edit'})
    soup = BeautifulSoup(r.text)
    return ''.join(soup.find('textarea', {'name': 'wikitext'}).contents).strip()
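
# dumpContent() below prefers getSourceExport(); it falls back to
# getSourceEdit() when do=export_raw answers with an HTML content type
# instead of plain wikitext.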


def domain2prefix(url):
    """ Convert domain name to a valid prefix filename. """

    domain = url

    domain = domain.lower()
    domain = re.sub(r'(https?://|www\.|/doku\.php)', '', domain)
    domain = re.sub(r'/', '_', domain)
    domain = re.sub(r'\.', '', domain)
    domain = re.sub(r'[^A-Za-z0-9]', '_', domain)

    return domain
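
# For example, with a hypothetical URL, domain2prefix('https://wiki.example.org/doku.php')
# returns 'wikiexampleorg', which dump() uses as the dump directory name.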


def getRevisions(url, title, use_hidden_rev=False, select_revs=False):
    """ Get the revisions of a page. This is nontrivial because different versions of DokuWiki return completely different revision HTML."""

    revs = []
    h = HTMLParser.HTMLParser()
    if select_revs:
        r = requests.get(url, params={'id': title, 'do': 'diff'})
        soup = BeautifulSoup(r.text)
        select = soup.find(
            'select', {
                'class': 'quickselect', 'name': 'rev2[1]'})
        for option in select.findAll('option'):
            text = option.text
            date = ' '.join(text.split(' ')[:2])
            username = len(text.split(' ')) > 2 and text.split(' ')[2]
            summary = ' '.join(text.split(' ')[3:])

            revs.append({'id': option['value'],
                         'user': username,
                         'sum': summary,
                         'date': date})

    i = 0
    continue_index = -1
    cont = True

    while cont:
        r = requests.get(
            url,
            params={
                'id': title,
                'do': 'revisions',
                'first': continue_index})

        soup = BeautifulSoup(r.text)
        lis = soup.findAll(
            'div', {
                'class': 'level1'})[0].findNext('ul').findAll('li')

        for li in lis:
            rev = {}
            rev_hrefs = li.findAll(
                'a', href=lambda href: href and (
                    '&rev=' in href or '?rev=' in href))
            rev['minor'] = ('class', 'minor') in li.attrs

            if rev_hrefs:
                rev['id'] = urlparse.parse_qs(
                    urlparse.urlparse(
                        rev_hrefs[0]['href']).query)['rev'][0]

            sum_span = li.findAll('span', {'class': 'sum'})
            if sum_span and not select_revs:
                sum_span = sum_span[0]
                sum_text = sum_span.text.split(' ')[1:]
                if sum_span.findAll('bdi'):
                    rev['sum'] = h.unescape(sum_span.find('bdi').text).strip()
                else:
                    rev['sum'] = h.unescape(' '.join(sum_text)).strip()
            elif not select_revs:
                print repr(li.text)
                wikilink1 = li.find('a', {'class': 'wikilink1'})
                text_node = wikilink1 and wikilink1.next and wikilink1.next.next or ''
                if text_node.strip():
                    rev['sum'] = h.unescape(text_node).strip(u'\u2013 \n')

            date_span = li.find('span', {'class': 'date'})
            if date_span:
                rev['date'] = date_span.text.strip()
            else:
                rev['date'] = ' '.join(li.text.split(' ')[:2])
                matches = re.findall(
                    r'([0-9./]+ [0-9]{1,2}:[0-9]{1,2})',
                    rev['date'])
                if matches:
                    rev['date'] = matches[0]

            if not (select_revs and len(revs) > i and revs[i]['user']):
                user_span = li.find('span', {'class': 'user'})
                if user_span:
                    rev['user'] = user_span.text

            if select_revs and len(revs) > i:
                revs[i].update(rev)
            else:
                revs.append(rev)
            i += 1

        first = soup.findAll('input', {'name': 'first', 'value': True})
        continue_index = first and max(map(lambda x: x['value'], first))
        cont = soup.find('input', {'class': 'button', 'accesskey': 'n'})
        time.sleep(1.5)

    if revs and use_hidden_rev and not select_revs:
        soup2 = BeautifulSoup(requests.get(url, params={'id': title}).text)
        revs[0]['id'] = soup2.find(
            'input', {
                'type': 'hidden', 'name': 'rev', 'value': True})['value']

    return revs
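
# Each revision dict built above may contain 'id' (a Unix-time revision id),
# 'date', 'user', 'sum' (edit summary) and 'minor'; callers must tolerate
# missing keys, since different DokuWiki versions expose different markup
# (see the docstring above).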


def getFiles(url, ns=''):
    """ Return a list of media filenames of a wiki """
    files = set()
    ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
    medialist = BeautifulSoup(
        requests.post(
            ajax, {
                'call': 'medialist', 'ns': ns, 'do': 'media'}).text)
    medians = BeautifulSoup(
        requests.post(
            ajax, {
                'call': 'medians', 'ns': ns, 'do': 'media'}).text)
    imagelinks = medialist.findAll(
        'a',
        href=lambda x: x and re.findall(
            '[?&](media|image)=',
            x))
    for a in imagelinks:
        query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        key = 'media' if 'media' in query else 'image'
        files.add(query[key][0])
    files = list(files)
    namespacelinks = medians.findAll('a', {'class': 'idx_dir', 'href': True})
    for a in namespacelinks:
        query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
        files += getFiles(url, query['ns'][0])
    print 'Found %d files in namespace %s' % (len(files), ns or '(all)')
    return files
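
# getFiles() yields media ids such as 'ns:image.png' (a hypothetical example);
# dumpMedia() later downloads each one through lib/exe/fetch.php.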


def dumpContent(url):
    os.mkdir(domain2prefix(url) + '/pages')
    os.mkdir(domain2prefix(url) + '/attic')
    os.mkdir(domain2prefix(url) + '/meta')

    titles = getTitles(url)
    if not len(titles):
        print 'Empty wiki'
        return

    r1 = requests.get(url, params={'id': titles[0], 'do': 'export_raw'})
    r2 = requests.get(url, params={'id': titles[0]})
    r3 = requests.get(url, params={'id': titles[0], 'do': 'diff'})

    getSource = getSourceExport
    if 'html' in r1.headers['content-type']:
        getSource = getSourceEdit

    soup = BeautifulSoup(r2.text)
    hidden_rev = soup.findAll(
        'input', {
            'type': 'hidden', 'name': 'rev', 'value': True})
    use_hidden_rev = hidden_rev and hidden_rev[0]['value']

    soup = BeautifulSoup(r3.text)
    select_revs = soup.findAll(
        'select', {
            'class': 'quickselect', 'name': 'rev2[0]'})

    for title in titles:
        titleparts = title.split(':')
        for i in range(len(titleparts)):
            dir = "/".join(titleparts[:i])
            if not os.path.exists(domain2prefix(url) + '/pages/' + dir):
                os.mkdir(domain2prefix(url) + '/pages/' + dir)
            if not os.path.exists(domain2prefix(url) + '/meta/' + dir):
                os.mkdir(domain2prefix(url) + '/meta/' + dir)
            if not os.path.exists(domain2prefix(url) + '/attic/' + dir):
                os.mkdir(domain2prefix(url) + '/attic/' + dir)
        with open(domain2prefix(url) + '/pages/' + title.replace(':', '/') + '.txt', 'w') as f:
            f.write(getSource(url, title).encode("utf-8"))
        revs = getRevisions(url, title, use_hidden_rev, select_revs)
        for rev in revs[1:]:
            if 'id' in rev and rev['id']:
                with gzip.open(domain2prefix(url) + '/attic/' + title.replace(':', '/') + '.' + rev['id'] + '.txt.gz', 'w') as f:
                    f.write(getSource(url, title, rev['id']).encode("utf-8"))
                time.sleep(1.5)
                print 'Revision %s of %s' % (rev['id'], title)
        with open(domain2prefix(url) + '/meta/' + title.replace(':', '/') + '.changes', 'w') as f:
            # Loop through revisions in reverse.
            for rev in revs[::-1]:
                print rev, title
                sum = 'sum' in rev and rev['sum'].strip() or ''
                id = 0

                ip = '127.0.0.1'
                user = ''
                minor = 'minor' in rev and rev['minor']

                if 'id' in rev and rev['id']:
                    id = rev['id']
                else:
                    # Different date formats in different versions of DokuWiki.
                    # If no ID was found, make one up based on the date (since rev IDs are Unix times)
                    # Maybe this is evil. Not sure.

                    try:
                        date = datetime.strptime(rev['date'], "%Y/%m/%d %H:%M")
                        id = str(int(time.mktime(date.utctimetuple())))
                    except:
                        date = datetime.strptime(rev['date'], "%d.%m.%Y %H:%M")
                        id = str(int(time.mktime(date.utctimetuple())))

                rev['user'] = rev['user'] if 'user' in rev else 'unknown'
                try:
                    # inet_aton throws an exception if its argument is not an IPv4 address
                    socket.inet_aton(rev['user'])
                    ip = rev['user']
                except socket.error:
                    user = rev['user']

                row = '\t'.join([id, ip, 'e' if minor else 'E', title, user, sum])
                row = row.replace('\n', ' ')
                row = row.replace('\r', ' ')

                f.write((row + '\n').encode("utf-8"))
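
# Each line written to meta/<page>.changes above is tab-separated: revision id
# (Unix time), IP, change type ('E' edit, 'e' minor edit), page title, user and
# edit summary. This roughly follows DokuWiki's own changelog layout (an
# assumption; any extra fields DokuWiki stores are simply left out here).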


def dumpMedia(url):
    prefix = domain2prefix(url)
    os.mkdir(prefix + '/media')
    os.mkdir(prefix + '/media_attic')
    os.mkdir(prefix + '/media_meta')

    fetch = urlparse.urljoin(url, 'lib/exe/fetch.php')

    files = getFiles(url)
    for title in files:
        titleparts = title.split(':')
        for i in range(len(titleparts)):
            dir = "/".join(titleparts[:i])
            if not os.path.exists(prefix + '/media/' + dir):
                os.mkdir(prefix + '/media/' + dir)
        with open(prefix + '/media/' + title.replace(':', '/'), 'wb') as f:
            f.write(requests.get(fetch, params={'media': title}).content)
        print 'File %s' % title
        time.sleep(1.5)


def dump(url):
    print domain2prefix(url)
    os.mkdir(domain2prefix(url))
    dumpContent(url)
    dumpMedia(url)
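

# The script defines dump() but no command-line entry point. A minimal,
# hypothetical wrapper (a sketch, assuming the doku.php URL is passed as the
# only argument) could look like this:
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 2:
        # Hypothetical usage message; the URL below is only an example.
        print 'Usage: %s http://wiki.example.org/doku.php' % sys.argv[0]
        sys.exit(1)
    dump(sys.argv[1])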