mirror of https://github.com/WikiTeam/wikiteam
Merge 8089f748b3
into 54d9d8051e
commit
1717a21088
@ -0,0 +1,389 @@
|
||||
#!/usr/bin/env python2
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# dumpgenerator.py A generator of dumps for wikis
|
||||
# Copyright (C) 2011-2014 WikiTeam developers
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# To learn more, read the documentation:
|
||||
# https://github.com/WikiTeam/wikiteam/wiki
|
||||
|
||||
try:
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
except:
|
||||
print 'Need BeautifulSoup for current version. In the future it should use regex for scraping.'
|
||||
|
||||
import HTMLParser
|
||||
import urlparse
|
||||
import requests
|
||||
import os
|
||||
import socket
|
||||
import re
|
||||
from datetime import datetime
|
||||
import gzip
|
||||
import time
|
||||
|
||||
|
||||
def getTitles(url, ns=None):
|
||||
"""Get titles given a doku.php URL and an (optional) namespace"""
|
||||
titles = []
|
||||
ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
|
||||
params = {'call': 'index'}
|
||||
if ns:
|
||||
params['idx'] = ns
|
||||
else:
|
||||
print 'Finding titles'
|
||||
ns = ns or ''
|
||||
depth = len(ns.split(':'))
|
||||
if ns:
|
||||
print '%sLooking in namespace %s' % (' ' * depth, ns)
|
||||
r = requests.post(ajax, params)
|
||||
if r.status_code != 200 or "AJAX call 'index' unknown!" in r.text:
|
||||
return getTitlesOld(url, ns=None)
|
||||
soup = BeautifulSoup(r.text)
|
||||
for a in soup.findAll('a', href=True):
|
||||
if a.has_key('title'):
|
||||
title = a['title']
|
||||
else:
|
||||
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
|
||||
title = (query['idx' if 'idx' in query else 'id'])[0]
|
||||
if a['class'] == 'idx_dir':
|
||||
titles += getTitles(url, title)
|
||||
else:
|
||||
titles.append(title)
|
||||
time.sleep(1.5)
|
||||
print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')
|
||||
return titles
|
||||
|
||||
|
||||
def getTitlesOld(url, ns=None, ancient=False):
|
||||
"""Get titles using the doku.php?do=index"""
|
||||
|
||||
titles = []
|
||||
params = {'do': 'index'}
|
||||
|
||||
if ns:
|
||||
params['idx'] = ns
|
||||
ns = ns or ''
|
||||
depth = len(ns.split(':'))
|
||||
|
||||
r = requests.get(url, params=params)
|
||||
soup = BeautifulSoup(r.text).findAll('ul', {'class': 'idx'})[0]
|
||||
attr = 'text' if ancient else 'title'
|
||||
|
||||
if ns:
|
||||
print '%sSearching in namespace %s' % (' ' * depth, ns)
|
||||
|
||||
def match(href):
|
||||
if not href:
|
||||
return False
|
||||
qs = urlparse.urlparse(href).query
|
||||
qs = urlparse.parse_qs(qs)
|
||||
return 'idx' in qs and qs['idx'][0] in (ns, ':' + ns)
|
||||
result = soup.findAll(
|
||||
'a', {
|
||||
'class': 'idx_dir', 'href': match})[0].findAllPrevious('li')[0].findAll(
|
||||
'a', {
|
||||
'href': lambda x: x and not match(x)})
|
||||
else:
|
||||
print 'Finding titles (?do=index)'
|
||||
result = soup.findAll('a')
|
||||
|
||||
for a in result:
|
||||
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
|
||||
if a['class'] == 'idx_dir':
|
||||
titles += getTitlesOld(url, query['idx'][0])
|
||||
else:
|
||||
titles.append(query['id'][0])
|
||||
|
||||
print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')
|
||||
|
||||
return titles
|
||||
|
||||
|
||||
def getSourceExport(url, title, rev=''):
|
||||
"""Export the raw source of a page (at a given revision)"""
|
||||
|
||||
r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'export_raw'})
|
||||
return r.text
|
||||
|
||||
|
||||
def getSourceEdit(url, title, rev=''):
|
||||
"""Export the raw source of a page by scraping the edit box content. Yuck."""
|
||||
|
||||
r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'edit'})
|
||||
soup = BeautifulSoup(r.text)
|
||||
return ''.join(soup.find('textarea', {'name': 'wikitext'}).contents).strip()
|
||||
|
||||
|
||||
def domain2prefix(url):
|
||||
""" Convert domain name to a valid prefix filename. """
|
||||
|
||||
domain = url
|
||||
|
||||
domain = domain.lower()
|
||||
domain = re.sub(r'(https?://|www\.|/doku\.php)', '', domain)
|
||||
domain = re.sub(r'/', '_', domain)
|
||||
domain = re.sub(r'\.', '', domain)
|
||||
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
|
||||
|
||||
return domain
|
||||
|
||||
|
||||
def getRevisions(url, title, use_hidden_rev=False, select_revs=False):
|
||||
""" Get the revisions of a page. This is nontrivial because different versions of DokuWiki return completely different revision HTML."""
|
||||
|
||||
revs = []
|
||||
h = HTMLParser.HTMLParser()
|
||||
if select_revs:
|
||||
r = requests.get(url, params={'id': title, 'do': 'diff'})
|
||||
soup = BeautifulSoup(r.text)
|
||||
select = soup.find(
|
||||
'select', {
|
||||
'class': 'quickselect', 'name': 'rev2[1]'})
|
||||
for option in select.findAll('option'):
|
||||
text = option.text
|
||||
date = ' '.join(text.split(' ')[:2])
|
||||
username = len(text.split(' ')) > 2 and text.split(' ')[2]
|
||||
summary = ' '.join(text.split(' ')[3:])
|
||||
|
||||
revs.append({'id': option['value'],
|
||||
'user': username,
|
||||
'sum': summary,
|
||||
'date': date})
|
||||
|
||||
i = 0
|
||||
continue_index = -1
|
||||
cont = True
|
||||
|
||||
while cont:
|
||||
r = requests.get(
|
||||
url,
|
||||
params={
|
||||
'id': title,
|
||||
'do': 'revisions',
|
||||
'first': continue_index})
|
||||
|
||||
soup = BeautifulSoup(r.text)
|
||||
lis = soup.findAll(
|
||||
'div', {
|
||||
'class': 'level1'})[0].findNext('ul').findAll('li')
|
||||
|
||||
for li in lis:
|
||||
rev = {}
|
||||
rev_hrefs = li.findAll(
|
||||
'a', href=lambda href: href and (
|
||||
'&rev=' in href or '?rev=' in href))
|
||||
rev['minor'] = ('class', 'minor') in li.attrs
|
||||
|
||||
if rev_hrefs:
|
||||
rev['id'] = urlparse.parse_qs(
|
||||
urlparse.urlparse(
|
||||
rev_hrefs[0]['href']).query)['rev'][0]
|
||||
|
||||
sum_span = li.findAll('span', {'class': 'sum'})
|
||||
if sum_span and not select_revs:
|
||||
sum_span = sum_span[0]
|
||||
sum_text = sum_span.text.split(' ')[1:]
|
||||
if sum_span.findAll('bdi'):
|
||||
rev['sum'] = h.unescape(sum_span.find('bdi').text).strip()
|
||||
else:
|
||||
rev['sum'] = h.unescape(' '.join(sum_text)).strip()
|
||||
elif not select_revs:
|
||||
print repr(li.text)
|
||||
wikilink1 = li.find('a', {'class': 'wikilink1'})
|
||||
text_node = wikilink1 and wikilink1.next and wikilink1.next.next or ''
|
||||
if text_node.strip:
|
||||
rev['sum'] = h.unescape(text_node).strip(u'\u2013 \n')
|
||||
|
||||
date_span = li.find('span', {'class': 'date'})
|
||||
if date_span:
|
||||
rev['date'] = date_span.text.strip()
|
||||
else:
|
||||
rev['date'] = ' '.join(li.text.split(' ')[:2])
|
||||
matches = re.findall(
|
||||
r'([0-9./]+ [0-9]{1,2}:[0-9]{1,2})',
|
||||
rev['date'])
|
||||
if matches:
|
||||
rev['date'] = matches[0]
|
||||
|
||||
if not (select_revs and len(revs) > i and revs[i]['user']):
|
||||
user_span = li.find('span', {'class': 'user'})
|
||||
if user_span:
|
||||
rev['user'] = user_span.text
|
||||
|
||||
if select_revs and len(revs) > i:
|
||||
revs[i].update(rev)
|
||||
else:
|
||||
revs.append(rev)
|
||||
i += 1
|
||||
|
||||
first = soup.findAll('input', {'name': 'first', 'value': True})
|
||||
continue_index = first and max(map(lambda x: x['value'], first))
|
||||
cont = soup.find('input', {'class': 'button', 'accesskey': 'n'})
|
||||
time.sleep(1.5)
|
||||
|
||||
if revs and use_hidden_rev and not select_revs:
|
||||
soup2 = BeautifulSoup(requests.get(url, params={'id': title}).text)
|
||||
revs[0]['id'] = soup2.find(
|
||||
'input', {
|
||||
'type': 'hidden', 'name': 'rev', 'value': True})['value']
|
||||
|
||||
return revs
|
||||
|
||||
|
||||
def getFiles(url, ns=''):
|
||||
""" Return a list of media filenames of a wiki """
|
||||
files = set()
|
||||
ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
|
||||
medialist = BeautifulSoup(
|
||||
requests.post(
|
||||
ajax, {
|
||||
'call': 'medialist', 'ns': ns, 'do': 'media'}).text)
|
||||
medians = BeautifulSoup(
|
||||
requests.post(
|
||||
ajax, {
|
||||
'call': 'medians', 'ns': ns, 'do': 'media'}).text)
|
||||
imagelinks = medialist.findAll(
|
||||
'a',
|
||||
href=lambda x: x and re.findall(
|
||||
'[?&](media|image)=',
|
||||
x))
|
||||
for a in imagelinks:
|
||||
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
|
||||
key = 'media' if 'media' in query else 'image'
|
||||
files.add(query[key][0])
|
||||
files = list(files)
|
||||
namespacelinks = medians.findAll('a', {'class': 'idx_dir', 'href': True})
|
||||
for a in namespacelinks:
|
||||
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
|
||||
files += getFiles(url, query['ns'][0])
|
||||
print 'Found %d files in namespace %s' % (len(files), ns or '(all)')
|
||||
return files
|
||||
|
||||
|
||||
def dumpContent(url):
|
||||
os.mkdir(domain2prefix(url) + '/pages')
|
||||
os.mkdir(domain2prefix(url) + '/attic')
|
||||
os.mkdir(domain2prefix(url) + '/meta')
|
||||
|
||||
titles = getTitles(url)
|
||||
if not len(titles):
|
||||
print 'Empty wiki'
|
||||
return
|
||||
|
||||
r1 = requests.get(url, params={'id': titles[0], 'do': 'export_raw'})
|
||||
r2 = requests.get(url, params={'id': titles[0]})
|
||||
r3 = requests.get(url, params={'id': titles[0], 'do': 'diff'})
|
||||
|
||||
getSource = getSourceExport
|
||||
if 'html' in r1.headers['content-type']:
|
||||
getSource = getSourceEdit
|
||||
|
||||
soup = BeautifulSoup(r2.text)
|
||||
hidden_rev = soup.findAll(
|
||||
'input', {
|
||||
'type': 'hidden', 'name': 'rev', 'value': True})
|
||||
use_hidden_rev = hidden_rev and hidden_rev[0]['value']
|
||||
|
||||
soup = BeautifulSoup(r3.text)
|
||||
select_revs = soup.findAll(
|
||||
'select', {
|
||||
'class': 'quickselect', 'name': 'rev2[0]'})
|
||||
|
||||
for title in titles:
|
||||
titleparts = title.split(':')
|
||||
for i in range(len(titleparts)):
|
||||
dir = "/".join(titleparts[:i])
|
||||
if not os.path.exists(domain2prefix(url) + '/pages/' + dir):
|
||||
os.mkdir(domain2prefix(url) + '/pages/' + dir)
|
||||
if not os.path.exists(domain2prefix(url) + '/meta/' + dir):
|
||||
os.mkdir(domain2prefix(url) + '/meta/' + dir)
|
||||
if not os.path.exists(domain2prefix(url) + '/attic/' + dir):
|
||||
os.mkdir(domain2prefix(url) + '/attic/' + dir)
|
||||
with open(domain2prefix(url) + '/pages/' + title.replace(':', '/') + '.txt', 'w') as f:
|
||||
f.write(getSource(url, title).encode("utf-8"))
|
||||
revs = getRevisions(url, title, use_hidden_rev, select_revs)
|
||||
for rev in revs[1:]:
|
||||
if 'id' in rev and rev['id']:
|
||||
with gzip.open(domain2prefix(url) + '/attic/' + title.replace(':', '/') + '.' + rev['id'] + '.txt.gz', 'w') as f:
|
||||
f.write(getSource(url, title, rev['id']).encode("utf-8"))
|
||||
time.sleep(1.5)
|
||||
print 'Revision %s of %s' % (rev['id'], title)
|
||||
with open(domain2prefix(url) + '/meta/' + title.replace(':', '/') + '.changes', 'w') as f:
|
||||
# Loop through revisions in reverse.
|
||||
for rev in revs[::-1]:
|
||||
print rev, title
|
||||
sum = 'sum' in rev and rev['sum'].strip() or ''
|
||||
id = 0
|
||||
|
||||
ip = '127.0.0.1'
|
||||
user = ''
|
||||
minor = 'minor' in rev and rev['minor']
|
||||
|
||||
if 'id' in rev and rev['id']:
|
||||
id = rev['id']
|
||||
else:
|
||||
# Different date formats in different versions of DokuWiki.
|
||||
# If no ID was found, make one up based on the date (since rev IDs are Unix times)
|
||||
# Maybe this is evil. Not sure.
|
||||
|
||||
try:
|
||||
date = datetime.strptime(rev['date'], "%Y/%m/%d %H:%M")
|
||||
id = str(int(time.mktime(date.utctimetuple())))
|
||||
except:
|
||||
date = datetime.strptime(rev['date'], "%d.%m.%Y %H:%M")
|
||||
id = str(int(time.mktime(date.utctimetuple())))
|
||||
|
||||
rev['user'] = rev['user'] if 'user' in rev else 'unknown'
|
||||
try:
|
||||
# inet_aton throws an exception if its argument is not an IPv4 address
|
||||
socket.inet_aton(rev['user'])
|
||||
ip = rev['user']
|
||||
except socket.error:
|
||||
user = rev['user']
|
||||
|
||||
row = '\t'.join([id, ip, 'e' if minor else 'E', title, user, sum])
|
||||
row = row.replace('\n', ' ')
|
||||
row = row.replace('\r', ' ')
|
||||
|
||||
f.write((row + '\n').encode("utf-8"))
|
||||
|
||||
|
||||
def dumpMedia(url):
|
||||
prefix = domain2prefix(url)
|
||||
os.mkdir(prefix + '/media')
|
||||
os.mkdir(prefix + '/media_attic')
|
||||
os.mkdir(prefix + '/media_meta')
|
||||
|
||||
fetch = urlparse.urljoin(url, 'lib/exe/fetch.php')
|
||||
|
||||
files = getFiles(url)
|
||||
for title in files:
|
||||
titleparts = title.split(':')
|
||||
for i in range(len(titleparts)):
|
||||
dir = "/".join(titleparts[:i])
|
||||
if not os.path.exists(prefix + '/media/' + dir):
|
||||
os.mkdir(prefix + '/media/' + dir)
|
||||
with open(prefix + '/media/' + title.replace(':', '/'), 'wb') as f:
|
||||
f.write(requests.get(fetch, params={'media': title}).content)
|
||||
print 'File %s' % title
|
||||
time.sleep(1.5)
|
||||
|
||||
|
||||
def dump(url):
|
||||
print domain2prefix(url)
|
||||
os.mkdir(domain2prefix(url))
|
||||
dumpContent(url)
|
||||
dumpMedia(url)
|
Loading…
Reference in New Issue