#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# dumpgenerator.py A generator of dumps for wikis
# Copyright (C) 2011-2014 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki
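#
# Overview: this script crawls a DokuWiki installation over HTTP and recreates
# an approximation of its on-disk data/ layout under a prefix directory:
# pages/ (current wikitext), attic/ (gzipped old revisions), meta/ (.changes
# changelogs) and media/ (uploaded files). See dumpContent() and dumpMedia().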
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    print 'BeautifulSoup is required by the current version. In the future it should use regex for scraping.'
    raise
import HTMLParser
import urlparse
import requests
import os
import socket
import re
from datetime import datetime
import gzip
import time
def getTitles(url, ns=None):
"""Get titles given a doku.php URL and an (optional) namespace"""
titles = []
ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
params = {'call': 'index'}
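    # DokuWiki's lib/exe/ajax.php 'index' call returns the HTML of the index
    # tree for the requested namespace.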
if ns:
params['idx'] = ns
else:
print 'Finding titles'
ns = ns or ''
depth = len(ns.split(':'))
if ns:
print '%sLooking in namespace %s' % (' ' * depth, ns)
r = requests.post(ajax, params)
if r.status_code != 200 or "AJAX call 'index' unknown!" in r.text:
        return getTitlesOld(url, ns=ns)
soup = BeautifulSoup(r.text)
for a in soup.findAll('a', href=True):
if a.has_key('title'):
title = a['title']
else:
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
title = (query['idx' if 'idx' in query else 'id'])[0]
if a['class'] == 'idx_dir':
titles += getTitles(url, title)
else:
titles.append(title)
time.sleep(1.5)
print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')
return titles
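# getTitles() returns DokuWiki page IDs, using ':' as the namespace separator,
# e.g. ['start', 'wiki:syntax', 'wiki:dokuwiki'] (illustrative values).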
def getTitlesOld(url, ns=None, ancient=False):
"""Get titles using the doku.php?do=index"""
titles = []
params = {'do': 'index'}
if ns:
params['idx'] = ns
ns = ns or ''
depth = len(ns.split(':'))
r = requests.get(url, params=params)
soup = BeautifulSoup(r.text).findAll('ul', {'class': 'idx'})[0]
attr = 'text' if ancient else 'title'
if ns:
print '%sSearching in namespace %s' % (' ' * depth, ns)
def match(href):
if not href:
return False
qs = urlparse.urlparse(href).query
qs = urlparse.parse_qs(qs)
return 'idx' in qs and qs['idx'][0] in (ns, ':' + ns)
result = soup.findAll(
'a', {
'class': 'idx_dir', 'href': match})[0].findAllPrevious('li')[0].findAll(
'a', {
'href': lambda x: x and not match(x)})
else:
print 'Finding titles (?do=index)'
result = soup.findAll('a')
for a in result:
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
if a['class'] == 'idx_dir':
titles += getTitlesOld(url, query['idx'][0])
else:
titles.append(query['id'][0])
print '%sFound %d title(s) in namespace %s' % (' ' * depth, len(titles), ns or '(all)')
return titles
def getSourceExport(url, title, rev=''):
"""Export the raw source of a page (at a given revision)"""
r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'export_raw'})
return r.text
def getSourceEdit(url, title, rev=''):
"""Export the raw source of a page by scraping the edit box content. Yuck."""
r = requests.get(url, params={'id': title, 'rev': rev, 'do': 'edit'})
soup = BeautifulSoup(r.text)
return ''.join(soup.find('textarea', {'name': 'wikitext'}).contents).strip()
def domain2prefix(url):
""" Convert domain name to a valid prefix filename. """
domain = url
domain = domain.lower()
domain = re.sub(r'(https?://|www\.|/doku\.php)', '', domain)
domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
return domain
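# For example (illustrative):
#   domain2prefix('http://www.example.org/wiki/doku.php') -> 'exampleorg_wiki'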
def getRevisions(url, title, use_hidden_rev=False, select_revs=False):
""" Get the revisions of a page. This is nontrivial because different versions of DokuWiki return completely different revision HTML."""
revs = []
h = HTMLParser.HTMLParser()
if select_revs:
r = requests.get(url, params={'id': title, 'do': 'diff'})
soup = BeautifulSoup(r.text)
select = soup.find(
'select', {
'class': 'quickselect', 'name': 'rev2[1]'})
for option in select.findAll('option'):
text = option.text
date = ' '.join(text.split(' ')[:2])
username = len(text.split(' ')) > 2 and text.split(' ')[2]
summary = ' '.join(text.split(' ')[3:])
revs.append({'id': option['value'],
'user': username,
'sum': summary,
'date': date})
i = 0
continue_index = -1
cont = True
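    # Page through ?do=revisions using the 'first' offset; keep going as long
    # as the page shows a "next" button (accesskey 'n').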
while cont:
r = requests.get(
url,
params={
'id': title,
'do': 'revisions',
'first': continue_index})
soup = BeautifulSoup(r.text)
lis = soup.findAll(
'div', {
'class': 'level1'})[0].findNext('ul').findAll('li')
for li in lis:
rev = {}
rev_hrefs = li.findAll(
'a', href=lambda href: href and (
'&rev=' in href or '?rev=' in href))
rev['minor'] = ('class', 'minor') in li.attrs
if rev_hrefs:
rev['id'] = urlparse.parse_qs(
urlparse.urlparse(
rev_hrefs[0]['href']).query)['rev'][0]
sum_span = li.findAll('span', {'class': 'sum'})
if sum_span and not select_revs:
sum_span = sum_span[0]
sum_text = sum_span.text.split(' ')[1:]
if sum_span.findAll('bdi'):
rev['sum'] = h.unescape(sum_span.find('bdi').text).strip()
else:
rev['sum'] = h.unescape(' '.join(sum_text)).strip()
elif not select_revs:
print repr(li.text)
wikilink1 = li.find('a', {'class': 'wikilink1'})
text_node = wikilink1 and wikilink1.next and wikilink1.next.next or ''
                if text_node.strip():
rev['sum'] = h.unescape(text_node).strip(u'\u2013 \n')
date_span = li.find('span', {'class': 'date'})
if date_span:
rev['date'] = date_span.text.strip()
else:
rev['date'] = ' '.join(li.text.split(' ')[:2])
matches = re.findall(
r'([0-9./]+ [0-9]{1,2}:[0-9]{1,2})',
rev['date'])
if matches:
rev['date'] = matches[0]
if not (select_revs and len(revs) > i and revs[i]['user']):
user_span = li.find('span', {'class': 'user'})
if user_span:
rev['user'] = user_span.text
if select_revs and len(revs) > i:
revs[i].update(rev)
else:
revs.append(rev)
i += 1
first = soup.findAll('input', {'name': 'first', 'value': True})
continue_index = first and max(map(lambda x: x['value'], first))
cont = soup.find('input', {'class': 'button', 'accesskey': 'n'})
time.sleep(1.5)
if revs and use_hidden_rev and not select_revs:
soup2 = BeautifulSoup(requests.get(url, params={'id': title}).text)
revs[0]['id'] = soup2.find(
'input', {
'type': 'hidden', 'name': 'rev', 'value': True})['value']
return revs
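# Each revision dict may contain: 'id' (a Unix timestamp, which DokuWiki uses
# as the revision identifier), 'date', 'user', 'sum' (edit summary) and 'minor'.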
def getFiles(url, ns=''):
""" Return a list of media filenames of a wiki """
files = set()
ajax = urlparse.urljoin(url, 'lib/exe/ajax.php')
medialist = BeautifulSoup(
requests.post(
ajax, {
'call': 'medialist', 'ns': ns, 'do': 'media'}).text)
medians = BeautifulSoup(
requests.post(
ajax, {
'call': 'medians', 'ns': ns, 'do': 'media'}).text)
imagelinks = medialist.findAll(
'a',
href=lambda x: x and re.findall(
'[?&](media|image)=',
x))
for a in imagelinks:
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
key = 'media' if 'media' in query else 'image'
files.add(query[key][0])
files = list(files)
namespacelinks = medians.findAll('a', {'class': 'idx_dir', 'href': True})
for a in namespacelinks:
query = urlparse.parse_qs(urlparse.urlparse(a['href']).query)
files += getFiles(url, query['ns'][0])
print 'Found %d files in namespace %s' % (len(files), ns or '(all)')
return files
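# getFiles() returns media IDs in DokuWiki's colon-separated namespace form
# (e.g. 'wiki:dokuwiki-128.png'); dumpMedia() maps ':' to '/' when saving.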
def dumpContent(url):
os.mkdir(domain2prefix(url) + '/pages')
os.mkdir(domain2prefix(url) + '/attic')
os.mkdir(domain2prefix(url) + '/meta')
titles = getTitles(url)
if not len(titles):
print 'Empty wiki'
return
r1 = requests.get(url, params={'id': titles[0], 'do': 'export_raw'})
r2 = requests.get(url, params={'id': titles[0]})
r3 = requests.get(url, params={'id': titles[0], 'do': 'diff'})
getSource = getSourceExport
if 'html' in r1.headers['content-type']:
getSource = getSourceEdit
soup = BeautifulSoup(r2.text)
hidden_rev = soup.findAll(
'input', {
'type': 'hidden', 'name': 'rev', 'value': True})
use_hidden_rev = hidden_rev and hidden_rev[0]['value']
soup = BeautifulSoup(r3.text)
select_revs = soup.findAll(
'select', {
'class': 'quickselect', 'name': 'rev2[0]'})
for title in titles:
titleparts = title.split(':')
for i in range(len(titleparts)):
dir = "/".join(titleparts[:i])
if not os.path.exists(domain2prefix(url) + '/pages/' + dir):
os.mkdir(domain2prefix(url) + '/pages/' + dir)
if not os.path.exists(domain2prefix(url) + '/meta/' + dir):
os.mkdir(domain2prefix(url) + '/meta/' + dir)
if not os.path.exists(domain2prefix(url) + '/attic/' + dir):
os.mkdir(domain2prefix(url) + '/attic/' + dir)
with open(domain2prefix(url) + '/pages/' + title.replace(':', '/') + '.txt', 'w') as f:
f.write(getSource(url, title).encode("utf-8"))
revs = getRevisions(url, title, use_hidden_rev, select_revs)
for rev in revs[1:]:
if 'id' in rev and rev['id']:
with gzip.open(domain2prefix(url) + '/attic/' + title.replace(':', '/') + '.' + rev['id'] + '.txt.gz', 'w') as f:
f.write(getSource(url, title, rev['id']).encode("utf-8"))
time.sleep(1.5)
print 'Revision %s of %s' % (rev['id'], title)
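        # Write a meta/<page>.changes file mirroring DokuWiki's changelog
        # format: tab-separated timestamp, IP, change type ('E' edit,
        # 'e' minor edit), page ID, user and edit summary.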
with open(domain2prefix(url) + '/meta/' + title.replace(':', '/') + '.changes', 'w') as f:
# Loop through revisions in reverse.
for rev in revs[::-1]:
print rev, title
sum = 'sum' in rev and rev['sum'].strip() or ''
id = 0
ip = '127.0.0.1'
user = ''
minor = 'minor' in rev and rev['minor']
if 'id' in rev and rev['id']:
id = rev['id']
else:
# Different date formats in different versions of DokuWiki.
# If no ID was found, make one up based on the date (since rev IDs are Unix times)
# Maybe this is evil. Not sure.
try:
date = datetime.strptime(rev['date'], "%Y/%m/%d %H:%M")
id = str(int(time.mktime(date.utctimetuple())))
                    except ValueError:
date = datetime.strptime(rev['date'], "%d.%m.%Y %H:%M")
id = str(int(time.mktime(date.utctimetuple())))
rev['user'] = rev['user'] if 'user' in rev else 'unknown'
try:
# inet_aton throws an exception if its argument is not an IPv4 address
socket.inet_aton(rev['user'])
ip = rev['user']
except socket.error:
user = rev['user']
row = '\t'.join([id, ip, 'e' if minor else 'E', title, user, sum])
row = row.replace('\n', ' ')
row = row.replace('\r', ' ')
f.write((row + '\n').encode("utf-8"))
def dumpMedia(url):
prefix = domain2prefix(url)
os.mkdir(prefix + '/media')
os.mkdir(prefix + '/media_attic')
os.mkdir(prefix + '/media_meta')
fetch = urlparse.urljoin(url, 'lib/exe/fetch.php')
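    # lib/exe/fetch.php serves media files; requesting it with ?media=<id>
    # downloads the original file.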
files = getFiles(url)
for title in files:
titleparts = title.split(':')
for i in range(len(titleparts)):
dir = "/".join(titleparts[:i])
if not os.path.exists(prefix + '/media/' + dir):
os.mkdir(prefix + '/media/' + dir)
with open(prefix + '/media/' + title.replace(':', '/'), 'wb') as f:
f.write(requests.get(fetch, params={'media': title}).content)
print 'File %s' % title
time.sleep(1.5)
def dump(url):
print domain2prefix(url)
os.mkdir(domain2prefix(url))
dumpContent(url)
dumpMedia(url)
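# Illustrative usage (assuming a reachable DokuWiki at this URL):
#
#   dump('http://wiki.example.org/doku.php')
#
# This would create ./wikiexampleorg/ containing pages/, attic/, meta/,
# media/, media_attic/ and media_meta/.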