#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# dumpgenerator.py A generator of dumps for wikis
# Copyright (C) 2011-2016 WikiTeam developers

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki

try:
    from kitchen.text.converters import getwriter
except ImportError:
    print "Please install the kitchen module."

import cookielib
import cPickle
import datetime
import sys
try:
    import argparse
except ImportError:
    print "Please install the argparse module."
    sys.exit(1)
import json
try:
    from hashlib import md5
except ImportError:  # Python 2.4 compatibility
    from md5 import new as md5
import os
import re
import subprocess
try:
    import requests
except ImportError:
    print "Please install or update the Requests module."
    sys.exit(1)
try:
    import wikitools
except ImportError:
    print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
import time
import urllib

UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)

__VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org


class PageMissingError(Exception):
    def __init__(self, title, xml):
        self.title = title
        self.xml = xml

    def __str__(self):
        return "page '%s' not found" % self.title


class ExportAbortedError(Exception):
    def __init__(self, index):
        self.index = index

    def __str__(self):
        return "Export from '%s' did not return anything." % self.index


def getVersion():
    return(__VERSION__)


def truncateFilename(other={}, filename=''):
    """Truncate filenames when downloading images with large filenames"""
    return filename[:other['filenamelimit']] + \
        md5(filename.encode('utf-8')).hexdigest() + '.' + filename.split('.')[-1]
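
# Illustrative sketch of what truncateFilename() produces, with hypothetical
# values (assumed config: other['filenamelimit'] == 100):
#   truncateFilename(other={'filenamelimit': 100}, filename=u'a' * 150 + u'.png')
# keeps the first 100 characters of the name and appends the md5 hexdigest of
# the full filename plus the original extension, i.e. 'aaa...a<32 hex chars>.png'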


def delay(config={}, session=None):
    """Add a delay if configured for that"""
    if config['delay'] > 0:
        print 'Sleeping... %d seconds...' % (config['delay'])
        time.sleep(config['delay'])


def cleanHTML(raw=''):
    """Extract only the real wiki content and remove rubbish"""
    """This function is ONLY used to retrieve page titles and file names when no API is available"""
    """DO NOT use this function to extract page content"""
    # different "tags" used by different MediaWiki versions to mark where
    # content starts and ends
    if re.search('<!-- bodytext -->', raw):
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
        raw = raw.split(
            '<!-- start content -->')[1].split('<!-- end content -->')[0]
    elif re.search('<!-- Begin Content Area -->', raw):
        raw = raw.split(
            '<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
    elif re.search('<!-- content -->', raw):
        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[
            1].split('</article>')[0]
    elif re.search('<body class=', raw):
        raw = raw.split('<body class=')[1].split(
            '<div class="printfooter">')[0]
    else:
        print raw[:250]
        print 'This wiki doesn\'t use marks to split content'
        sys.exit()
    return raw
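
# Minimal sketch of cleanHTML() on an assumed Special:Allpages response that
# uses the "start content" markers:
#   cleanHTML('<html><!-- start content --><a title="Foo">Foo</a>'
#             '<!-- end content --></html>')
# returns only the fragment between the markers: '<a title="Foo">Foo</a>'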


def handleStatusCode(response):
    statuscode = response.status_code
    if statuscode >= 200 and statuscode < 300:
        return

    print "HTTP Error %d." % statuscode
    if statuscode >= 300 and statuscode < 400:
        print "Redirect should happen automatically: please report this as a bug."
        print response.url
    elif statuscode == 400:
        print "Bad Request: The wiki may be malfunctioning."
        print "Please try again later."
        print response.url
        sys.exit(1)
    elif statuscode == 401 or statuscode == 403:
        print "Authentication required."
        print "Please use --userpass."
        print response.url
    elif statuscode == 404:
        print "Not found. Is Special:Export enabled for this wiki?"
        print response.url
        sys.exit(1)
    elif statuscode == 429 or (statuscode >= 500 and statuscode < 600):
        print "Server error, max retries exceeded."
        print "Please resume the dump later."
        print response.url
        sys.exit(1)


def getNamespacesScraper(config={}, session=None):
    """Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages"""
    """Function called if no API is available"""
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        r = session.post(
            url=config['index'], data={'title': 'Special:Allpages'}, timeout=30)
        raw = r.text
        delay(config=config, session=session)

        # [^>]*? to include selected="selected"
        m = re.compile(
            r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group(
                    "namespacename")
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group(
                        "namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames


def getNamespacesAPI(config={}, session=None):
    """Uses the API to get the list of namespaces names and ids"""
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        r = session.post(
            url=config['api'],
            data={
                'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'namespaces',
                'format': 'json'},
            timeout=30
        )
        result = getJSON(r)
        delay(config=config, session=session)

        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # -1: Special, -2: Media, excluding
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                bi = i
                i = int(i)
                if i < 0:  # -1: Special, -2: Media, excluding
                    continue
                if i in namespaces:
                    namespaces2.append(i)
                    namespacenames[i] = result['query']['namespaces'][bi]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames


def getPageTitlesAPI(config={}, session=None):
    """Uses the API to get the list of page titles"""
    titles = []
    namespaces, namespacenames = getNamespacesAPI(
        config=config, session=session)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            print 'Skipping namespace = %d' % (namespace)
            continue

        c = 0
        print 'Retrieving titles in the namespace %d' % (namespace)
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            params = {
                'action': 'query',
                'list': 'allpages',
                'apnamespace': namespace,
                'apfrom': apfrom.encode('utf-8'),
                'format': 'json',
                'aplimit': 500}

            retryCount = 0
            while retryCount < config["retries"]:
                try:
                    r = session.post(url=config['api'], data=params, timeout=30)
                    break
                except requests.exceptions.ConnectionError as err:
                    print "Connection error: %s" % (str(err),)
                    retryCount += 1
                    time.sleep(20)

            handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = getJSON(r)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles[
                        'query-continue']['allpages']['apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            elif 'continue' in jsontitles:
                if 'apcontinue' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apcontinue']
                elif 'apfrom' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apfrom']
            # print apfrom
            # print jsontitles
            allpages = jsontitles['query']['allpages']
            # Hack for old versions of MediaWiki API where result is dict
            if isinstance(allpages, dict):
                allpages = allpages.values()
            for page in allpages:
                yield page['title']
            c += len(allpages)

            if len(titles) != len(set(titles)):
                # probably we are in a loop, server returning dupe titles, stop
                # it
                print 'Probably a loop, finishing'
                titles = list(set(titles))
                apfrom = ''

        delay(config=config, session=session)
        print '%d titles retrieved in the namespace %d' % (c, namespace)


def getPageTitlesScraper(config={}, session=None):
    """Scrape the list of page titles from Special:Allpages"""
    titles = []
    namespaces, namespacenames = getNamespacesScraper(
        config=config, session=session)
    for namespace in namespaces:
        print 'Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (
            config['index'], namespace)
        r = session.get(url=url, timeout=30)
        raw = r.text
        raw = cleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        r_suballpages3 = r'&amp;from=(?P<from>[^>]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        elif re.search(r_suballpages3, raw):
            r_suballpages = r_suballpages3
        else:
            pass  # perhaps no subpages

        # 3 is the current depth of English Wikipedia for Special:Allpages
        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            # load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')

                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['index'], namespace, fr, to)  # do not put urllib.quote in fr or to
                    # FIXME: does this regexp fail to load all of them, or does
                    # r_title fail on this kind of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &namespace=\d, sometimes happens
                    fr = fr.split('&amp;namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
                        config['index'], name, namespace)
                elif r_suballpages == r_suballpages3:
                    fr = fr.split('&amp;namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                        config['index'], name, namespace)

                if name not in checked_suballpages:
                    # to avoid reloading dupe subpages links
                    checked_suballpages.append(name)
                    delay(config=config, session=session)
                    r2 = session.get(url=url, timeout=10)
                    raw2 = r2.text
                    raw2 = cleanHTML(raw2)
                    rawacum += raw2  # merge it after the junk has been removed
                    print 'Reading', name, len(raw2), 'bytes', \
                        len(re.findall(r_suballpages, raw2)), 'subpages', \
                        len(re.findall(r_title, raw2)), 'pages'

                delay(config=config, session=session)
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if t not in titles:
                    titles.append(t)
                    c += 1
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles


def getPageTitles(config={}, session=None):
    """Get list of page titles"""
    # http://en.wikipedia.org/wiki/Special:AllPages
    # http://archiveteam.org/index.php?title=Special:AllPages
    # http://www.wikanda.es/wiki/Especial:Todas
    print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None')
    print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None')

    titles = []
    if 'api' in config and config['api']:
        r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'}, timeout=30)
        test = getJSON(r)
        if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
                and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
            titles = getPageTitlesScraper(config=config, session=session)
        else:
            titles = getPageTitlesAPI(config=config, session=session)
    elif 'index' in config and config['index']:
        titles = getPageTitlesScraper(config=config, session=session)

    titlesfilename = '%s-%s-titles.txt' % (
        domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'wt')
    c = 0
    for title in titles:
        titlesfile.write(title.encode('utf-8') + "\n")
        c += 1
    # TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
    # main namespace and widget namespace.
    # We can use sort -u in UNIX, but is it worth it?
    titlesfile.write(u'--END--\n')
    titlesfile.close()
    print 'Titles saved at...', titlesfilename

    print '%d page titles loaded' % (c)
    return titlesfilename


def getImageNames(config={}, session=None):
    """Get list of image names"""

    print 'Retrieving image filenames'
    images = []
    if 'api' in config and config['api']:
        images = getImageNamesAPI(config=config, session=session)
    elif 'index' in config and config['index']:
        images = getImageNamesScraper(config=config, session=session)

    # images = list(set(images)) # it is a list of lists
    images.sort()
    print '%d image names loaded' % (len(images))
    return images


def getXMLHeader(config={}, session=None):
    """Retrieve a random page to extract XML headers (namespace info, etc)"""
    # get the header of a random page, to attach it in the complete XML backup
    # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
    # xmlns:x....
    randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
    try:
        xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
    except PageMissingError as pme:
        # The <page> does not exist. Not a problem, if we get the <siteinfo>.
        xml = pme.xml
    # Issue 26: Account for missing "Special" namespace.
    # Hope the canonical special name has not been removed.
    # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
    except ExportAbortedError:
        try:
            if config['api']:
                print "Trying the local name for the Special namespace instead"
                r = session.post(
                    url=config['api'],
                    data={
                        'action': 'query',
                        'meta': 'siteinfo',
                        'siprop': 'namespaces',
                        'format': 'json'},
                    timeout=120
                )
                config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
                    + ':Export'
                xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
        except PageMissingError as pme:
            xml = pme.xml
        except ExportAbortedError:
            pass

    header = xml.split('</mediawiki>')[0]
    if not re.match(r"\s*<mediawiki", xml):
        print 'XML export on this wiki is broken, quitting.'
        logerror(config=config, text=u'XML export on this wiki is broken, quitting.')
        sys.exit()
    return header, config


def getXMLFileDesc(config={}, title='', session=None):
    """Get XML for image description page"""
    config['curonly'] = 1  # tricky to get only the most recent desc
    return("".join([x for x in getXMLPage(config=config, title=title, verbose=False, session=session)]))


def getUserAgent():
    """Return a cool user-agent to hide Python user-agent"""
    useragents = [
        # firefox
        'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0',
    ]
    return useragents[0]


def logerror(config={}, text=''):
    """Log error in file"""
    if text:
        with open('%s/errors.log' % (config['path']), 'a') as outfile:
            output = u'%s: %s\n' % (
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
            outfile.write(output.encode('utf-8'))
2014-07-01 00:14:44 +00:00
def getXMLPageCore ( headers = { } , params = { } , config = { } , session = None ) :
2011-07-12 16:54:37 +00:00
""" """
2014-07-03 18:23:21 +00:00
# returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki>
# if retrieving params['limit'] revisions fails, returns a current only version
# if all fail, returns the empty string
2011-04-30 14:37:15 +00:00
xml = ' '
c = 0
2014-07-03 18:23:21 +00:00
maxseconds = 100 # max seconds to wait in a single sleeping
2015-08-05 20:24:59 +00:00
maxretries = config [ ' retries ' ] # x retries and skip
2014-07-03 18:23:21 +00:00
increment = 20 # increment every retry
2015-08-05 20:24:59 +00:00
2011-04-30 14:37:15 +00:00
while not re . search ( r ' </mediawiki> ' , xml ) :
2011-04-30 18:53:35 +00:00
if c > 0 and c < maxretries :
2014-07-03 18:23:21 +00:00
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
2018-05-07 19:05:26 +00:00
print ' In attempt %d , XML for " %s " is wrong. Waiting %d seconds and reloading... ' % ( c , params [ ' pages ' ] , wait )
2011-04-30 14:37:15 +00:00
time . sleep ( wait )
2014-07-03 18:23:21 +00:00
# reducing server load requesting smallest chunks (if curonly then
# limit = 1 from mother function)
if params [ ' limit ' ] > 1 :
params [ ' limit ' ] = params [ ' limit ' ] / 2 # half
2011-04-30 18:53:35 +00:00
if c > = maxretries :
print ' We have retried %d times ' % ( c )
print ' MediaWiki error for " %s " , network error or whatever... ' % ( params [ ' pages ' ] )
2012-05-28 09:35:56 +00:00
# If it's not already what we tried: our last chance, preserve only the last revision...
2015-03-08 18:20:12 +00:00
# config['curonly'] means that the whole dump is configured to save only the last,
2014-07-03 18:23:21 +00:00
# params['curonly'] should mean that we've already tried this
# fallback, because it's set by the following if and passed to
# getXMLPageCore
2015-03-08 18:20:12 +00:00
if not config [ ' curonly ' ] and not ' curonly ' in params :
2011-04-30 18:53:35 +00:00
print ' Trying to save only the last revision for this page... '
params [ ' curonly ' ] = 1
2014-09-27 22:12:04 +00:00
logerror (
config = config ,
2015-03-29 22:51:14 +00:00
text = u ' Error while retrieving the full history of " %s " . Trying to save only the last revision for this page ' %
2014-09-27 22:12:04 +00:00
( params [ ' pages ' ] )
)
return getXMLPageCore (
headers = headers ,
params = params ,
config = config ,
session = session
)
2011-04-30 18:53:35 +00:00
else :
2012-05-28 09:35:56 +00:00
print ' Saving in the errors log, and skipping... '
2014-09-27 22:12:04 +00:00
logerror (
config = config ,
2015-03-29 22:51:14 +00:00
text = u ' Error while retrieving the last revision of " %s " . Skipping. ' %
2014-09-27 22:12:04 +00:00
( params [ ' pages ' ] ) )
2015-03-08 18:20:12 +00:00
raise ExportAbortedError ( config [ ' index ' ] )
2014-07-03 18:23:21 +00:00
return ' ' # empty xml
# FIXME HANDLE HTTP Errors HERE
2014-09-19 00:21:01 +00:00
try :
2018-05-07 19:01:50 +00:00
r = session . post ( url = config [ ' index ' ] , data = params , headers = headers , timeout = 10 )
2014-09-19 00:21:01 +00:00
handleStatusCode ( r )
2015-03-24 01:58:01 +00:00
xml = fixBOM ( r )
2014-09-27 22:12:04 +00:00
except requests . exceptions . ConnectionError as e :
2015-08-06 21:37:29 +00:00
print ' Connection error: %s ' % ( str ( e [ 0 ] ) )
2014-09-19 00:21:01 +00:00
xml = ' '
2011-04-30 14:37:15 +00:00
c + = 1
2014-07-03 18:23:21 +00:00
2011-04-30 14:37:15 +00:00
return xml


def getXMLPage(config={}, title='', verbose=True, session=None):
    """Get the full history (or current only) of a page"""
    # if server errors occur while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partially truncated
    # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
    limit = 1000
    truncated = False
    title_ = title
    title_ = re.sub(' ', '_', title_)
    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
    try:
        params = {'title': config['export'], 'pages': title_, 'action': 'submit'}
    except KeyError:
        params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
    if config['curonly']:
        params['curonly'] = 1
        params['limit'] = 1
    else:
        params['offset'] = '1'  # 1 always < 2000s
        params['limit'] = limit
    # in other case, do not set params['templates']
    if 'templates' in config and config['templates']:
        params['templates'] = 1

    xml = getXMLPageCore(params=params, config=config, session=session)
    if xml == "":
        raise ExportAbortedError(config['index'])
    if "</page>" not in xml:
        raise PageMissingError(params['title'], xml)
    else:
        # strip these sha1s sums which keep showing up in the export and
        # which are invalid for the XML schema (they only apply to
        # revisions)
        xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
        xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)

    yield xml.split("</page>")[0]

    # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
    # else, warning about Special:Export truncating large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
    numberofedits = 0
    numberofedits += len(re.findall(r_timestamp, xml))
    # search for timestamps in xml to avoid analysing empty pages like
    # Special:Allpages and the random one
    if not config['curonly'] and re.search(r_timestamp, xml):
        while not truncated and params['offset']:  # next chunk
            # get the last timestamp from the acum XML
            params['offset'] = re.findall(r_timestamp, xml)[-1]
            try:
                xml2 = getXMLPageCore(
                    params=params, config=config, session=session)
            except MemoryError:
                print "The page's history exceeds our memory, halving limit."
                params['limit'] = params['limit'] / 2
                continue

            # are there more edits in this next XML chunk or no <page></page>?
            if re.findall(r_timestamp, xml2):
                if re.findall(r_timestamp, xml2)[-1] == params['offset']:
                    # again the same XML, this wiki does not support params in
                    # Special:Export, offer complete XML up to X edits (usually
                    # 1000)
                    print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated'
                    truncated = True
                    break
                else:
                    """    </namespaces>
                        </siteinfo>
                        <page>
                        <title>Main Page</title>
                        <id>15580374</id>
                        <restrictions>edit=sysop:move=sysop</restrictions> (?)
                        <revision>
                            <id>418009832</id>
                            <timestamp>2011-03-09T19:57:06Z</timestamp>
                            <contributor>
                    """
                    # offset is OK in this wiki, merge with the previous chunk
                    # of this page history and continue
                    try:
                        xml2 = xml2.split("</page>")[0]
                        yield '<revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
                    except MemoryError:
                        print "The page's history exceeds our memory, halving limit."
                        params['limit'] = params['limit'] / 2
                        continue
                    xml = xml2
                    numberofedits += len(re.findall(r_timestamp, xml))
            else:
                params['offset'] = ''  # no more edits in this page history
    yield "</page>\n"

    if verbose:
        if (numberofedits == 1):
            print '%s, 1 edit' % (title.strip())
        else:
            print '%s, %d edits' % (title.strip(), numberofedits)


def cleanXML(xml=''):
    """Trim redundant info"""
    # do not touch XML codification, leave AS IS
    if re.search(r'</siteinfo>\n', xml):
        xml = xml.split('</siteinfo>\n')[1]
    if re.search(r'</mediawiki>', xml):
        xml = xml.split('</mediawiki>')[0]
    return xml
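
# Sketch of cleanXML() on a hypothetical Special:Export chunk:
#   input:  '<mediawiki>...<siteinfo>...</siteinfo>\n  <page>...</page>\n</mediawiki>'
#   output: '  <page>...</page>\n'
# i.e. everything up to '</siteinfo>\n' and the closing '</mediawiki>' tag is
# dropped, so only the <page> chunk is appended to the growing dump file.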


def generateXMLDump(config={}, titles=[], start=None, session=None):
    """Generates an XML dump for a list of titles or from revision IDs"""
    # TODO: titles is now unused.

    header, config = getXMLHeader(config=config, session=session)
    footer = '</mediawiki>\n'  # new line at the end
    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
                                    config['date'],
                                    config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True

    if config['xmlrevisions']:
        print 'Retrieving the XML for every page from the beginning'
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header.encode('utf-8'))
        try:
            r_timestamp = r'<timestamp>([^<]+)</timestamp>'
            for xml in getXMLRevisions(config=config, session=session):
                numrevs = len(re.findall(r_timestamp, xml))
                # Due to how generators work, it's expected this may be less
                print "%d more revisions exported" % numrevs
                xml = cleanXML(xml=xml)
                xmlfile.write(xml.encode('utf-8'))
        except AttributeError:
            print "This wikitools module version is not working"
            sys.exit()
    else:
        print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
        if start:
            print "Removing the last chunk of past XML dump: it is probably incomplete."
            for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
                pass
        else:
            # requested complete xml dump
            lock = False
            xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
            xmlfile.write(header.encode('utf-8'))
            xmlfile.close()

        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
        c = 1
        for title in readTitles(config, start):
            if not title.strip():
                continue
            if title == start:  # start downloading from start, included
                lock = False
            if lock:
                continue
            delay(config=config, session=session)
            if c % 10 == 0:
                print 'Downloaded %d pages' % (c)
            try:
                for xml in getXMLPage(config=config, title=title, session=session):
                    xml = cleanXML(xml=xml)
                    xmlfile.write(xml.encode('utf-8'))
            except PageMissingError:
                logerror(
                    config=config,
                    text=u'The page "%s" was missing in the wiki (probably deleted)' %
                    (title.decode('utf-8'))
                )
            # here, XML is a correct <page> </page> chunk or
            # an empty string due to a deleted page (logged in errors log) or
            # an empty string due to an error while retrieving the page from server
            # (logged in errors log)
            c += 1

    xmlfile.write(footer)
    xmlfile.close()
    print 'XML dump saved at...', xmlfilename


def getXMLRevisions(config={}, session=None):
    site = wikitools.wiki.Wiki(config['api'])
    # if config['namespaces']:
    #     namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
    # else:
    namespaces = ['*']

    for namespace in namespaces:
        print "Exporting revisions from namespace %s" % namespace
        # TODO: 500 would be nicer, but need to find the wiki's limits
        params = {
            'action': 'query',
            'list': 'allrevisions',
            'arvnamespace': '*',
            'arvlimit': 50,
            'arvprop': 'ids',
        }
        request = wikitools.api.APIRequest(site, params)
        results = request.queryGen()
        try:
            for result in results:
                revids = []
                for page in result['query']['allrevisions']:
                    for revision in page['revisions']:
                        revids.append(str(revision['revid']))
                print "50 more revisions listed, until %s" % revids[-1]
                exportparams = {
                    'action': 'query',
                    'revids': '|'.join(revids),
                    'export': '1',
                }
                exportrequest = wikitools.api.APIRequest(site, exportparams)
                exportresults = exportrequest.queryGen()
                for exportresult in exportresults:
                    yield exportresult['query']['export']['*']
        except wikitools.api.APIError:
            print "This wikitools version seems not to work for us. Exiting."
            sys.exit()


def readTitles(config={}, start=None):
    """Read title list from a file, from the title "start" """

    titlesfilename = '%s-%s-titles.txt' % (
        domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')

    seeking = False
    if start:
        seeking = True

    with titlesfile as f:
        for line in f:
            if line.strip() == '--END--':
                break
            elif seeking and line.strip() != start:
                continue
            elif seeking and line.strip() == start:
                seeking = False
                yield line.strip()
            else:
                yield line.strip()
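
# Usage sketch (hypothetical config): readTitles() is a generator over the
# titles file written by getPageTitles(); 'start' must match a stored title
# exactly, and iteration stops at the --END-- sentinel.
#   for title in readTitles(config=config, start=u'Main Page'):
#       print title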


def reverse_readline(filename, buf_size=8192, truncate=False):
    """a generator that returns the lines of a file in reverse order"""
    # Original code by srohde, abdus_salam: cc by-sa 3.0
    # http://stackoverflow.com/a/23646049/718903
    with open(filename, 'r+') as fh:
        segment = None
        offset = 0
        fh.seek(0, os.SEEK_END)
        total_size = remaining_size = fh.tell()
        while remaining_size > 0:
            offset = min(total_size, offset + buf_size)
            fh.seek(-offset, os.SEEK_END)
            buffer = fh.read(min(remaining_size, buf_size))
            remaining_size -= buf_size
            lines = buffer.split('\n')
            # the first line of the buffer is probably not a complete line so
            # we'll save it and append it to the last line of the next buffer
            # we read
            if segment is not None:
                # if the previous chunk starts right from the beginning of line
                # do not concat the segment to the last line of new chunk
                # instead, yield the segment first
                if buffer[-1] != '\n':
                    lines[-1] += segment
                else:
                    if truncate and '</page>' in segment:
                        pages = buffer.split('</page>')
                        fh.seek(-offset + buf_size - len(pages[-1]), os.SEEK_END)
                        fh.truncate()
                        raise StopIteration
                    else:
                        yield segment
            segment = lines[0]
            for index in range(len(lines) - 1, 0, -1):
                if truncate and '</page>' in segment:
                    pages = buffer.split('</page>')
                    fh.seek(-offset - len(pages[-1]), os.SEEK_END)
                    fh.truncate()
                    raise StopIteration
                else:
                    yield lines[index]
        yield segment


def saveImageNames(config={}, images=[], session=None):
    """Save image list in a file, including filename, url and uploader"""
    imagesfilename = '%s-%s-images.txt' % (
        domain2prefix(config=config), config['date'])
    imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
    imagesfile.write(
        ('\n'.join(
            ['%s\t%s\t%s' % (filename, url, uploader)
             for filename, url, uploader in images]
        ).encode('utf-8'))
    )
    imagesfile.write('\n--END--')
    imagesfile.close()

    print 'Image filenames and URLs saved at...', imagesfilename
2014-07-04 17:39:48 +00:00
def curateImageURL ( config = { } , url = ' ' ) :
""" Returns an absolute URL for an image, adding the domain if missing """
2014-09-27 22:12:04 +00:00
2014-07-05 14:45:55 +00:00
if ' index ' in config and config [ ' index ' ] :
2014-09-27 22:12:04 +00:00
# remove from :// (http or https) until the first / after domain
domainalone = config [ ' index ' ] . split (
' :// ' ) [ 0 ] + ' :// ' + config [ ' index ' ] . split ( ' :// ' ) [ 1 ] . split ( ' / ' ) [ 0 ]
2014-07-05 14:45:55 +00:00
elif ' api ' in config and config [ ' api ' ] :
2014-09-27 22:12:04 +00:00
domainalone = config [ ' api ' ] . split (
' :// ' ) [ 0 ] + ' :// ' + config [ ' api ' ] . split ( ' :// ' ) [ 1 ] . split ( ' / ' ) [ 0 ]
2014-07-04 17:39:48 +00:00
else :
print ' ERROR: no index nor API '
sys . exit ( )
2014-09-27 22:12:04 +00:00
if url . startswith ( ' // ' ) : # Orain wikifarm returns URLs starting with //
2014-07-04 17:39:48 +00:00
url = u ' %s : %s ' % ( domainalone . split ( ' :// ' ) [ 0 ] , url )
2014-09-27 22:12:04 +00:00
# is it a relative URL?
elif url [ 0 ] == ' / ' or ( not url . startswith ( ' http:// ' ) and not url . startswith ( ' https:// ' ) ) :
if url [ 0 ] == ' / ' : # slash is added later
2014-07-04 17:39:48 +00:00
url = url [ 1 : ]
2014-09-27 22:12:04 +00:00
# concat http(s) + domain + relative url
url = u ' %s / %s ' % ( domainalone , url )
2014-07-04 17:39:48 +00:00
url = undoHTMLEntities ( text = url )
2014-09-27 22:12:04 +00:00
# url = urllib.unquote(url) #do not use unquote with url, it break some
# urls with odd chars
2014-07-04 17:39:48 +00:00
url = re . sub ( ' ' , ' _ ' , url )
2014-09-27 22:12:04 +00:00
2014-07-04 17:39:48 +00:00
return url
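
# Sketch of the normalisation above for a hypothetical wiki with
# config['index'] = 'http://wiki.example.org/index.php':
#   '//wiki.example.org/images/a/ab/Foo.png' -> 'http://wiki.example.org/images/a/ab/Foo.png'
#   '/images/a/ab/Foo.png'                   -> 'http://wiki.example.org/images/a/ab/Foo.png'
#   'images/a b.png'                         -> 'http://wiki.example.org/images/a_b.png'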


def getImageNamesScraper(config={}, session=None):
    """Retrieve file list: filename, url, uploader"""
    # (?<! http://docs.python.org/library/re.html
    r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;'
    images = []
    offset = '29990101000000'  # january 1, 2999
    limit = 5000
    retries = config['retries']
    while offset:
        # 5000 overloads some servers, but it is needed for sites like this one
        # with no next links
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        r = session.post(
            url=config['index'],
            data={
                'title': 'Special:Imagelist',
                'limit': limit,
                'offset': offset},
            timeout=30)
        raw = r.text
        delay(config=config, session=session)
        # delicate wiki
        if re.search(
                ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)',
                raw):
            if limit > 10:
                print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
                limit = limit / 10
                continue
            elif retries > 0:  # waste retries, then exit
                retries -= 1
                print 'Retrying...'
                continue
            else:
                print 'No more retries, exit...'
                break

        raw = cleanHTML(raw)
        # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
        # href="/w/index.php?title=Usuario:Fernandocg&action=edit&redlink=1"
        # class="new" title="Usuario:Fernandocg (página no
        # existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # wikijuegos 1.9.5
        # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
        # mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # gentoowiki 1.18
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
        r_images5 = (
            r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
            '<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
            '<td class="TablePager_col_img_size">[^<]*?</td>\s*'
            '<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')

        # Select the regexp that returns more results
        regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
        count = 0
        i = 0
        regexp_best = 0
        for regexp in regexps:
            if len(re.findall(regexp, raw)) > count:
                count = len(re.findall(regexp, raw))
                regexp_best = i
            i += 1
        m = re.compile(regexps[regexp_best]).finditer(raw)

        # Iter the image results
        for i in m:
            url = i.group('url')
            url = curateImageURL(config=config, url=url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = undoHTMLEntities(text=filename)
            filename = urllib.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = undoHTMLEntities(text=uploader)
            uploader = urllib.unquote(uploader)
            images.append([filename, url, uploader])
            # print filename, url

        if re.search(r_next, raw):
            new_offset = re.findall(r_next, raw)[0]
            # Avoid infinite loop
            if new_offset != offset:
                offset = new_offset
                retries += 5  # add more retries if we got a page with offset
            else:
                offset = ''
        else:
            offset = ''

    if (len(images) == 1):
        print 'Found 1 image'
    else:
        print 'Found %d images' % (len(images))

    images.sort()
    return images


def getImageNamesAPI(config={}, session=None):
    """Retrieve file list: filename, url, uploader"""
    oldAPI = False
    aifrom = '!'
    images = []
    while aifrom:
        sys.stderr.write('.')  # progress
        params = {
            'action': 'query',
            'list': 'allimages',
            'aiprop': 'url|user',
            'aifrom': aifrom,
            'format': 'json',
            'ailimit': 500}
        # FIXME Handle HTTP Errors HERE
        r = session.post(url=config['api'], data=params, timeout=30)
        handleStatusCode(r)
        jsonimages = getJSON(r)
        delay(config=config, session=session)

        if 'query' in jsonimages:
            aifrom = ''
            if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']:
                if 'aicontinue' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages[
                        'query-continue']['allimages']['aicontinue']
                elif 'aifrom' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages[
                        'query-continue']['allimages']['aifrom']
            elif 'continue' in jsonimages:
                if 'aicontinue' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aicontinue']
                elif 'aifrom' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aifrom']
            # print aifrom

            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = curateImageURL(config=config, url=url)
                # encoding to ascii is needed to work around this horrible bug:
                # http://bugs.python.org/issue8136
                if 'api' in config and '.wikia.com' in config['api']:
                    # to avoid latest?cb=20120816112532 in filenames
                    filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')), 'utf-8')
                else:
                    filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8')
                uploader = re.sub('_', ' ', image['user'])
                images.append([filename, url, uploader])
        else:
            oldAPI = True
            break

    if oldAPI:
        gapfrom = '!'
        images = []
        while gapfrom:
            sys.stderr.write('.')  # progress
            # Some old APIs don't have the allimages query
            # In this case use allpages (in ns=6) as generator for imageinfo
            # Example:
            # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
            # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            params = {
                'action': 'query',
                'generator': 'allpages',
                'gapnamespace': 6,
                'gaplimit': 500,
                'gapfrom': gapfrom,
                'prop': 'imageinfo',
                'iiprop': 'user|url',
                'format': 'json'}
            # FIXME Handle HTTP Errors HERE
            r = session.post(url=config['api'], data=params, timeout=30)
            handleStatusCode(r)
            jsonimages = getJSON(r)
            delay(config=config, session=session)

            if 'query' in jsonimages:
                gapfrom = ''
                if 'query-continue' in jsonimages and 'allpages' in jsonimages['query-continue']:
                    if 'gapfrom' in jsonimages['query-continue']['allpages']:
                        gapfrom = jsonimages[
                            'query-continue']['allpages']['gapfrom']
                # print gapfrom
                # print jsonimages['query']

                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = curateImageURL(config=config, url=url)
                    tmp_filename = ':'.join(props['title'].split(':')[1:])
                    filename = re.sub('_', ' ', tmp_filename)
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    images.append([filename, url, uploader])
            else:
                # if the API doesn't return query data, then we're done
                break

    if (len(images) == 1):
        print 'Found 1 image'
    else:
        print 'Found %d images' % (len(images))

    return images


def undoHTMLEntities(text=''):
    """Undo some HTML codes"""
    # i guess only < > & " ' need conversion
    # http://www.w3schools.com/html/html_entities.asp
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    text = re.sub('&#039;', '\'', text)

    return text
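
# Example of the conversions above:
#   undoHTMLEntities(text=u'&lt;b&gt;Tom &amp; Jerry&lt;/b&gt;')
# returns u'<b>Tom & Jerry</b>'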
2014-07-03 18:23:21 +00:00
2014-07-01 00:14:44 +00:00
def generateImageDump ( config = { } , other = { } , images = [ ] , start = ' ' , session = None ) :
2011-07-14 20:45:41 +00:00
""" Save files and descriptions using a file list """
2014-07-03 18:23:21 +00:00
# fix use subdirectories md5
2011-04-07 22:32:05 +00:00
print ' Retrieving images from " %s " ' % ( start and start or ' start ' )
2011-04-07 15:43:17 +00:00
imagepath = ' %s /images ' % ( config [ ' path ' ] )
2011-04-30 12:37:54 +00:00
if not os . path . isdir ( imagepath ) :
2011-04-30 17:05:59 +00:00
print ' Creating " %s " directory ' % ( imagepath )
2011-04-07 13:14:37 +00:00
os . makedirs ( imagepath )
2014-07-03 18:23:21 +00:00
2011-04-07 13:14:37 +00:00
c = 0
2011-04-07 22:32:05 +00:00
lock = True
2011-04-07 23:37:45 +00:00
if not start :
lock = False
2011-04-08 14:57:36 +00:00
for filename , url , uploader in images :
2014-07-03 18:23:21 +00:00
if filename == start : # start downloading from start (included)
2011-04-07 22:32:05 +00:00
lock = False
if lock :
continue
2014-07-01 00:14:44 +00:00
delay ( config = config , session = session )
2014-07-03 18:23:21 +00:00
# saving file
# truncate the filename if its length > 100 (100 + 32 (md5) = 132 < 143, the
# crash limit; '.desc' is appended later, so 100 is a safer maximum)
2014-07-01 00:14:44 +00:00
filename2 = urllib . unquote ( filename )
2011-04-11 19:09:55 +00:00
if len ( filename2 ) > other [ ' filenamelimit ' ] :
2011-04-11 18:38:47 +00:00
# split last . (extension) and then merge
2011-04-11 19:09:55 +00:00
filename2 = truncateFilename ( other = other , filename = filename2 )
2013-03-27 21:50:23 +00:00
print ' Filename is too long, truncating. Now it is: ' , filename2
2014-06-30 18:03:32 +00:00
filename3 = u ' %s / %s ' % ( imagepath , filename2 )
2014-07-01 17:26:57 +00:00
imagefile = open ( filename3 , ' wb ' )
r = requests . get ( url = url )
imagefile . write ( r . content )
imagefile . close ( )
2014-07-03 18:23:21 +00:00
# saving description if any
2015-03-02 09:15:51 +00:00
try :
title = u ' Image: %s ' % ( filename )
xmlfiledesc = getXMLFileDesc (
config = config ,
title = title ,
session = session ) # use Image: for backwards compatibility
except PageMissingError :
xmlfiledesc = ' '
logerror (
config = config ,
2015-03-29 22:51:14 +00:00
text = u ' The page " %s " was missing in the wiki (probably deleted) ' % ( title )
2015-03-02 09:15:51 +00:00
)
2011-04-11 18:38:47 +00:00
f = open ( ' %s / %s .desc ' % ( imagepath , filename2 ) , ' w ' )
2014-07-03 18:23:21 +00:00
# <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
if not re . search ( r ' </mediawiki> ' , xmlfiledesc ) :
# failure when retrieving desc? then save it as empty .desc
2011-04-08 13:39:14 +00:00
xmlfiledesc = ' '
2014-07-01 00:14:44 +00:00
f . write ( xmlfiledesc . encode ( ' utf-8 ' ) )
2011-04-08 13:39:14 +00:00
f . close ( )
2014-07-01 00:14:44 +00:00
delay ( config = config , session = session )
2011-04-07 13:14:37 +00:00
c + = 1
if c % 10 == 0 :
print ' Downloaded %d images ' % ( c )
2014-07-03 18:23:21 +00:00
2011-04-07 13:14:37 +00:00
print ' Downloaded %d images ' % ( c )
2014-07-03 18:23:21 +00:00
2014-07-01 00:14:44 +00:00
def saveLogs ( config = { } , session = None ) :
2013-03-27 21:30:52 +00:00
""" Save Special:Log """
2014-07-03 18:23:21 +00:00
# get all logs from Special:Log
2011-04-06 19:17:59 +00:00
""" parse
< select name = ' type ' >
< option value = " block " > Bloqueos de usuarios < / option >
< option value = " rights " > Cambios de perfil de usuario < / option >
< option value = " protect " selected = " selected " > Protecciones de páginas < / option >
< option value = " delete " > Registro de borrados < / option >
< option value = " newusers " > Registro de creación de usuarios < / option >
< option value = " merge " > Registro de fusiones < / option >
< option value = " import " > Registro de importaciones < / option >
< option value = " patrol " > Registro de revisiones < / option >
< option value = " move " > Registro de traslados < / option >
< option value = " upload " > Subidas de archivos < / option >
< option value = " " > Todos los registros < / option >
< / select >
"""
2014-07-01 00:14:44 +00:00
delay ( config = config , session = session )
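# A minimal sketch (not wired in) of how each log type could be fetched via
# index.php; the parameter names follow standard Special:Log URLs and are an
# assumption here, since this function is still a stub:
#
# for logtype in ['block', 'rights', 'protect', 'delete', 'newusers', 'merge',
#                 'import', 'patrol', 'move', 'upload']:
#     r = session.get(url=config['index'],
#                     params={'title': 'Special:Log', 'type': logtype, 'limit': 500},
#                     timeout=30)
#     # the returned HTML would then be parsed as outlined in the string above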
2011-04-06 19:17:59 +00:00
2014-07-03 18:23:21 +00:00
2014-07-01 00:14:44 +00:00
def domain2prefix ( config = { } , session = None ) :
2013-11-07 12:24:50 +00:00
""" Convert domain name to a valid prefix filename. """
2014-07-03 18:23:21 +00:00
2013-11-07 12:24:50 +00:00
# At this point, both api and index are supposed to be defined
2011-04-09 09:12:58 +00:00
domain = ' '
2013-11-07 12:24:50 +00:00
if config [ ' api ' ] :
2011-04-09 09:12:58 +00:00
domain = config [ ' api ' ]
2013-11-07 12:24:50 +00:00
elif config [ ' index ' ] :
2011-04-09 09:12:58 +00:00
domain = config [ ' index ' ]
2013-11-07 12:24:50 +00:00
2011-04-13 19:44:35 +00:00
domain = domain . lower ( )
2011-04-29 08:59:13 +00:00
domain = re . sub ( r ' (https?://|www \ .|/index \ .php|/api \ .php) ' , ' ' , domain )
2011-04-07 13:14:37 +00:00
domain = re . sub ( r ' / ' , ' _ ' , domain )
domain = re . sub ( r ' \ . ' , ' ' , domain )
domain = re . sub ( r ' [^A-Za-z0-9] ' , ' _ ' , domain )
2014-07-03 18:23:21 +00:00
2011-04-07 13:14:37 +00:00
return domain
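# Worked example (hypothetical URL): with config['api'] set to
# 'http://wiki.domain.org/w/api.php', the substitutions above yield
# 'wiki.domain.org/w' -> 'wiki.domain.org_w' -> 'wikidomainorg_w',
# which is then used as the prefix of the dump filenames.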
2014-07-03 18:23:21 +00:00
2011-04-07 15:43:17 +00:00
def loadConfig ( config = { } , configfilename = ' ' ) :
2013-03-27 21:30:52 +00:00
""" Load config file """
2014-07-03 18:23:21 +00:00
2011-04-16 14:51:48 +00:00
try :
2014-07-03 17:24:28 +00:00
with open ( ' %s / %s ' % ( config [ ' path ' ] , configfilename ) , ' r ' ) as infile :
config = cPickle . load ( infile )
2011-04-16 14:51:48 +00:00
except :
print ' There is no config file. We can \' t resume. Please start a new dump. '
sys . exit ( )
2014-07-03 18:23:21 +00:00
2011-04-07 13:14:37 +00:00
return config
2014-07-03 18:23:21 +00:00
2011-04-07 15:43:17 +00:00
def saveConfig ( config = { } , configfilename = ' ' ) :
2013-03-27 21:30:52 +00:00
""" Save config file """
2014-07-03 18:23:21 +00:00
2014-07-03 17:24:28 +00:00
with open ( ' %s / %s ' % ( config [ ' path ' ] , configfilename ) , ' w ' ) as outfile :
cPickle . dump ( config , outfile )
2014-07-03 18:23:21 +00:00
2011-04-09 17:45:56 +00:00
def welcome ( ) :
2014-06-30 23:41:03 +00:00
message = ' '
2013-03-27 21:19:46 +00:00
""" Opening message """
2014-07-03 18:23:21 +00:00
message + = " # " * 73
2014-07-03 14:02:11 +00:00
message + = """
# Welcome to DumpGenerator %s by WikiTeam (GPL v3) #
# More info at: https://github.com/WikiTeam/wikiteam #""" % (getVersion())
2014-06-30 23:41:03 +00:00
message + = " \n "
2014-07-03 18:23:21 +00:00
message + = " # " * 73
2014-06-30 23:41:03 +00:00
message + = " \n "
message + = ' '
2014-07-03 18:23:21 +00:00
message + = " \n "
message + = " # " * 73
2016-09-30 09:29:33 +00:00
message + = " \n "
2016-09-30 05:08:35 +00:00
message + = " # Copyright (C) 2011- %d WikiTeam developers # \n " % ( datetime . datetime . now ( ) . year )
2014-07-03 14:02:11 +00:00
message + = """
2011-04-09 17:45:56 +00:00
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <http://www.gnu.org/licenses/>. #"""
2014-06-30 23:41:03 +00:00
message + = " \n "
2014-07-03 18:23:21 +00:00
message + = " # " * 73
2014-06-30 23:41:03 +00:00
message + = " \n "
message + = ' '
2014-07-03 18:23:21 +00:00
2014-06-30 23:41:03 +00:00
return message
2011-04-09 17:45:56 +00:00
2014-07-03 18:23:21 +00:00
2011-04-09 17:45:56 +00:00
def bye ( ) :
2013-03-27 21:19:46 +00:00
""" Closing message """
2011-07-09 18:17:04 +00:00
print " ---> Congratulations! Your dump is complete <--- "
2014-06-27 13:25:28 +00:00
print " If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues "
2014-07-11 17:31:51 +00:00
print " If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam "
2011-04-07 16:10:12 +00:00
print " Good luck! Bye! "
2011-04-07 15:43:17 +00:00
2012-03-04 12:35:02 +00:00
2014-06-30 23:41:03 +00:00
def getParameters ( params = [ ] ) :
if not params :
params = sys . argv
2012-03-04 12:35:02 +00:00
2014-07-03 14:02:11 +00:00
parser = argparse . ArgumentParser ( description = ' ' )
2014-09-27 22:12:04 +00:00
2014-07-13 10:32:49 +00:00
# General params
2014-07-03 18:23:21 +00:00
parser . add_argument (
' -v ' , ' --version ' , action = ' version ' , version = getVersion ( ) )
parser . add_argument (
' --cookies ' , metavar = " cookies.txt " , help = " path to a cookies.txt file " )
parser . add_argument (
2014-09-27 22:12:04 +00:00
' --delay ' ,
metavar = 5 ,
default = 0 ,
type = float ,
help = " adds a delay (in seconds) " )
2014-07-03 18:23:21 +00:00
parser . add_argument (
2014-09-27 22:12:04 +00:00
' --retries ' ,
metavar = 5 ,
default = 5 ,
2016-07-31 15:37:31 +00:00
help = " Maximum number of retries for " )
2014-07-13 10:32:49 +00:00
parser . add_argument ( ' --path ' , help = ' path to store wiki dump at ' )
2014-09-27 22:12:04 +00:00
parser . add_argument (
' --resume ' ,
action = ' store_true ' ,
help = ' resumes previous incomplete dump (requires --path) ' )
2014-07-13 10:32:49 +00:00
parser . add_argument ( ' --force ' , action = ' store_true ' , help = ' download anyway, skipping the Wikimedia projects check ' )
2014-07-03 18:23:21 +00:00
parser . add_argument (
2016-07-31 15:37:31 +00:00
' --user ' , help = ' Username if authentication is required. ' )
2014-07-13 10:32:49 +00:00
parser . add_argument (
2014-09-27 22:12:04 +00:00
' --pass ' ,
dest = ' password ' ,
2016-07-31 15:37:31 +00:00
help = ' Password if authentication is required. ' )
2014-07-01 17:26:57 +00:00
2014-07-13 10:32:49 +00:00
# URL params
groupWikiOrAPIOrIndex = parser . add_argument_group ( )
2014-07-03 18:23:21 +00:00
groupWikiOrAPIOrIndex . add_argument (
2014-09-27 22:12:04 +00:00
' wiki ' ,
default = ' ' ,
nargs = ' ? ' ,
help = " URL to wiki (e.g. http://wiki.domain.org) " )
groupWikiOrAPIOrIndex . add_argument (
' --api ' ,
help = " URL to API (e.g. http://wiki.domain.org/w/api.php) " )
groupWikiOrAPIOrIndex . add_argument (
' --index ' ,
help = " URL to index.php (e.g. http://wiki.domain.org/w/index.php) " )
2014-07-13 10:32:49 +00:00
# Download params
2014-09-27 22:12:04 +00:00
groupDownload = parser . add_argument_group (
' Data to download ' ,
' What info to download from the wiki ' )
2014-07-13 10:32:49 +00:00
groupDownload . add_argument (
2014-09-27 22:12:04 +00:00
' --xml ' ,
action = ' store_true ' ,
help = " generates a full history XML dump (--xml --curonly for current revisions only) " )
2014-07-13 10:32:49 +00:00
groupDownload . add_argument ( ' --curonly ' , action = ' store_true ' ,
2018-05-07 19:05:26 +00:00
help = ' store only the current version of pages; incompatible with --xmlrevisions ' )
groupDownload . add_argument ( ' --xmlrevisions ' , action = ' store_true ' ,
2018-05-07 20:03:22 +00:00
help = ' download all revisions from an API generator. Ignores the \
namespace selection ' )
2014-07-13 10:32:49 +00:00
groupDownload . add_argument (
2014-07-03 18:23:21 +00:00
' --images ' , action = ' store_true ' , help = " generates an image dump " )
2014-09-27 22:12:04 +00:00
groupDownload . add_argument (
' --namespaces ' ,
metavar = " 1,2,3 " ,
help = ' comma-separated list of namespaces to include (all by default) ' )
groupDownload . add_argument (
' --exnamespaces ' ,
metavar = " 1,2,3 " ,
help = ' comma-separated list of namespaces to exclude ' )
2014-07-13 10:32:49 +00:00
# Meta info params
2014-09-27 22:12:04 +00:00
groupMeta = parser . add_argument_group (
' Meta info ' ,
' What meta info to retrieve from the wiki ' )
2014-07-13 10:32:49 +00:00
groupMeta . add_argument (
2014-09-27 22:12:04 +00:00
' --get-wiki-engine ' ,
action = ' store_true ' ,
help = " returns the wiki engine " )
2014-06-30 23:41:03 +00:00
args = parser . parse_args ( )
2014-07-03 18:23:21 +00:00
# print args
2014-09-27 22:12:04 +00:00
2014-07-13 10:32:49 +00:00
# Don't mix download params and meta info params
if ( args . xml or args . images ) and \
2014-09-27 22:12:04 +00:00
( args . get_wiki_engine ) :
2014-07-13 10:32:49 +00:00
print ' ERROR: Don \' t mix download params and meta info params '
parser . print_help ( )
2014-06-30 23:41:03 +00:00
sys . exit ( 1 )
2014-09-27 22:12:04 +00:00
2014-07-13 10:32:49 +00:00
# No download params and no meta info params? Exit
if ( not args . xml and not args . images ) and \
2014-09-27 22:12:04 +00:00
( not args . get_wiki_engine ) :
2014-07-13 10:32:49 +00:00
print ' ERROR: Use at least one download param or meta info param '
parser . print_help ( )
2014-06-30 23:41:03 +00:00
sys . exit ( 1 )
2014-09-27 22:12:04 +00:00
2014-07-13 10:32:49 +00:00
# Execute meta info params
if args . wiki :
if args . get_wiki_engine :
print getWikiEngine ( url = args . wiki )
sys . exit ( )
2014-09-27 22:12:04 +00:00
2014-07-13 10:32:49 +00:00
# Create session
cj = cookielib . MozillaCookieJar ( )
if args . cookies :
cj . load ( args . cookies )
print ' Using cookies from %s ' % args . cookies
2014-07-03 18:23:21 +00:00
2014-07-13 10:32:49 +00:00
session = requests . Session ( )
2018-05-07 19:01:50 +00:00
try :
from requests . packages . urllib3 . util . retry import Retry
from requests . adapters import HTTPAdapter
# Courtesy datashaman https://stackoverflow.com/a/35504626
__retries__ = Retry ( total = 5 ,
backoff_factor = 2 ,
status_forcelist = [ 500 , 502 , 503 , 504 ] )
session . mount ( ' https:// ' , HTTPAdapter ( max_retries = __retries__ ) )
session . mount ( ' http:// ' , HTTPAdapter ( max_retries = __retries__ ) )
except :
# Our urllib3/requests is too old
pass
2014-07-13 10:32:49 +00:00
session . cookies = cj
2014-09-21 00:07:49 +00:00
session . headers . update ( { ' User-Agent ' : getUserAgent ( ) } )
2014-07-13 10:32:49 +00:00
if args . user and args . password :
session . auth = ( args . user , args . password )
2014-09-27 22:12:04 +00:00
2014-07-13 10:32:49 +00:00
# check URLs
for url in [ args . api , args . index , args . wiki ] :
if url and ( not url . startswith ( ' http:// ' ) and not url . startswith ( ' https:// ' ) ) :
print url
print ' ERROR: URLs must start with http:// or https:// \n '
parser . print_help ( )
sys . exit ( 1 )
2016-07-31 15:37:31 +00:00
# Get API and index and verify
api = args . api and args . api or ' '
index = args . index and args . index or ' '
if api == ' ' or index == ' ' :
if args . wiki :
if getWikiEngine ( args . wiki ) == ' MediaWiki ' :
2014-07-13 10:32:49 +00:00
api2 , index2 = mwGetAPIAndIndex ( args . wiki )
if not api :
api = api2
if not index :
index = index2
else :
2016-07-31 15:37:31 +00:00
print ' ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki '
sys . exit ( 1 )
else :
if api == ' ' :
pass
elif index == ' ' :
index = ' / ' . join ( api . split ( ' / ' ) [ : - 1 ] ) + ' /index.php '
2014-09-27 22:12:04 +00:00
2016-07-31 15:37:31 +00:00
# print api
# print index
index2 = None
2014-09-27 22:12:04 +00:00
2016-07-31 15:37:31 +00:00
if api :
retry = 0
maxretries = args . retries
retrydelay = 20
while retry < maxretries :
try :
check = checkAPI ( api = api , session = session )
break
except requests . exceptions . ConnectionError as e :
print ' Connection error: %s ' % ( str ( e ) )
retry + = 1
print " Start retry attempt %d in %d seconds. " % ( retry + 1 , retrydelay )
time . sleep ( retrydelay )
if api and check :
index2 = check [ 1 ]
api = check [ 2 ]
print ' API is OK: ' + api
else :
if index and not args . wiki :
print ' API not available. Trying with index.php only. '
2016-02-27 20:10:02 +00:00
else :
2016-07-31 15:37:31 +00:00
print ' Error in API. Please provide a correct path to the API. '
sys . exit ( 1 )
2014-09-27 22:12:04 +00:00
2016-07-31 15:37:31 +00:00
if index and checkIndex (
index = index ,
cookies = args . cookies ,
session = session ) :
print ' index.php is OK '
else :
index = index2
if index and index . startswith ( ' // ' ) :
index = args . wiki . split ( ' // ' ) [ 0 ] + index
2014-09-27 22:12:04 +00:00
if index and checkIndex (
index = index ,
cookies = args . cookies ,
session = session ) :
print ' index.php is OK '
2014-09-14 15:59:17 +00:00
else :
2016-07-31 15:37:31 +00:00
index = ' / ' . join ( index . split ( ' / ' ) [ : - 1 ] )
2015-03-02 04:13:44 +00:00
if index and checkIndex (
index = index ,
cookies = args . cookies ,
session = session ) :
print ' index.php is OK '
else :
2016-07-31 15:37:31 +00:00
print ' Error in index.php. Please provide a correct path to index.php. '
sys . exit ( 1 )
2014-07-01 17:26:57 +00:00
# check user and pass (one requires both)
if ( args . user and not args . password ) or ( args . password and not args . user ) :
2014-07-13 10:32:49 +00:00
print ' ERROR: Both --user and --pass are required for authentication. '
parser . print_help ( )
2014-07-01 17:26:57 +00:00
sys . exit ( 1 )
2013-10-09 20:11:01 +00:00
2014-06-30 23:41:03 +00:00
namespaces = [ ' all ' ]
exnamespaces = [ ]
# Process namespace inclusions
if args . namespaces :
2014-07-03 18:23:21 +00:00
# FIXME: why is '-' allowed here? Note that '--namespaces= all' (with a space) also works
2014-09-27 22:12:04 +00:00
if re . search (
r ' [^ \ d, \ -] ' ,
args . namespaces ) and args . namespaces . lower ( ) != ' all ' :
2014-06-30 23:41:03 +00:00
print " Invalid namespace values. \n Valid format is integer(s) separated by commas "
sys . exit ( )
else :
ns = re . sub ( ' ' , ' ' , args . namespaces )
if ns . lower ( ) == ' all ' :
namespaces = [ ' all ' ]
else :
namespaces = [ int ( i ) for i in ns . split ( ' , ' ) ]
2012-03-04 12:35:02 +00:00
2014-06-30 23:41:03 +00:00
# Process namespace exclusions
if args . exnamespaces :
if re . search ( r ' [^ \ d, \ -] ' , args . exnamespaces ) :
print " Invalid namespace values. \n Valid format is integer(s) separated by commas "
sys . exit ( 1 )
else :
ns = re . sub ( ' ' , ' ' , args . exnamespaces )
if ns . lower ( ) == ' all ' :
print ' You cannot exclude all namespaces. '
sys . exit ( 1 )
else :
exnamespaces = [ int ( i ) for i in ns . split ( ' , ' ) ]
# --curonly requires --xml
if args . curonly and not args . xml :
print " --curonly requires --xml \n "
2014-07-13 10:32:49 +00:00
parser . print_help ( )
2014-06-30 23:41:03 +00:00
sys . exit ( 1 )
2014-07-03 18:23:21 +00:00
2011-04-07 13:14:37 +00:00
config = {
2014-06-30 23:41:03 +00:00
' curonly ' : args . curonly ,
2011-04-07 20:24:30 +00:00
' date ' : datetime . datetime . now ( ) . strftime ( ' % Y % m %d ' ) ,
2014-07-13 10:32:49 +00:00
' api ' : api ,
2014-06-30 23:41:03 +00:00
' index ' : index ,
' images ' : args . images ,
2011-04-07 13:14:37 +00:00
' logs ' : False ,
2016-07-31 15:37:31 +00:00
' xml ' : args . xml ,
2018-05-07 19:05:26 +00:00
' xmlrevisions ' : args . xmlrevisions ,
2014-06-30 23:41:03 +00:00
' namespaces ' : namespaces ,
' exnamespaces ' : exnamespaces ,
2015-04-18 21:28:57 +00:00
' path ' : args . path and os . path . normpath ( args . path ) or ' ' ,
2014-06-30 23:41:03 +00:00
' cookies ' : args . cookies or ' ' ,
2015-08-05 20:24:59 +00:00
' delay ' : args . delay ,
2015-08-07 20:33:39 +00:00
' retries ' : int ( args . retries ) ,
2011-04-07 13:14:37 +00:00
}
2015-08-05 20:24:59 +00:00
2011-04-07 15:43:17 +00:00
other = {
2014-06-30 23:41:03 +00:00
' resume ' : args . resume ,
2014-07-03 18:23:21 +00:00
' filenamelimit ' : 100 , # do not change
2014-06-30 23:41:03 +00:00
' force ' : args . force ,
2014-07-01 00:14:44 +00:00
' session ' : session
2011-04-07 15:43:17 +00:00
}
2014-07-03 18:23:21 +00:00
# calculating path, if not defined by user with --path=
2011-04-11 07:58:09 +00:00
if not config [ ' path ' ] :
2014-07-01 00:14:44 +00:00
config [ ' path ' ] = ' ./ %s - %s -wikidump ' % ( domain2prefix ( config = config , session = session ) , config [ ' date ' ] )
2011-04-07 15:43:17 +00:00
2014-06-30 23:41:03 +00:00
return config , other
2014-07-03 18:23:21 +00:00
2014-07-13 10:32:49 +00:00
def checkAPI ( api = None , session = None ) :
2013-03-27 21:26:20 +00:00
""" Checking API availability """
2014-07-01 00:14:44 +00:00
global cj
2015-03-02 03:13:03 +00:00
# handle redirects
for i in range ( 4 ) :
print ' Checking API... ' , api
r = session . post (
url = api ,
data = {
' action ' : ' query ' ,
' meta ' : ' siteinfo ' ,
2018-05-07 19:01:50 +00:00
' format ' : ' json ' } ,
timeout = 30
2015-03-08 15:01:46 +00:00
)
2015-03-02 03:13:03 +00:00
if r . url == api :
break
2015-03-08 15:01:46 +00:00
else :
api = r . url
2015-03-24 01:58:01 +00:00
if " MediaWiki API is not enabled for this site. " in r . text :
2014-06-27 19:19:54 +00:00
return False
2014-08-23 16:08:14 +00:00
try :
2015-03-24 01:58:01 +00:00
result = getJSON ( r )
2015-03-08 15:01:46 +00:00
index = None
2018-05-07 19:05:26 +00:00
if result :
2015-03-08 15:01:46 +00:00
try :
index = result [ ' query ' ] [ ' general ' ] [ ' server ' ] + \
result [ ' query ' ] [ ' general ' ] [ ' script ' ]
return ( True , index , api )
2015-03-29 03:14:43 +00:00
except KeyError :
2015-03-08 15:01:46 +00:00
print " MediaWiki API seems to work but returned no index URL "
return ( True , None , api )
2014-08-23 16:08:14 +00:00
except ValueError :
2015-03-24 01:58:01 +00:00
print repr ( r . text )
2015-03-08 15:01:46 +00:00
print " MediaWiki API returned data we could not parse "
2014-09-27 22:12:04 +00:00
return False
2012-03-05 11:08:03 +00:00
return False
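# Usage sketch (the URL is hypothetical): checkAPI returns False on failure or a
# (True, index, api) tuple on success:
# check = checkAPI(api='http://wiki.example.org/w/api.php', session=session)
# if check:
#     index, api = check[1], check[2]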
2014-07-03 18:23:21 +00:00
2014-07-13 10:32:49 +00:00
def checkIndex ( index = None , cookies = None , session = None ) :
2013-03-27 21:26:20 +00:00
""" Checking index.php availability """
2018-05-07 19:01:50 +00:00
r = session . post ( url = index , data = { ' title ' : ' Special:Version ' } , timeout = 30 )
2014-07-01 00:14:44 +00:00
raw = r . text
2014-07-13 10:32:49 +00:00
print ' Checking index.php... ' , index
2014-07-03 18:23:21 +00:00
# Workaround for issue 71
2014-09-27 22:12:04 +00:00
if re . search (
r ' (Special:Badtitle</a>|class= " permissions-errors " | " wgCanonicalSpecialPageName " : " Badtitle " |Login Required</h1>) ' ,
raw ) and not cookies :
2014-07-03 18:23:21 +00:00
print " ERROR: This wiki requires login and we are not authenticated "
return False
2015-03-02 04:13:44 +00:00
if re . search (
r ' (page-Index_php| " wgPageName " : " Index.php " | " firstHeading " ><span dir= " auto " >Index.php</span>) ' ,
raw ) :
print " Looks like the page called Index.php, not index.php itself "
return False
2014-09-27 22:12:04 +00:00
if re . search (
r ' (This wiki is powered by|<h2 id= " mw-version-license " >|meta name= " generator " content= " MediaWiki) ' ,
raw ) :
2012-03-05 11:08:03 +00:00
return True
return False
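# Usage sketch (hypothetical URL):
# if checkIndex(index='http://wiki.example.org/w/index.php', cookies=None, session=session):
#     print 'index.php is OK'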
2014-07-03 18:23:21 +00:00
2011-04-09 17:16:42 +00:00
def removeIP ( raw = ' ' ) :
2011-07-14 19:58:14 +00:00
""" Remove IP from HTML comments <!-- --> """
2014-07-03 18:23:21 +00:00
2011-04-09 17:16:42 +00:00
raw = re . sub ( r ' \ d+ \ . \ d+ \ . \ d+ \ . \ d+ ' , ' 0.0.0.0 ' , raw )
2014-07-03 18:23:21 +00:00
# http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
# weird cases as :: are not included
raw = re . sub (
2014-09-27 22:12:04 +00:00
r ' (?i)[ \ da-f] { 0,4}:[ \ da-f] { 0,4}:[ \ da-f] { 0,4}:[ \ da-f] { 0,4}:[ \ da-f] { 0,4}:[ \ da-f] { 0,4}:[ \ da-f] { 0,4}:[ \ da-f] { 0,4} ' ,
' 0:0:0:0:0:0:0:0 ' ,
raw )
2014-07-03 18:23:21 +00:00
2011-04-09 17:16:42 +00:00
return raw
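# Worked example (made-up addresses):
# removeIP(raw='edited from 203.0.113.42 and 2001:db8:0:0:0:0:0:1')
# returns 'edited from 0.0.0.0 and 0:0:0:0:0:0:0:0'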
2014-07-03 18:23:21 +00:00
2015-03-24 01:58:01 +00:00
def getJSON ( request ) :
""" Strip the Unicode BOM, if any, and return the response parsed as JSON """
if request . text . startswith ( u ' \ufeff ' ) :
request . encoding = ' utf-8-sig '
return request . json ( )


def fixBOM ( request ) :
""" Strip the Unicode BOM, if any, and return the response text """
if request . text . startswith ( u ' \ufeff ' ) :
request . encoding = ' utf-8-sig '
return request . text
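# Both helpers work around wikis that prepend a UTF-8 BOM to their API output;
# the same idea on a plain byte string:
# '\xef\xbb\xbf{"a": 1}'.decode('utf-8-sig') == u'{"a": 1}'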
2014-07-03 18:23:21 +00:00
def checkXMLIntegrity ( config = { } , titles = [ ] , session = None ) :
2012-08-06 14:48:30 +00:00
""" Check XML dump integrity, to detect broken XML chunks """
2014-07-03 18:23:21 +00:00
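# NOTE: the integrity check below is disabled by this early return; the rest of
# the function is kept for reference only.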
return
2013-11-16 13:39:49 +00:00
print ' Verifying dump... '
checktitles = 0
checkpageopen = 0
checkpageclose = 0
checkrevisionopen = 0
checkrevisionclose = 0
2014-09-27 22:12:04 +00:00
for line in file (
' %s / %s - %s - %s .xml ' %
( config [ ' path ' ] ,
domain2prefix (
config = config ,
session = session ) ,
config [ ' date ' ] ,
config [ ' curonly ' ] and ' current ' or ' history ' ) ,
' r ' ) . read ( ) . splitlines ( ) :
2013-11-17 11:18:42 +00:00
if " <revision> " in line :
checkrevisionopen + = 1
elif " </revision> " in line :
checkrevisionclose + = 1
2013-11-16 13:39:49 +00:00
elif " <page> " in line :
checkpageopen + = 1
elif " </page> " in line :
checkpageclose + = 1
2013-11-17 11:18:42 +00:00
elif " <title> " in line :
checktitles + = 1
2013-11-16 13:39:49 +00:00
else :
continue
2013-11-16 14:01:24 +00:00
if ( checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose ) :
2013-11-16 13:39:49 +00:00
pass
2012-06-22 11:34:27 +00:00
else :
2013-11-16 13:39:49 +00:00
print ' XML dump seems to be corrupted. '
reply = ' '
while reply . lower ( ) not in [ ' yes ' , ' y ' , ' no ' , ' n ' ] :
reply = raw_input ( ' Regenerate a new dump ([yes, y], [no, n])? ' )
if reply . lower ( ) in [ ' yes ' , ' y ' ] :
2014-07-03 18:23:21 +00:00
generateXMLDump ( config = config , titles = titles , session = session )
2013-11-16 13:39:49 +00:00
elif reply . lower ( ) in [ ' no ' , ' n ' ] :
print ' Not generating a new dump. '
2014-07-03 18:23:21 +00:00
2012-06-22 11:34:27 +00:00
2013-03-27 21:26:20 +00:00
def createNewDump ( config = { } , other = { } ) :
2013-03-27 21:19:46 +00:00
images = [ ]
print ' Trying to generate a new dump into a new directory... '
if config [ ' xml ' ] :
2016-07-31 15:37:31 +00:00
getPageTitles ( config = config , session = other [ ' session ' ] )
2015-03-09 00:34:07 +00:00
titles = readTitles ( config )
2014-07-01 00:14:44 +00:00
generateXMLDump ( config = config , titles = titles , session = other [ ' session ' ] )
2014-09-27 22:12:04 +00:00
checkXMLIntegrity (
config = config ,
titles = titles ,
session = other [ ' session ' ] )
2013-03-27 21:19:46 +00:00
if config [ ' images ' ] :
2014-07-04 18:40:14 +00:00
images + = getImageNames ( config = config , session = other [ ' session ' ] )
saveImageNames ( config = config , images = images , session = other [ ' session ' ] )
2014-09-27 22:12:04 +00:00
generateImageDump (
config = config ,
other = other ,
images = images ,
session = other [ ' session ' ] )
2013-03-27 21:19:46 +00:00
if config [ ' logs ' ] :
2014-07-03 18:23:21 +00:00
saveLogs ( config = config , session = other [ ' session ' ] )
2016-07-30 14:32:25 +00:00
2016-07-31 15:37:31 +00:00
def resumePreviousDump ( config = { } , other = { } ) :
2013-03-27 21:19:46 +00:00
images = [ ]
print ' Resuming previous dump process... '
if config [ ' xml ' ] :
2015-03-09 00:34:07 +00:00
titles = readTitles ( config )
2013-03-27 21:19:46 +00:00
lasttitle = ' '
try :
2015-03-09 00:34:07 +00:00
lasttitles = reverse_readline ( ' %s / %s - %s -titles.txt ' %
( config [ ' path ' ] ,
domain2prefix ( config = config , session = other [ ' session ' ] ) ,
config [ ' date ' ] )
)
lasttitle = lasttitles . next ( )
if lasttitle == ' ' :
lasttitle = lasttitles . next ( )
2013-03-27 21:19:46 +00:00
except :
2015-03-09 00:34:07 +00:00
pass # probably the file does not exist
2013-03-27 21:19:46 +00:00
if lasttitle == ' --END-- ' :
2014-07-03 18:23:21 +00:00
# titles list is complete
2013-03-27 21:19:46 +00:00
print ' Title list was completed in the previous session '
else :
print ' Title list is incomplete. Reloading... '
2014-07-03 18:23:21 +00:00
# do not resume; reload instead, to avoid inconsistencies such as deleted
# pages
2016-07-31 15:37:31 +00:00
getPageTitles ( config = config , session = other [ ' session ' ] )
2015-03-09 00:34:07 +00:00
2014-07-03 18:23:21 +00:00
# checking xml dump
2013-03-27 21:19:46 +00:00
xmliscomplete = False
2015-03-09 00:34:07 +00:00
lastxmltitle = None
2013-03-27 21:19:46 +00:00
try :
2015-03-09 00:34:07 +00:00
f = reverse_readline (
2014-09-27 22:12:04 +00:00
' %s / %s - %s - %s .xml ' %
( config [ ' path ' ] ,
domain2prefix (
config = config ,
session = other [ ' session ' ] ) ,
config [ ' date ' ] ,
config [ ' curonly ' ] and ' current ' or ' history ' ) ,
2015-03-09 00:34:07 +00:00
)
2013-03-27 21:19:46 +00:00
for l in f :
2015-03-09 00:34:07 +00:00
if l == ' </mediawiki> ' :
2014-07-03 18:23:21 +00:00
# xml dump is complete
2013-03-27 21:19:46 +00:00
xmliscomplete = True
break
2015-03-09 00:34:07 +00:00
xmltitle = re . search ( r ' <title>([^<]+)</title> ' , l )
if xmltitle :
lastxmltitle = undoHTMLEntities ( text = xmltitle . group ( 1 ) )
break
2013-03-27 21:19:46 +00:00
except :
2015-03-09 00:34:07 +00:00
pass # probably the file does not exist
2013-03-27 21:19:46 +00:00
if xmliscomplete :
print ' XML dump was completed in the previous session '
elif lastxmltitle :
2014-07-03 18:23:21 +00:00
# resuming...
2013-03-27 21:19:46 +00:00
print ' Resuming XML dump from " %s " ' % ( lastxmltitle )
2015-03-09 00:34:07 +00:00
titles = readTitles ( config , start = lastxmltitle )
2014-07-03 18:23:21 +00:00
generateXMLDump (
2014-09-27 22:12:04 +00:00
config = config ,
titles = titles ,
start = lastxmltitle ,
session = other [ ' session ' ] )
2013-03-27 21:19:46 +00:00
else :
2014-07-03 18:23:21 +00:00
# corrupt? only has XML header?
2013-03-27 21:19:46 +00:00
print ' XML dump seems to be corrupt. Regenerating... '
2015-03-09 00:34:07 +00:00
titles = readTitles ( config )
2014-07-03 18:23:21 +00:00
generateXMLDump (
config = config , titles = titles , session = other [ ' session ' ] )
2013-03-27 21:19:46 +00:00
if config [ ' images ' ] :
2014-07-03 18:23:21 +00:00
# load images
2013-03-27 21:19:46 +00:00
lastimage = ' '
try :
2014-09-27 22:12:04 +00:00
f = open (
' %s / %s - %s -images.txt ' %
( config [ ' path ' ] ,
domain2prefix (
config = config ) ,
config [ ' date ' ] ) ,
' r ' )
2014-06-30 18:03:32 +00:00
raw = unicode ( f . read ( ) , ' utf-8 ' ) . strip ( )
2013-03-27 21:19:46 +00:00
lines = raw . split ( ' \n ' )
for l in lines :
if re . search ( r ' \ t ' , l ) :
images . append ( l . split ( ' \t ' ) )
lastimage = lines [ - 1 ]
f . close ( )
except :
2014-07-03 18:23:21 +00:00
pass # probably the file does not exist
2014-06-30 18:03:32 +00:00
if lastimage == u ' --END-- ' :
2013-03-27 21:19:46 +00:00
print ' Image list was completed in the previous session '
else :
print ' Image list is incomplete. Reloading... '
2014-07-03 18:23:21 +00:00
# do not resume; reload instead, to avoid inconsistencies such as deleted
# images
2014-07-04 18:40:14 +00:00
images = getImageNames ( config = config , session = other [ ' session ' ] )
saveImageNames ( config = config , images = images )
2014-07-03 18:23:21 +00:00
# checking images directory
2013-03-27 21:19:46 +00:00
listdir = [ ]
try :
2015-03-01 16:14:01 +00:00
listdir = [ n . decode ( ' utf-8 ' ) for n in os . listdir ( ' %s /images ' % ( config [ ' path ' ] ) ) ]
2013-03-27 21:19:46 +00:00
except :
2014-07-03 18:23:21 +00:00
pass # probably directory does not exist
2013-03-27 21:19:46 +00:00
listdir . sort ( )
complete = True
lastfilename = ' '
lastfilename2 = ' '
c = 0
for filename , url , uploader in images :
lastfilename2 = lastfilename
2014-07-03 18:23:21 +00:00
# always keep the complete filename, not the truncated one
lastfilename = filename
2013-03-27 21:19:46 +00:00
filename2 = filename
if len ( filename2 ) > other [ ' filenamelimit ' ] :
filename2 = truncateFilename ( other = other , filename = filename2 )
if filename2 not in listdir :
complete = False
break
2014-07-03 18:23:21 +00:00
c + = 1
2013-03-27 21:19:46 +00:00
print ' %d images were found in the directory from a previous session ' % ( c )
if complete :
2014-07-03 18:23:21 +00:00
# image dump is complete
2013-03-27 21:19:46 +00:00
print ' Image dump was completed in the previous session '
else :
2014-07-03 18:23:21 +00:00
# we resume from previous image, which may be corrupted (or missing
# .desc) by the previous session ctrl-c or abort
generateImageDump (
2014-09-27 22:12:04 +00:00
config = config ,
other = other ,
images = images ,
start = lastfilename2 ,
session = other [ ' session ' ] )
2014-07-03 18:23:21 +00:00
2013-03-27 21:19:46 +00:00
if config [ ' logs ' ] :
2014-07-03 18:23:21 +00:00
# fix
2013-03-27 21:19:46 +00:00
pass
2014-07-03 18:23:21 +00:00
2014-07-01 00:14:44 +00:00
def saveSpecialVersion ( config = { } , session = None ) :
2014-06-26 08:26:57 +00:00
""" Save Special:Version as .html, to preserve extensions details """
2014-07-03 18:23:21 +00:00
2013-03-27 21:19:46 +00:00
if os . path . exists ( ' %s /Special:Version.html ' % ( config [ ' path ' ] ) ) :
print ' Special:Version.html already exists, not overwriting '
else :
print ' Downloading Special:Version with extensions and other related info '
2014-07-03 18:23:21 +00:00
r = session . post (
2018-05-07 19:01:50 +00:00
url = config [ ' index ' ] , data = { ' title ' : ' Special:Version ' } , timeout = 10 )
2014-07-01 00:14:44 +00:00
raw = r . text
delay ( config = config , session = session )
2013-03-27 21:19:46 +00:00
raw = removeIP ( raw = raw )
2014-07-03 17:24:28 +00:00
with open ( ' %s /Special:Version.html ' % ( config [ ' path ' ] ) , ' w ' ) as outfile :
outfile . write ( raw . encode ( ' utf-8 ' ) )
2013-03-27 21:19:46 +00:00
2014-07-03 18:23:21 +00:00
2014-07-01 00:14:44 +00:00
def saveIndexPHP ( config = { } , session = None ) :
2014-06-26 08:26:57 +00:00
""" Save index.php as .html, to preserve license details available at the botom of the page """
2014-07-03 18:23:21 +00:00
2013-03-27 21:19:46 +00:00
if os . path . exists ( ' %s /index.html ' % ( config [ ' path ' ] ) ) :
print ' index.html already exists, not overwriting '
else :
print ' Downloading index.php (Main Page) as index.html '
2018-05-07 19:01:50 +00:00
r = session . post ( url = config [ ' index ' ] , data = { } , timeout = 10 )
2014-07-01 00:14:44 +00:00
raw = r . text
delay ( config = config , session = session )
2013-03-27 21:19:46 +00:00
raw = removeIP ( raw = raw )
2014-07-03 17:24:28 +00:00
with open ( ' %s /index.html ' % ( config [ ' path ' ] ) , ' w ' ) as outfile :
outfile . write ( raw . encode ( ' utf-8 ' ) )
2013-03-27 21:19:46 +00:00
2014-07-03 18:23:21 +00:00
2014-07-01 00:14:44 +00:00
def saveSiteInfo ( config = { } , session = None ) :
2014-06-26 08:38:59 +00:00
""" Save a file with site info """
2014-07-03 18:23:21 +00:00
2014-06-26 08:39:45 +00:00
if config [ ' api ' ] :
if os . path . exists ( ' %s /siteinfo.json ' % ( config [ ' path ' ] ) ) :
print ' siteinfo.json already exists, not overwriting '
else :
2014-06-29 11:26:06 +00:00
print ' Downloading site info as siteinfo.json '
2014-09-27 22:12:04 +00:00
2014-09-14 15:10:43 +00:00
# MediaWiki 1.13+
2014-09-27 22:12:04 +00:00
r = session . post (
url = config [ ' api ' ] ,
data = {
' action ' : ' query ' ,
' meta ' : ' siteinfo ' ,
' siprop ' : ' general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo ' ,
' sinumberingroup ' : 1 ,
2018-05-07 19:01:50 +00:00
' format ' : ' json ' } ,
timeout = 10 )
2014-09-14 15:10:43 +00:00
# MediaWiki 1.11-1.12
2015-03-24 01:58:01 +00:00
if not ' query ' in getJSON ( r ) :
2014-09-27 22:12:04 +00:00
r = session . post (
url = config [ ' api ' ] ,
data = {
' action ' : ' query ' ,
' meta ' : ' siteinfo ' ,
' siprop ' : ' general|namespaces|statistics|dbrepllag|interwikimap ' ,
2018-05-07 19:01:50 +00:00
' format ' : ' json ' } ,
timeout = 10 )
2014-09-14 15:10:43 +00:00
# MediaWiki 1.8-1.10
2015-03-24 01:58:01 +00:00
if not ' query ' in getJSON ( r ) :
2014-09-27 22:12:04 +00:00
r = session . post (
url = config [ ' api ' ] ,
data = {
' action ' : ' query ' ,
' meta ' : ' siteinfo ' ,
' siprop ' : ' general|namespaces ' ,
2018-05-07 19:05:26 +00:00
' format ' : ' json ' } ,
2018-05-07 19:01:50 +00:00
timeout = 10 )
2015-03-24 01:58:01 +00:00
result = getJSON ( r )
2014-07-01 00:14:44 +00:00
delay ( config = config , session = session )
2014-07-03 17:24:28 +00:00
with open ( ' %s /siteinfo.json ' % ( config [ ' path ' ] ) , ' w ' ) as outfile :
outfile . write ( json . dumps ( result , indent = 4 , sort_keys = True ) )
2014-06-26 08:38:59 +00:00
2014-07-03 18:23:21 +00:00
2014-06-30 23:41:03 +00:00
def avoidWikimediaProjects ( config = { } , other = { } ) :
2013-03-27 21:26:20 +00:00
""" Skip Wikimedia projects and redirect to the dumps website """
2014-07-03 18:23:21 +00:00
# notice about wikipedia dumps
2014-09-27 22:12:04 +00:00
if re . findall (
r ' (?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage) \ .org ' ,
config [ ' api ' ] +
config [ ' index ' ] ) :
2013-03-27 20:57:30 +00:00
print ' PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS! '
print ' Download the dumps from http://dumps.wikimedia.org '
2011-07-14 19:54:14 +00:00
if not other [ ' force ' ] :
2011-07-14 20:17:22 +00:00
print ' Thanks! '
2011-07-14 19:54:14 +00:00
sys . exit ( )
2013-03-27 21:19:46 +00:00
2014-07-03 18:23:21 +00:00
2014-07-03 16:17:02 +00:00
def getWikiEngine ( url = ' ' ) :
""" Returns the wiki engine of a URL, if known """
2016-07-31 15:37:31 +00:00
session = requests . Session ( )
session . headers . update ( { ' User-Agent ' : getUserAgent ( ) } )
2018-05-07 19:01:50 +00:00
r = session . post ( url = url , timeout = 30 )
2016-07-31 15:37:31 +00:00
if r . status_code == 405 or r . text == ' ' :
2018-05-07 19:01:50 +00:00
r = session . get ( url = url , timeout = 120 )
2016-07-31 15:37:31 +00:00
result = r . text
wikiengine = ' Unknown '
2014-09-27 22:12:04 +00:00
if re . search (
ur ' (?im)(<meta name= " generator " content= " DokuWiki)|dokuwiki__site ' ,
result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' DokuWiki '
2014-07-03 17:33:09 +00:00
elif re . search ( ur ' (?im)(alt= " Powered by MediaWiki " |<meta name= " generator " content= " MediaWiki) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' MediaWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(>MoinMoin Powered</a>|<option value= " LocalSiteMap " >) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' MoinMoin '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(twikiCurrentTopicLink|twikiCurrentWebHomeLink|twikiLink) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' TWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<!--PageHeaderFmt-->) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' PmWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<meta name= " generator " content= " PhpWiki|<meta name= " PHPWIKI_VERSION) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' PhpWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<meta name= " generator " content= " Tiki Wiki|Powered by <a href= " http://(www \ .)?tiki \ .org " | id= " tiki-(top|main) " ) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' TikiWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(foswikiNoJs|<meta name= " foswiki \ .|foswikiTable|foswikiContentFooter) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' FosWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<meta http-equiv= " powered by " content= " MojoMojo) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' MojoMojo '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(id= " xwiki(content|nav_footer|platformversion|docinfo|maincontainer|data)|/resources/js/xwiki/xwiki|XWiki \ .webapppath) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' XWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<meta id= " confluence-(base-url|context-path) " ) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Confluence '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<meta name= " generator " content= " Banana Dance) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Banana Dance '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(Wheeled by <a class= " external-link " href= " http://www \ .wagn \ .org " >|<body id= " wagn " >) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Wagn '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<meta name= " generator " content= " MindTouch) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' MindTouch ' # formerly DekiWiki
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<div class= " wikiversion " > \ s*(<p>)?JSPWiki|xmlns:jspwiki= " http://www \ .jspwiki \ .org " ) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' JSPWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(Powered by:? \ s*(<br ?/>)? \ s*<a href= " http://kwiki \ .org " >| \b KwikiNavigation \b ) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Kwiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(Powered by <a href= " http://www \ .anwiki \ .com " ) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Anwiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<meta name= " generator " content= " Aneuch|is powered by <em>Aneuch</em>|<!-- start of Aneuch markup -->) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Aneuch '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<meta name= " generator " content= " bitweaver) ' , result ) :
wikiengine = ' bitweaver '
elif re . search ( ur ' (?im)(powered by <a href= " [^ " ]* \b zwiki.org(/[^ " ]*)? " >) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Zwiki '
2014-09-23 19:04:36 +00:00
# WakkaWiki forks
elif re . search ( ur ' (?im)(<meta name= " generator " content= " WikkaWiki|<a class= " ext " href= " (http://wikka \ .jsnx \ .com/|http://wikkawiki \ .org/) " >) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' WikkaWiki ' # formerly WikkaWakkaWiki
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<meta name= " generator " content= " CoMa Wiki) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' CoMaWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(Fonctionne avec <a href= " http://www \ .wikini \ .net) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' WikiNi '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(Powered by <a href= " [^ " ]*CitiWiki " >CitiWiki</a>) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' CitiWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(Powered by <a href= " http://wackowiki \ .com/|title= " WackoWiki " ) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' WackoWiki '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(Powered by <a href= " http://www \ .wakkawiki \ .com) ' , result ) :
2014-09-27 22:12:04 +00:00
# This may not work for heavily modded/themed installations, e.g.
# http://operawiki.info/
2016-07-31 15:37:31 +00:00
wikiengine = ' WakkaWiki '
2014-09-23 19:04:36 +00:00
# Custom wikis used by wiki farms
elif re . search ( ur ' (?im)(var wikispaces_page|<div class= " WikispacesContent) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Wikispaces '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(Powered by <a href= " http://www \ .wikidot \ .com " >|wikidot-privacy-button-hovertip|javascript:WIKIDOT \ .page) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Wikidot '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(IS_WETPAINT_USER|wetpaintLoad|WPC-bodyContentContainer) ' , result ) :
2016-07-31 15:37:31 +00:00
wikiengine = ' Wetpaint '
2014-09-23 19:04:36 +00:00
elif re . search ( ur ' (?im)(<div id= " footer-pbwiki " >|ws-nav-search|PBinfo *= * { ) ' , result ) :
# formerly PBwiki
2016-07-31 15:37:31 +00:00
wikiengine = ' PBworks '
2014-09-27 22:12:04 +00:00
# if wikiengine == 'Unknown': print result
2014-07-03 18:23:21 +00:00
2016-07-31 15:37:31 +00:00
return wikiengine
2014-07-03 16:17:02 +00:00
2014-07-03 18:23:21 +00:00
2014-07-13 10:32:49 +00:00
def mwGetAPIAndIndex ( url = ' ' ) :
2014-07-11 18:44:25 +00:00
""" Returns the MediaWiki API and Index.php """
2014-09-27 22:12:04 +00:00
2014-07-11 18:44:25 +00:00
api = ' '
index = ' '
session = requests . Session ( )
2014-09-21 00:07:49 +00:00
session . headers . update ( { ' User-Agent ' : getUserAgent ( ) } )
2018-05-07 19:01:50 +00:00
r = session . post ( url = url , timeout = 120 )
2014-07-11 18:44:25 +00:00
result = r . text
2014-09-27 22:12:04 +00:00
2014-07-13 10:32:49 +00:00
# API
2014-09-27 22:12:04 +00:00
m = re . findall (
ur ' (?im)< \ s*link \ s*rel= " EditURI " \ s*type= " application/rsd \ +xml " \ s*href= " ([^>]+?) \ ?action=rsd " \ s*/ \ s*> ' ,
result )
2014-07-11 18:44:25 +00:00
if m :
api = m [ 0 ]
2014-09-27 22:12:04 +00:00
if api . startswith ( ' // ' ) : # gentoo wiki
2014-07-13 10:32:49 +00:00
api = url . split ( ' // ' ) [ 0 ] + api
else :
2014-09-27 22:12:04 +00:00
pass # build API using index and check it
2014-07-13 10:32:49 +00:00
# Index.php
2014-09-27 22:12:04 +00:00
m = re . findall (
ur ' <li id= " ca-viewsource " [^>]*?> \ s*(?:<span>)? \ s*<a href= " ([^ \ ?]+?) \ ? ' ,
result )
2014-07-11 18:44:25 +00:00
if m :
index = m [ 0 ]
else :
2014-09-27 22:12:04 +00:00
m = re . findall (
ur ' <li id= " ca-history " [^>]*?> \ s*(?:<span>)? \ s*<a href= " ([^ \ ?]+?) \ ? ' ,
result )
2014-07-11 18:44:25 +00:00
if m :
index = m [ 0 ]
2014-07-13 10:52:48 +00:00
if index :
if index . startswith ( ' / ' ) :
index = ' / ' . join ( api . split ( ' / ' ) [ : - 1 ] ) + ' / ' + index . split ( ' / ' ) [ - 1 ]
else :
if api :
2014-09-27 22:12:04 +00:00
if len (
re . findall (
ur ' /index \ .php5 \ ? ' ,
result ) ) > len (
re . findall (
ur ' /index \ .php \ ? ' ,
result ) ) :
2014-07-13 10:52:48 +00:00
index = ' / ' . join ( api . split ( ' / ' ) [ : - 1 ] ) + ' /index.php5 '
else :
index = ' / ' . join ( api . split ( ' / ' ) [ : - 1 ] ) + ' /index.php '
2014-09-27 22:12:04 +00:00
2014-07-11 18:44:25 +00:00
return api , index
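# A sketch of the HTML fragments the regexes above look for (the URLs are hypothetical):
# <link rel="EditURI" type="application/rsd+xml" href="http://wiki.example.org/w/api.php?action=rsd"/>
# <li id="ca-history"><span><a href="http://wiki.example.org/w/index.php?title=Main_Page&action=history">History</a></span></li>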
2014-09-27 22:12:04 +00:00
2014-07-11 18:44:25 +00:00
2013-03-27 21:19:46 +00:00
def main ( params = [ ] ) :
""" Main function """
configfilename = ' config.txt '
config , other = getParameters ( params = params )
2014-06-30 23:41:03 +00:00
avoidWikimediaProjects ( config = config , other = other )
2014-07-03 18:23:21 +00:00
2014-07-03 16:17:02 +00:00
print welcome ( )
2016-07-31 15:37:31 +00:00
print ' Analysing %s ' % ( config [ ' api ' ] and config [ ' api ' ] or config [ ' index ' ] )
2014-07-03 18:23:21 +00:00
# creating path or resuming if desired
2011-04-07 13:14:37 +00:00
c = 2
2014-07-03 18:23:21 +00:00
# keep the original path to avoid concatenating suffixes like -2, -2-3, and so on
originalpath = config [ ' path ' ]
# do not enter if resume is requested from the beginning
while not other [ ' resume ' ] and os . path . isdir ( config [ ' path ' ] ) :
2011-04-07 15:43:17 +00:00
print ' \n Warning!: " %s " path exists ' % ( config [ ' path ' ] )
2011-04-11 07:58:09 +00:00
reply = ' '
2011-07-14 20:01:34 +00:00
while reply . lower ( ) not in [ ' yes ' , ' y ' , ' no ' , ' n ' ] :
2014-09-27 22:12:04 +00:00
reply = raw_input (
' There is a dump in " %s " , probably incomplete. \n If you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored \n and the parameters available in " %s / %s " will be loaded. \n Do you want to resume ([yes, y], [no, n])? ' %
( config [ ' path ' ] ,
config [ ' path ' ] ,
configfilename ) )
2011-04-07 13:14:37 +00:00
if reply . lower ( ) in [ ' yes ' , ' y ' ] :
2011-04-07 20:24:30 +00:00
if not os . path . isfile ( ' %s / %s ' % ( config [ ' path ' ] , configfilename ) ) :
print ' No config file found. I can \' t resume. Aborting. '
2011-04-07 13:14:37 +00:00
sys . exit ( )
2013-03-27 20:58:55 +00:00
print ' You have selected: YES '
2011-04-07 15:43:17 +00:00
other [ ' resume ' ] = True
2011-04-07 13:14:37 +00:00
break
2011-04-11 07:58:09 +00:00
elif reply . lower ( ) in [ ' no ' , ' n ' ] :
2013-03-27 20:58:55 +00:00
print ' You have selected: NO '
2011-04-11 07:58:09 +00:00
other [ ' resume ' ] = False
2011-04-07 16:10:12 +00:00
config [ ' path ' ] = ' %s - %d ' % ( originalpath , c )
2013-03-27 20:57:30 +00:00
print ' Trying to use path " %s " ... ' % ( config [ ' path ' ] )
2011-04-07 13:14:37 +00:00
c + = 1
2011-04-07 15:43:17 +00:00
2011-04-07 20:24:30 +00:00
if other [ ' resume ' ] :
print ' Loading config file... '
config = loadConfig ( config = config , configfilename = configfilename )
else :
2011-04-07 15:43:17 +00:00
os . mkdir ( config [ ' path ' ] )
saveConfig ( config = config , configfilename = configfilename )
2014-07-03 18:23:21 +00:00
2011-04-07 15:43:17 +00:00
if other [ ' resume ' ] :
2013-03-27 21:26:20 +00:00
resumePreviousDump ( config = config , other = other )
2011-04-07 13:14:37 +00:00
else :
2013-03-27 21:26:20 +00:00
createNewDump ( config = config , other = other )
2013-03-27 21:19:46 +00:00
2014-07-03 18:23:21 +00:00
saveIndexPHP ( config = config , session = other [ ' session ' ] )
saveSpecialVersion ( config = config , session = other [ ' session ' ] )
saveSiteInfo ( config = config , session = other [ ' session ' ] )
2011-04-09 17:45:56 +00:00
bye ( )
2011-04-07 15:43:17 +00:00
if __name__ == " __main__ " :
2012-06-22 11:34:27 +00:00
main ( )