#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# dumpgenerator.py A generator of dumps for wikis
# Copyright (C) 2011-2014 WikiTeam developers

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki

import cookielib
import cPickle
import datetime
import sys
try:
    import argparse
except ImportError:
    print "Please install the argparse module."
    sys.exit(1)
import json
try:
    from hashlib import md5
except ImportError:  # Python 2.4 compatibility
    from md5 import new as md5
import os
import re
try:
    import requests
except ImportError:
    print "Please install or update the Requests module."
    sys.exit(1)
import time
import urllib

__VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org


def getVersion():
    return __VERSION__


def truncateFilename(other={}, filename=''):
    """ Truncate filenames when downloading images with large filenames """
    return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1]
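
# Illustrative example (not executed): with other={'filenamelimit': 100} and a
# 150-character name like u'Very_long_...name.png', truncateFilename() returns
# the first 100 characters of the name, then the md5 hexdigest of the whole
# name, then the original '.png' extension.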


def delay(config={}, session=None):
    """ Add a delay if configured for that """
    if config['delay'] > 0:
        print 'Sleeping... %d seconds...' % (config['delay'])
        time.sleep(config['delay'])


def cleanHTML(raw=''):
    """ Extract only the real wiki content and remove rubbish """
    """ This function is ONLY used to retrieve page titles and file names when no API is available """
    """ DO NOT use this function to extract page content """
    # different "tags" used by different MediaWiki versions to mark where
    # content starts and where it ends
    if re.search('<!-- bodytext -->', raw):
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
        raw = raw.split(
            '<!-- start content -->')[1].split('<!-- end content -->')[0]
    elif re.search('<!-- Begin Content Area -->', raw):
        raw = raw.split(
            '<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
    elif re.search('<!-- content -->', raw):
        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
    elif re.search('<body class=', raw):
        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
    else:
        print raw[:250]
        print 'This wiki doesn\'t use marks to split content'
        sys.exit()
    return raw


def handleStatusCode(response):
    statuscode = response.status_code
    if statuscode >= 200 and statuscode < 300:
        return

    print "HTTP Error %d." % statuscode
    if statuscode >= 300 and statuscode < 400:
        print "Redirect should happen automatically: please report this as a bug."
        print response.url
    elif statuscode == 400:
        print "Bad Request: The wiki may be malfunctioning."
        print "Please try again later."
        print response.url
        sys.exit(1)
    elif statuscode == 401 or statuscode == 403:
        print "Authentication required."
        print "Please use --userpass."
        print response.url
    elif statuscode == 404:
        print "Not found. Is Special:Export enabled for this wiki?"
        print response.url
        sys.exit(1)
    elif statuscode == 429 or (statuscode >= 500 and statuscode < 600):
        print "Server error, max retries exceeded."
        print "Please resume the dump later."
        print response.url
        sys.exit(1)


def getNamespacesScraper(config={}, session=None):
    """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
    """ Function called if no API is available """
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        r = session.post(
            url=config['index'], data={'title': 'Special:Allpages'})
        raw = r.text
        delay(config=config, session=session)

        # [^>]*? to include selected="selected"
        m = re.compile(
            r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group(
                    "namespacename")
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group(
                        "namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames


def getNamespacesAPI(config={}, session=None):
    """ Uses the API to get the list of namespaces names and ids """
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        r = session.post(url=config['api'], data={
            'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'})
        result = json.loads(r.text)
        delay(config=config, session=session)

        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # -1: Special, -2: Media, excluding
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # -1: Special, -2: Media, excluding
                    continue
                if int(i) in namespaces:
                    namespaces2.append(int(i))
                    namespacenames[int(i)] = result['query']['namespaces'][i]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames
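
# Illustrative (assumed, trimmed) shape of the siteinfo response parsed above:
# {"query": {"namespaces": {"-2": {"id": -2, "*": "Media"},
#                           "0":  {"id": 0,  "*": ""},
#                           "1":  {"id": 1,  "*": "Talk"}}}}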


def getPageTitlesAPI(config={}, session=None):
    """ Uses the API to get the list of page titles """
    titles = []
    namespaces, namespacenames = getNamespacesAPI(
        config=config, session=session)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            print 'Skipping namespace = %d' % (namespace)
            continue

        c = 0
        print 'Retrieving titles in the namespace %d' % (namespace)
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace,
                      'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
            r = session.post(url=config['api'], data=params)
            handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = json.loads(r.text)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            # print apfrom
            # print jsontitles
            titles += [page['title']
                       for page in jsontitles['query']['allpages']]
            if len(titles) != len(set(titles)):
                # probably we are in a loop, server returning dupe titles,
                # stop it
                print 'Probably a loop, finishing'
                titles = list(set(titles))
                apfrom = ''
            c += len(jsontitles['query']['allpages'])
            delay(config=config, session=session)
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles
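
# Illustrative (assumed, trimmed) shape of one allpages response handled above:
# {"query-continue": {"allpages": {"apcontinue": "Some_title"}},
#  "query": {"allpages": [{"pageid": 1, "ns": 0, "title": "Main Page"}, ...]}}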


def getPageTitlesScraper(config={}, session=None):
    """ Scrape the list of page titles from Special:Allpages """
    titles = []
    namespaces, namespacenames = getNamespacesScraper(
        config=config, session=session)
    for namespace in namespaces:
        print 'Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (
            config['index'], namespace)
        r = session.get(url=url)
        raw = r.text
        raw = cleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        else:
            pass  # perhaps no subpages

        # 3 is the current depth of Special:Allpages on English Wikipedia
        # (3 levels)
        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            # load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')

                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['index'], namespace, fr, to)  # do not put urllib.quote in fr or to
                # fix: does this regexp not load all of them? or does r_title
                # fail on this kind of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &namespace=\d, sometimes happens
                    fr = fr.split('&namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
                        config['index'], name, namespace)

                if name not in checked_suballpages:
                    # to avoid reloading dupe subpages links
                    checked_suballpages.append(name)
                    delay(config=config, session=session)
                    r2 = session.get(url=url)
                    raw2 = r2.text
                    raw2 = cleanHTML(raw2)
                    rawacum += raw2  # merge it after junk is removed
                    print 'Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'

                delay(config=config, session=session)
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if t not in titles:
                    titles.append(t)
                    c += 1
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles


def getPageTitles(config={}, session=None):
    """ Get list of page titles """
    # http://en.wikipedia.org/wiki/Special:AllPages
    # http://archiveteam.org/index.php?title=Special:AllPages
    # http://www.wikanda.es/wiki/Especial:Todas
    print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None')
    print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None')

    titles = []
    if 'api' in config and config['api']:
        titles = getPageTitlesAPI(config=config, session=session)
    elif 'index' in config and config['index']:
        titles = getPageTitlesScraper(config=config, session=session)
    # removing dupes (e.g. in CZ appears Widget:AddThis two times (main
    # namespace and widget namespace))
    titles = list(set(titles))
    titles.sort()

    print '%d page titles loaded' % (len(titles))
    return titles


def getImageNames(config={}, session=None):
    """ Get list of image names """
    print 'Retrieving image filenames'
    images = []
    if 'api' in config and config['api']:
        images = getImageNamesAPI(config=config, session=session)
    elif 'index' in config and config['index']:
        images = getImageNamesScraper(config=config, session=session)
    # images = list(set(images))  # it is a list of lists
    images.sort()
    print '%d image names loaded' % (len(images))
    return images


def getXMLHeader(config={}, session=None):
    """ Retrieve a random page to extract XML headers (namespace info, etc) """
    # get the header of a random page, to attach it in the complete XML backup
    # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
    # xmlns:x....
    randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
    xml = getXMLPage(
        config=config, title=randomtitle, verbose=False, session=session)
    header = xml.split('</mediawiki>')[0]
    if not xml:
        print 'XML export on this wiki is broken, quitting.'
        sys.exit()
    return header


def getXMLFileDesc(config={}, title='', session=None):
    """ Get XML for image description page """
    config['curonly'] = 1  # tricky to get only the most recent desc
    return getXMLPage(config=config, title=title, verbose=False, session=session)


def getUserAgent():
    """ Return a cool user-agent to hide Python user-agent """
    useragents = [
        # firefox
        'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0',
    ]
    return useragents[0]


def logerror(config={}, text=''):
    """ Log error in file """
    if text:
        with open('%s/errors.log' % (config['path']), 'a') as outfile:
            output = u'%s: %s\n' % (
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
            outfile.write(output.encode('utf-8'))
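
# Illustrative errors.log line written by logerror() (timestamp and message
# are examples only):
# 2014-07-03 18:23:21: Error while retrieving the last revision of "Some page". Skipping.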


def getXMLPageCore(headers={}, params={}, config={}, session=None):
    """ Request a page from Special:Export, retrying on failure """
    # returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki>
    # if retrieving params['limit'] revisions fails, returns a current only version
    # if all fail, returns the empty string
    xml = ''
    c = 0
    maxseconds = 100  # max seconds to wait in a single sleeping
    maxretries = 5  # x retries and skip
    increment = 20  # increment every retry
    while not re.search(r'</mediawiki>', xml):
        if c > 0 and c < maxretries:
            wait = increment * c < maxseconds and increment * \
                c or maxseconds  # incremental until maxseconds
            print 'XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait)
            time.sleep(wait)
            # reducing server load requesting smallest chunks (if curonly then
            # limit = 1 from mother function)
            if params['limit'] > 1:
                params['limit'] = params['limit'] / 2  # half
        if c >= maxretries:
            print 'We have retried %d times' % (c)
            print 'MediaWiki error for "%s", network error or whatever...' % (params['pages'])
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last;
            # params['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # getXMLPageCore
            if not config['curonly']:
                print 'Trying to save only the last revision for this page...'
                params['curonly'] = 1
                logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (
                    params['pages']))
                return getXMLPageCore(headers=headers, params=params, config=config, session=session)
            else:
                print 'Saving in the errors log, and skipping...'
                logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (
                    params['pages']))
                return ''  # empty xml
        # FIXME HANDLE HTTP Errors HERE
        r = session.post(url=config['index'], data=params, headers=headers)
        handleStatusCode(r)
        xml = r.text
        c += 1

    return xml


def getXMLPage(config={}, title='', verbose=True, session=None):
    """ Get the full history (or current only) of a page """
    # if server errors occur while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partially truncated
    # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
    limit = 1000
    truncated = False
    title_ = title
    title_ = re.sub(' ', '_', title_)
    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
    params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
    if config['curonly']:
        params['curonly'] = 1
        params['limit'] = 1
    else:
        params['offset'] = '1'  # 1 always < 2000s
        params['limit'] = limit
    # in other case, do not set params['templates']
    if 'templates' in config and config['templates']:
        params['templates'] = 1

    xml = getXMLPageCore(params=params, config=config, session=session)

    # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
    # else, warning about Special:Export truncating large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
    # search for timestamps in xml to avoid analysing empty pages like
    # Special:Allpages and the random one
    if not config['curonly'] and re.search(r_timestamp, xml):
        while not truncated and params['offset']:  # next chunk
            # get the last timestamp from the acum XML
            params['offset'] = re.findall(r_timestamp, xml)[-1]
            xml2 = getXMLPageCore(
                params=params, config=config, session=session)
            # are there more edits in this next XML chunk or no <page></page>?
            if re.findall(r_timestamp, xml2):
                if re.findall(r_timestamp, xml2)[-1] == params['offset']:
                    # again the same XML, this wiki does not support params in
                    # Special:Export, offer complete XML up to X edits (usually
                    # 1000)
                    print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated'
                    truncated = True
                    break
                else:
                    """    </namespaces>
                    </siteinfo>
                    <page>
                    <title>Main Page</title>
                    <id>15580374</id>
                    <restrictions>edit=sysop:move=sysop</restrictions> (?)
                    <revision>
                    <id>418009832</id>
                    <timestamp>2011-03-09T19:57:06Z</timestamp>
                    <contributor>
                    """
                    # offset is OK in this wiki, merge with the previous chunk
                    # of this page history and continue
                    xml = xml.split(
                        '</page>')[0] + '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
            else:
                params['offset'] = ''  # no more edits in this page history

    if verbose:
        numberofedits = len(re.findall(r_timestamp, xml))
        if (numberofedits == 1):
            print '%s, 1 edit' % (title.encode('utf-8'))
        else:
            print '%s, %d edits' % (title.encode('utf-8'), numberofedits)

    return xml
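
# Illustrative Special:Export request built above for a full-history dump
# (the page name is an example): POST to config['index'] with
# {'title': 'Special:Export', 'pages': 'Main_Page', 'action': 'submit',
#  'offset': '1', 'limit': 1000}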


def cleanXML(xml=''):
    """ Trim redundant info """
    # do not touch XML codification, leave AS IS
    if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml):
        xml = xml.split('</siteinfo>\n')[1]
        xml = xml.split('</mediawiki>')[0]
    return xml


def generateXMLDump(config={}, titles=[], start='', session=None):
    """ Generates a XML dump for a list of titles """

    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
    header = getXMLHeader(config=config, session=session)
    footer = '</mediawiki>\n'  # new line at the end
    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
                                    config['date'], config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True
    if start:
        # remove the last chunk of xml dump (it is probably incomplete)
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
        xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
        prev = ''
        c = 0
        for l in xmlfile:
            # removing <page>\n until end of file
            # lock to avoid writing an empty line at the beginning of file
            if c != 0:
                if not re.search(r'<title>%s</title>' % (start), l):
                    xmlfile2.write(prev)
                else:
                    break
            c += 1
            prev = l
        xmlfile.close()
        xmlfile2.close()
        # subst xml with xml2
        # remove previous xml dump
        os.remove('%s/%s' % (config['path'], xmlfilename))
        # move correctly truncated dump to its real name
        os.rename(
            '%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename))
    else:
        # requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header.encode('utf-8'))
        xmlfile.close()

    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for title in titles:
        if not title.strip():
            continue
        if title == start:  # start downloading from start, included
            lock = False
        if lock:
            continue
        delay(config=config, session=session)
        if c % 10 == 0:
            print 'Downloaded %d pages' % (c)
        xml = getXMLPage(config=config, title=title, session=session)
        xml = cleanXML(xml=xml)
        if not xml:
            logerror(
                config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title))
        # here, XML is a correct <page> </page> chunk or
        # an empty string due to a deleted page (logged in errors log) or
        # an empty string due to an error while retrieving the page from server
        # (logged in errors log)
        xmlfile.write(xml.encode('utf-8'))
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    print 'XML dump saved at...', xmlfilename


def saveTitles(config={}, titles=[]):
    """ Save title list in a file """
    titlesfilename = '%s-%s-titles.txt' % (
        domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
    output = u"%s\n--END--" % ('\n'.join(titles))
    titlesfile.write(output.encode('utf-8'))
    titlesfile.close()

    print 'Titles saved at...', titlesfilename


def saveImageNames(config={}, images=[], session=None):
    """ Save image list in a file, including filename, url and uploader """
    imagesfilename = '%s-%s-images.txt' % (
        domain2prefix(config=config), config['date'])
    imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
    imagesfile.write(('\n'.join(['%s\t%s\t%s' % (
        filename, url, uploader) for filename, url, uploader in images]).encode('utf-8')))
    imagesfile.write('\n--END--')
    imagesfile.close()

    print 'Image filenames and URLs saved at...', imagesfilename


def curateImageURL(config={}, url=''):
    """ Returns an absolute URL for an image, adding the domain if missing """
    if 'index' in config and config['index']:
        # remove from :// (http or https) until the first / after domain
        domainalone = config['index'].split('://')[0] + '://' + config['index'].split('://')[1].split('/')[0]
    elif 'api' in config and config['api']:
        domainalone = config['api'].split('://')[0] + '://' + config['api'].split('://')[1].split('/')[0]
    else:
        print 'ERROR: no index nor API'
        sys.exit()
    if url.startswith('//'):  # Orain wikifarm returns URLs starting with //
        url = u'%s:%s' % (domainalone.split('://')[0], url)
    elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')):  # is it a relative URL?
        if url[0] == '/':  # slash is added later
            url = url[1:]
        url = u'%s/%s' % (domainalone, url)  # concat http(s) + domain + relative url
    url = undoHTMLEntities(text=url)
    # url = urllib.unquote(url)  # do not use unquote with url, it breaks some urls with odd chars
    url = re.sub(' ', '_', url)
    return url
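
# Illustrative behaviour (assuming config['index'] = 'http://wiki.example.org/w/index.php'):
#   curateImageURL(config, url='//wiki.example.org/images/a/ab/Foo.png')
#     -> u'http://wiki.example.org/images/a/ab/Foo.png'
#   curateImageURL(config, url='/images/a/ab/Foo bar.png')
#     -> u'http://wiki.example.org/images/a/ab/Foo_bar.png'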


def getImageNamesScraper(config={}, session=None):
    """ Retrieve file list: filename, url, uploader """
    # (?<! http://docs.python.org/library/re.html
    r_next = r'(?<!&dir=prev)&offset=(?P<offset>\d+)&'
    images = []
    offset = '29990101000000'  # january 1, 2999
    limit = 5000
    retries = 5
    while offset:
        # 5000 overloads some servers, but it is needed for sites like this one
        # with no next links
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        r = session.post(url=config['index'], data={
            'title': 'Special:Imagelist', 'limit': limit, 'offset': offset})
        raw = r.text
        delay(config=config, session=session)
        # delicate wiki
        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
            if limit > 10:
                print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
                limit = limit / 10
                continue
            elif retries > 0:  # waste retries, then exit
                retries -= 1
                print 'Retrying...'
                continue
            else:
                print 'No more retries, exit...'
                break

        raw = cleanHTML(raw)
        # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
        # href="/w/index.php?title=Usuario:Fernandocg&action=edit&redlink=1"
        # class="new" title="Usuario:Fernandocg (página no
        # existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # wikijuegos 1.9.5
        # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
        # mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # gentoowiki 1.18
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
        r_images5 = (r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
                     '<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
                     '<td class="TablePager_col_img_size">[^<]*?</td>\s*'
                     '<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')

        # Select the regexp that returns more results
        regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
        count = 0
        i = 0
        regexp_best = 0
        for regexp in regexps:
            if len(re.findall(regexp, raw)) > count:
                count = len(re.findall(regexp, raw))
                regexp_best = i
            i += 1
        m = re.compile(regexps[regexp_best]).finditer(raw)

        # Iter the image results
        for i in m:
            url = i.group('url')
            url = curateImageURL(config=config, url=url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = undoHTMLEntities(text=filename)
            filename = urllib.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = undoHTMLEntities(text=uploader)
            uploader = urllib.unquote(uploader)
            images.append([filename, url, uploader])
            # print filename, url

        if re.search(r_next, raw):
            offset = re.findall(r_next, raw)[0]
            retries += 5  # add more retries if we got a page with offset
        else:
            offset = ''

    if (len(images) == 1):
        print 'Found 1 image'
    else:
        print 'Found %d images' % (len(images))

    images.sort()
    return images


def getImageNamesAPI(config={}, session=None):
    """ Retrieve file list: filename, url, uploader """
    oldAPI = False
    aifrom = '!'
    images = []
    while aifrom:
        sys.stderr.write('.')  # progress
        params = {'action': 'query', 'list': 'allimages', 'aiprop':
                  'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500}
        # FIXME Handle HTTP Errors HERE
        r = session.post(url=config['api'], data=params)
        handleStatusCode(r)
        jsonimages = json.loads(r.text)
        delay(config=config, session=session)

        if 'query' in jsonimages:
            aifrom = ''
            if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'):
                if jsonimages['query-continue']['allimages'].has_key('aicontinue'):
                    aifrom = jsonimages['query-continue']['allimages']['aicontinue']
                elif jsonimages['query-continue']['allimages'].has_key('aifrom'):
                    aifrom = jsonimages['query-continue']['allimages']['aifrom']
            # print aifrom
            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = curateImageURL(config=config, url=url)
                # encoding to ascii is needed to work around this horrible bug:
                # http://bugs.python.org/issue8136
                filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8')
                uploader = re.sub('_', ' ', image['user'])
                images.append([filename, url, uploader])
        else:
            oldAPI = True
            break

    if oldAPI:
        gapfrom = '!'
        images = []
        while gapfrom:
            sys.stderr.write('.')  # progress
            # Some old APIs don't have the allimages query
            # In this case use allpages (in ns=6) as generator for imageinfo
            # Example: http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6&gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            params = {'action': 'query', 'generator': 'allpages', 'gapnamespace': 6, 'gaplimit': 500, 'gapfrom': gapfrom, 'prop': 'imageinfo', 'iiprop': 'user|url', 'format': 'json'}
            # FIXME Handle HTTP Errors HERE
            r = session.post(url=config['api'], data=params)
            handleStatusCode(r)
            jsonimages = json.loads(r.text)
            delay(config=config, session=session)
            if 'query' in jsonimages:
                gapfrom = ''
                if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allpages'):
                    if jsonimages['query-continue']['allpages'].has_key('gapfrom'):
                        gapfrom = jsonimages['query-continue']['allpages']['gapfrom']
                # print gapfrom
                # print jsonimages['query']
                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = curateImageURL(config=config, url=url)
                    filename = re.sub('_', ' ', ':'.join(props['title'].split(':')[1:]))
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    images.append([filename, url, uploader])

    if (len(images) == 1):
        print 'Found 1 image'
    else:
        print 'Found %d images' % (len(images))
    return images


def undoHTMLEntities(text=''):
    """ Undo some HTML codes """
    # i guess only < > & " ' need conversion
    # http://www.w3schools.com/html/html_entities.asp
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    text = re.sub('&#039;', '\'', text)

    return text
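
# Illustrative example: undoHTMLEntities(text=u'Tom &amp; Jerry&#039;s gallery')
# returns u"Tom & Jerry's gallery".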


def generateImageDump(config={}, other={}, images=[], start='', session=None):
    """ Save files and descriptions using a file list """
    # fix use subdirectories md5
    print 'Retrieving images from "%s"' % (start and start or 'start')
    imagepath = '%s/images' % (config['path'])
    if not os.path.isdir(imagepath):
        print 'Creating "%s" directory' % (imagepath)
        os.makedirs(imagepath)

    c = 0
    lock = True
    if not start:
        lock = False
    for filename, url, uploader in images:
        if filename == start:  # start downloading from start (included)
            lock = False
        if lock:
            continue
        delay(config=config, session=session)

        # saving file
        # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash
        # limit). Later .desc is added to filename, so better 100 as max)
        filename2 = urllib.unquote(filename)
        if len(filename2) > other['filenamelimit']:
            # split last . (extension) and then merge
            filename2 = truncateFilename(other=other, filename=filename2)
            print 'Filename is too long, truncating. Now it is:', filename2
        filename3 = u'%s/%s' % (imagepath, filename2)
        imagefile = open(filename3, 'wb')
        r = requests.get(url=url)
        imagefile.write(r.content)
        imagefile.close()
        # saving description if any
        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (
            filename), session=session)  # use Image: for backwards compatibility
        f = open('%s/%s.desc' % (imagepath, filename2), 'w')
        # <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
        if not re.search(r'</mediawiki>', xmlfiledesc):
            # failure when retrieving desc? then save it as empty .desc
            xmlfiledesc = ''
        f.write(xmlfiledesc.encode('utf-8'))
        f.close()
        delay(config=config, session=session)
        c += 1
        if c % 10 == 0:
            print 'Downloaded %d images' % (c)

    print 'Downloaded %d images' % (c)


def saveLogs(config={}, session=None):
    """ Save Special:Log """
    # get all logs from Special:Log
    """ parse
    <select name='type'>
    <option value="block">Bloqueos de usuarios</option>
    <option value="rights">Cambios de perfil de usuario</option>
    <option value="protect" selected="selected">Protecciones de páginas</option>
    <option value="delete">Registro de borrados</option>
    <option value="newusers">Registro de creación de usuarios</option>
    <option value="merge">Registro de fusiones</option>
    <option value="import">Registro de importaciones</option>
    <option value="patrol">Registro de revisiones</option>
    <option value="move">Registro de traslados</option>
    <option value="upload">Subidas de archivos</option>
    <option value="">Todos los registros</option>
    </select>
    """
    delay(config=config, session=session)


def domain2prefix(config={}, session=None):
    """ Convert domain name to a valid prefix filename. """

    # At this point, both api and index are supposed to be defined
    domain = ''
    if config['api']:
        domain = config['api']
    elif config['index']:
        domain = config['index']

    domain = domain.lower()
    domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
    domain = re.sub(r'/', '_', domain)
    domain = re.sub(r'\.', '', domain)
    domain = re.sub(r'[^A-Za-z0-9]', '_', domain)

    return domain
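
# Illustrative example: with config['api'] = 'http://wiki.example.org/w/api.php',
# domain2prefix() returns 'wikiexampleorg_w'.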


def loadConfig(config={}, configfilename=''):
    """ Load config file """

    try:
        with open('%s/%s' % (config['path'], configfilename), 'r') as infile:
            config = cPickle.load(infile)
    except:
        print 'There is no config file. We can\'t resume. Start a new dump.'
        sys.exit()

    return config


def saveConfig(config={}, configfilename=''):
    """ Save config file """

    with open('%s/%s' % (config['path'], configfilename), 'w') as outfile:
        cPickle.dump(config, outfile)


def welcome():
    """ Opening message """
    message = ''
    message += "#" * 73
    message += """
# Welcome to DumpGenerator %s by WikiTeam (GPL v3)             #
# More info at: https://github.com/WikiTeam/wikiteam                   #""" % (getVersion())
    message += "\n"
    message += "#" * 73
    message += "\n"
    message += ''

    message += "\n"
    message += "#" * 73
    message += """
# Copyright (C) 2011-2014 WikiTeam                                      #
# This program is free software: you can redistribute it and/or modify  #
# it under the terms of the GNU General Public License as published by  #
# the Free Software Foundation, either version 3 of the License, or     #
# (at your option) any later version.                                   #
#                                                                       #
# This program is distributed in the hope that it will be useful,       #
# but WITHOUT ANY WARRANTY; without even the implied warranty of        #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          #
# GNU General Public License for more details.                          #
#                                                                       #
# You should have received a copy of the GNU General Public License     #
# along with this program. If not, see <http://www.gnu.org/licenses/>.  #"""
    message += "\n"
    message += "#" * 73
    message += "\n"
    message += ''

    return message


def bye():
    """ Closing message """
    print "---> Congratulations! Your dump is complete <---"
    print "If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues"
    print "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam"
    print "Good luck! Bye!"


def getParameters(params=[]):
    if not params:
        params = sys.argv

    parser = argparse.ArgumentParser(description='')

    # General params
    parser.add_argument(
        '-v', '--version', action='version', version=getVersion())
    parser.add_argument(
        '--cookies', metavar="cookies.txt", help="path to a cookies.txt file")
    parser.add_argument(
        '--delay', metavar=5, default=0, type=float, help="adds a delay (in seconds)")
    parser.add_argument(
        '--retries', metavar=5, default=5, help="Maximum number of retries for ")
    parser.add_argument('--path', help='path to store wiki dump at')
    parser.add_argument('--resume', action='store_true',
                        help='resumes previous incomplete dump (requires --path)')
    parser.add_argument('--force', action='store_true', help='')
    parser.add_argument(
        '--user', help='Username if authentication is required.')
    parser.add_argument(
        '--pass', dest='password', help='Password if authentication is required.')

    # URL params
    groupWikiOrAPIOrIndex = parser.add_argument_group()
    groupWikiOrAPIOrIndex.add_argument(
        'wiki', default='', nargs='?', help="URL to wiki (e.g. http://wiki.domain.org)")
    groupWikiOrAPIOrIndex.add_argument('--api', help="URL to API (e.g. http://wiki.domain.org/w/api.php)")
    groupWikiOrAPIOrIndex.add_argument('--index', help="URL to index.php (e.g. http://wiki.domain.org/w/index.php)")

    # Download params
    groupDownload = parser.add_argument_group('Data to download', 'What info to download from the wiki')
    groupDownload.add_argument(
        '--xml', action='store_true', help="generates a full history XML dump (--xml --curonly for current revisions only)")
    groupDownload.add_argument('--curonly', action='store_true',
                               help='store only the current version of pages')
    groupDownload.add_argument(
        '--images', action='store_true', help="generates an image dump")
    groupDownload.add_argument('--namespaces', metavar="1,2,3",
                               help='comma-separated value of namespaces to include (all by default)')
    groupDownload.add_argument('--exnamespaces', metavar="1,2,3",
                               help='comma-separated value of namespaces to exclude')

    # Meta info params
    groupMeta = parser.add_argument_group('Meta info', 'What meta info to retrieve from the wiki')
    groupMeta.add_argument(
        '--get-wiki-engine', action='store_true', help="returns the wiki engine")

    args = parser.parse_args()
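    # Illustrative invocation (the URL is an example only):
    #   python dumpgenerator.py --api=http://wiki.example.org/w/api.php --xml --images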
2014-07-03 18:23:21 +00:00
# print args
2014-07-13 10:32:49 +00:00
# Don't mix download params and meta info params
if ( args . xml or args . images ) and \
( args . get_wiki_engine ) :
print ' ERROR: Don \' t mix download params and meta info params '
parser . print_help ( )
2014-06-30 23:41:03 +00:00
sys . exit ( 1 )
2014-07-13 10:32:49 +00:00
# No download params and no meta info params? Exit
if ( not args . xml and not args . images ) and \
( not args . get_wiki_engine ) :
print ' ERROR: Use at least one download param or meta info param '
parser . print_help ( )
2014-06-30 23:41:03 +00:00
sys . exit ( 1 )

    # Execute meta info params
    if args.wiki:
        if args.get_wiki_engine:
            print getWikiEngine(url=args.wiki)
            sys.exit()

    # Create session
    cj = cookielib.MozillaCookieJar()
    if args.cookies:
        cj.load(args.cookies)
        print 'Using cookies from %s' % args.cookies

    session = requests.Session()
    session.cookies = cj
    session.headers = {'User-Agent': getUserAgent()}
    if args.user and args.password:
        session.auth = (args.user, args.password)
    # session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))

    # check URLs
    for url in [args.api, args.index, args.wiki]:
        if url and (not url.startswith('http://') and not url.startswith('https://')):
            print url
            print 'ERROR: URLs must start with http:// or https://\n'
            parser.print_help()
            sys.exit(1)

    # Get API and index and verify
    api = args.api and args.api or ''
    index = args.index and args.index or ''
    if api == '' or index == '':
        if args.wiki:
            if getWikiEngine(args.wiki) == 'MediaWiki':
                api2, index2 = mwGetAPIAndIndex(args.wiki)
                if not api:
                    api = api2
                if not index:
                    index = index2
            else:
                print 'ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki'
                sys.exit(1)
        else:
            if api == '':
                pass
            elif index == '':
                index = '/'.join(api.split('/')[:-1]) + '/index.php'

    # print api
    # print index

    if api and checkAPI(api=api, session=session):
        print 'API is OK'
    else:
        print 'Error in API. Please provide a correct path to api.php.'
        sys.exit(1)

    if index and checkIndex(index=index, cookies=args.cookies, session=session):
        print 'index.php is OK'
    else:
        print 'Error in index.php. Please provide a correct path to index.php.'
        sys.exit(1)

    # check user and pass (one requires both)
    if (args.user and not args.password) or (args.password and not args.user):
        print 'ERROR: Both --user and --pass are required for authentication.'
        parser.print_help()
        sys.exit(1)

    namespaces = ['all']
    exnamespaces = []
    # Process namespace inclusions
    if args.namespaces:
        # fix: why allow "-"? And why does "--namespaces= all" (with a space) work?
        if re.search(r'[^\d, \-]', args.namespaces) and args.namespaces.lower() != 'all':
            print "Invalid namespace values.\nValid format is integer(s) separated by commas"
            sys.exit()
        else:
            ns = re.sub(' ', '', args.namespaces)
            if ns.lower() == 'all':
                namespaces = ['all']
            else:
                namespaces = [int(i) for i in ns.split(',')]

    # Process namespace exclusions
    if args.exnamespaces:
        if re.search(r'[^\d, \-]', args.exnamespaces):
            print "Invalid namespace values.\nValid format is integer(s) separated by commas"
            sys.exit(1)
        else:
            ns = re.sub(' ', '', args.exnamespaces)
            if ns.lower() == 'all':
                print 'You cannot exclude all namespaces.'
                sys.exit(1)
            else:
                exnamespaces = [int(i) for i in ns.split(',')]
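
    # Illustrative examples of how the parsing above behaves (hypothetical values):
    #   --namespaces=0,4,14  ->  namespaces == [0, 4, 14]
    #   --namespaces=all     ->  namespaces == ['all']
    #   --exnamespaces=2,3   ->  exnamespaces == [2, 3]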

    # --curonly requires --xml
    if args.curonly and not args.xml:
        print "--curonly requires --xml\n"
        parser.print_help()
        sys.exit(1)

    config = {
        'curonly': args.curonly,
        'date': datetime.datetime.now().strftime('%Y%m%d'),
        'api': api,
        'index': index,
        'images': args.images,
        'logs': False,
        'xml': args.xml,
        'namespaces': namespaces,
        'exnamespaces': exnamespaces,
        'path': args.path or '',
        'cookies': args.cookies or '',
        'delay': args.delay
    }

    other = {
        'resume': args.resume,
        'filenamelimit': 100,  # do not change
        'force': args.force,
        'session': session
    }

    # calculating path, if not defined by user with --path=
    if not config['path']:
        config['path'] = './%s-%s-wikidump' % (
            domain2prefix(config=config, session=session), config['date'])

    return config, other
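

# Illustrative invocations (module-level note, not executed; the URLs are
# placeholders, any MediaWiki api.php/index.php should work):
#   python dumpgenerator.py http://wiki.example.org --xml --images
#   python dumpgenerator.py --api=http://wiki.example.org/w/api.php --xml --curonly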


def checkAPI(api=None, session=None):
    """ Checking API availability """
    global cj
    r = session.post(
        url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
    resultText = r.text
    print 'Checking API...', api
    if "MediaWiki API is not enabled for this site." in resultText:
        return False
    try:
        result = json.loads(resultText)
        if 'query' in result:
            return True
    except ValueError:
        return False
    return False
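
# Minimal usage sketch for checkAPI() (hypothetical URL, shown for illustration):
#
#   >>> s = requests.Session()
#   >>> checkAPI(api='http://wiki.example.org/w/api.php', session=s)
#   True    # api.php answered action=query&meta=siteinfo with valid JSON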


def checkIndex(index=None, cookies=None, session=None):
    """ Checking index.php availability """
    r = session.post(url=index, data={'title': 'Special:Version'})
    raw = r.text
    print 'Checking index.php...', index
    # Workaround for issue 71
    if re.search(r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)', raw) and not cookies:
        print "ERROR: This wiki requires login and we are not authenticated"
        return False
    if re.search(r'(This wiki is powered by|<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)', raw):
        return True
    return False


def removeIP(raw=''):
    """ Remove IP addresses from HTML comments <!-- --> """

    raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
    # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
    # weird cases such as :: are not handled
    raw = re.sub(
        r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}',
        '0:0:0:0:0:0:0:0', raw)
    return raw
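
# Example of what removeIP() does (the HTML comment below is made up):
#
#   >>> removeIP(raw='<!-- served by 10.20.30.40 -->')
#   '<!-- served by 0.0.0.0 -->'
#
# Full, colon-separated IPv6 addresses are rewritten to 0:0:0:0:0:0:0:0 too;
# compressed forms such as "::1" are not matched (see the comment above).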


def checkXMLIntegrity(config={}, titles=[], session=None):
    """ Check XML dump integrity, to detect broken XML chunks """
    # NOTE: the early return below disables this check for now; the code that
    # follows is kept for reference but is not executed.
    return

    print 'Verifying dump...'
    checktitles = 0
    checkpageopen = 0
    checkpageclose = 0
    checkrevisionopen = 0
    checkrevisionclose = 0
    for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines():
        if "<revision>" in line:
            checkrevisionopen += 1
        elif "</revision>" in line:
            checkrevisionclose += 1
        elif "<page>" in line:
            checkpageopen += 1
        elif "</page>" in line:
            checkpageclose += 1
        elif "<title>" in line:
            checktitles += 1
        else:
            continue
    if (checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose):
        pass
    else:
        print 'XML dump seems to be corrupted.'
        reply = ''
        while reply.lower() not in ['yes', 'y', 'no', 'n']:
            reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
        if reply.lower() in ['yes', 'y']:
            generateXMLDump(config=config, titles=titles, session=session)
        elif reply.lower() in ['no', 'n']:
            print 'Not generating a new dump.'


def createNewDump(config={}, other={}):
    titles = []
    images = []
    print 'Trying to generate a new dump into a new directory...'
    if config['xml']:
        titles += getPageTitles(config=config, session=other['session'])
        saveTitles(config=config, titles=titles)
        generateXMLDump(config=config, titles=titles, session=other['session'])
        checkXMLIntegrity(config=config, titles=titles, session=other['session'])
    if config['images']:
        images += getImageNames(config=config, session=other['session'])
        saveImageNames(config=config, images=images, session=other['session'])
        generateImageDump(
            config=config, other=other, images=images, session=other['session'])
    if config['logs']:
        saveLogs(config=config, session=other['session'])


def resumePreviousDump(config={}, other={}):
    titles = []
    images = []
    print 'Resuming previous dump process...'
    if config['xml']:
        # load titles
        lasttitle = ''
        try:
            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(
                config=config, session=other['session']), config['date']), 'r')
            raw = unicode(f.read(), 'utf-8')
            titles = raw.split('\n')
            lasttitle = titles[-1]
            if not lasttitle:  # empty line at EOF?
                lasttitle = titles[-2]
            f.close()
        except:
            pass  # probably the file does not exist
        if lasttitle == '--END--':
            # title list is complete
            print 'Title list was completed in the previous session'
        else:
            print 'Title list is incomplete. Reloading...'
            # do not resume; reload to avoid inconsistencies (deleted pages and
            # the like)
            titles = getPageTitles(config=config, session=other['session'])
            saveTitles(config=config, titles=titles)
        # checking xml dump
        xmliscomplete = False
        lastxmltitle = ''
        try:
            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other[
                     'session']), config['date'], config['curonly'] and 'current' or 'history'), 'r')
            for l in f:
                if re.findall('</mediawiki>', l):
                    # xml dump is complete
                    xmliscomplete = True
                    break
                # weird if more than one is found, but just in case
                xmltitles = re.findall(r'<title>([^<]+)</title>', l)
                if xmltitles:
                    lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
            f.close()
        except:
            pass  # probably the file does not exist
        # removing --END-- before getXMLs
        while titles and titles[-1] in ['', '--END--']:
            titles = titles[:-1]
        if xmliscomplete:
            print 'XML dump was completed in the previous session'
        elif lastxmltitle:
            # resuming...
            print 'Resuming XML dump from "%s"' % (lastxmltitle)
            generateXMLDump(
                config=config, titles=titles, start=lastxmltitle, session=other['session'])
        else:
            # corrupt? only has the XML header?
            print 'XML is corrupt? Regenerating...'
            generateXMLDump(
                config=config, titles=titles, session=other['session'])

    if config['images']:
        # load images
        lastimage = ''
        try:
            f = open('%s/%s-%s-images.txt' %
                     (config['path'], domain2prefix(config=config), config['date']), 'r')
            raw = unicode(f.read(), 'utf-8').strip()
            lines = raw.split('\n')
            for l in lines:
                if re.search(r'\t', l):
                    images.append(l.split('\t'))
            lastimage = lines[-1]
            f.close()
        except:
            pass  # probably the file does not exist
        if lastimage == u'--END--':
            print 'Image list was completed in the previous session'
        else:
            print 'Image list is incomplete. Reloading...'
            # do not resume; reload to avoid inconsistencies (deleted images and
            # the like)
            images = getImageNames(config=config, session=other['session'])
            saveImageNames(config=config, images=images)
        # checking images directory
        listdir = []
        try:
            listdir = os.listdir('%s/images' % (config['path']))
        except:
            pass  # probably the directory does not exist
        listdir.sort()
        complete = True
        lastfilename = ''
        lastfilename2 = ''
        c = 0
        for filename, url, uploader in images:
            lastfilename2 = lastfilename
            # always keep the complete filename, not the truncated one
            lastfilename = filename
            filename2 = filename
            if len(filename2) > other['filenamelimit']:
                filename2 = truncateFilename(other=other, filename=filename2)
            if filename2 not in listdir:
                complete = False
                break
            c += 1
        print '%d images were found in the directory from a previous session' % (c)
        if complete:
            # image dump is complete
            print 'Image dump was completed in the previous session'
        else:
            # resume from the previous image, which may be corrupted (or missing
            # its .desc) if the previous session was interrupted
            generateImageDump(
                config=config, other=other, images=images, start=lastfilename2, session=other['session'])

    if config['logs']:
        # fix
        pass


def saveSpecialVersion(config={}, session=None):
    """ Save Special:Version as .html, to preserve extension details """

    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        print 'Special:Version.html exists, do not overwrite'
    else:
        print 'Downloading Special:Version with extensions and other related info'
        r = session.post(
            url=config['index'], data={'title': 'Special:Version'})
        raw = r.text
        delay(config=config, session=session)
        raw = removeIP(raw=raw)
        with open('%s/Special:Version.html' % (config['path']), 'w') as outfile:
            outfile.write(raw.encode('utf-8'))


def saveIndexPHP(config={}, session=None):
    """ Save index.php as .html, to preserve the license details available at the bottom of the page """

    if os.path.exists('%s/index.html' % (config['path'])):
        print 'index.html exists, do not overwrite'
    else:
        print 'Downloading index.php (Main Page) as index.html'
        r = session.post(url=config['index'], data={})
        raw = r.text
        delay(config=config, session=session)
        raw = removeIP(raw=raw)
        with open('%s/index.html' % (config['path']), 'w') as outfile:
            outfile.write(raw.encode('utf-8'))


def saveSiteInfo(config={}, session=None):
    """ Save a file with site info """

    if config['api']:
        if os.path.exists('%s/siteinfo.json' % (config['path'])):
            print 'siteinfo.json exists, do not overwrite'
        else:
            print 'Downloading site info as siteinfo.json'
            r = session.post(url=config['api'], data={
                'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
            result = json.loads(r.text)
            delay(config=config, session=session)
            with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                outfile.write(json.dumps(result, indent=4, sort_keys=True))
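
# The siteinfo request above is the POST equivalent of a GET like this
# (hypothetical host, shown only to document which API call gets saved):
#   http://wiki.example.org/w/api.php?action=query&meta=siteinfo&format=json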


def avoidWikimediaProjects(config={}, other={}):
    """ Skip Wikimedia projects and redirect to the dumps website """

    # notice about wikipedia dumps
    if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api'] + config['index']):
        print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
        print 'Download the dumps from http://dumps.wikimedia.org'
        if not other['force']:
            print 'Thanks!'
            sys.exit()


def getWikiEngine(url=''):
    """ Returns the wiki engine of a URL, if known """

    session = requests.Session()
    session.headers = {'User-Agent': getUserAgent()}
    r = session.post(url=url)
    result = r.text

    wikiengine = 'Unknown'
    if re.search(ur'(?im)(<meta name="generator" content="DokuWiki)', result):
        wikiengine = 'DokuWiki'
    elif re.search(ur'(?im)(alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki)', result):
        wikiengine = 'MediaWiki'
    elif re.search(ur'(?im)(>MoinMoin Powered</a>)', result):
        wikiengine = 'MoinMoin'

    return wikiengine
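
# Minimal usage sketch for getWikiEngine() (hypothetical URL):
#
#   >>> getWikiEngine(url='http://wiki.example.org')
#   'MediaWiki'    # or 'DokuWiki', 'MoinMoin', 'Unknown', depending on the page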


def mwGetAPIAndIndex(url=''):
    """ Returns the MediaWiki api.php and index.php URLs """
    api = ''
    index = ''
    session = requests.Session()
    session.headers = {'User-Agent': getUserAgent()}
    r = session.post(url=url)
    result = r.text

    # API
    m = re.findall(
        ur'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>', result)
    if m:
        api = m[0]
        if api.startswith('//'):  # protocol-relative URL (e.g. the Gentoo wiki)
            api = url.split('//')[0] + api
    else:
        pass  # build API using index and check it

    # index.php
    m = re.findall(
        ur'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', result)
    if m:
        index = m[0]
    else:
        m = re.findall(
            ur'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', result)
        if m:
            index = m[0]
    if index:
        if index.startswith('/'):
            index = '/'.join(api.split('/')[:-1]) + '/' + index.split('/')[-1]
    else:
        if api:
            if len(re.findall(ur'/index\.php5\?', result)) > len(re.findall(ur'/index\.php\?', result)):
                index = '/'.join(api.split('/')[:-1]) + '/index.php5'
            else:
                index = '/'.join(api.split('/')[:-1]) + '/index.php'

    return api, index
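
# Minimal usage sketch for mwGetAPIAndIndex() (URL and result are hypothetical):
#
#   >>> mwGetAPIAndIndex(url='http://wiki.example.org')
#   (u'http://wiki.example.org/w/api.php', u'http://wiki.example.org/w/index.php')
#
# The API URL comes from the EditURI <link>; index.php is taken from the
# view-source/history tabs, falling back to the api.php directory.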


def main(params=[]):
    """ Main function """

    configfilename = 'config.txt'
    config, other = getParameters(params=params)
    avoidWikimediaProjects(config=config, other=other)

    print welcome()
    print 'Analysing %s' % (config['api'] and config['api'] or config['index'])

    # creating path or resuming if desired
    c = 2
    # to avoid concatenating blabla-2, blabla-2-3, and so on...
    originalpath = config['path']
    # do not enter the loop if resume was requested from the beginning
    while not other['resume'] and os.path.isdir(config['path']):
        print '\nWarning!: "%s" path exists' % (config['path'])
        reply = ''
        while reply.lower() not in ['yes', 'y', 'no', 'n']:
            reply = raw_input(
                'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' % (
                    config['path'], config['path'], configfilename))
        if reply.lower() in ['yes', 'y']:
            if not os.path.isfile('%s/%s' % (config['path'], configfilename)):
                print 'No config file found. I can\'t resume. Aborting.'
                sys.exit()
            print 'You have selected: YES'
            other['resume'] = True
            break
        elif reply.lower() in ['no', 'n']:
            print 'You have selected: NO'
            other['resume'] = False
            config['path'] = '%s-%d' % (originalpath, c)
            print 'Trying to use path "%s"...' % (config['path'])
            c += 1

    if other['resume']:
        print 'Loading config file...'
        config = loadConfig(config=config, configfilename=configfilename)
    else:
        os.mkdir(config['path'])
        saveConfig(config=config, configfilename=configfilename)

    if other['resume']:
        resumePreviousDump(config=config, other=other)
    else:
        createNewDump(config=config, other=other)

    saveIndexPHP(config=config, session=other['session'])
    saveSpecialVersion(config=config, session=other['session'])
    saveSiteInfo(config=config, session=other['session'])

    bye()


if __name__ == "__main__":
    main()