#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Copyright (C) 2011-2013 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

#######################################################################
# dumpgenerator.py is a script to generate backups of MediaWiki wikis #
# To learn more, read the documentation:                              #
#     http://code.google.com/p/wikiteam/wiki/NewTutorial              #
#######################################################################

import cookielib
import cPickle
import datetime
import getopt
try:
    from hashlib import md5
except ImportError:  # Python 2.4 compatibility
    from md5 import new as md5
import os
import re
import subprocess
import sys
import time
import urllib
import urllib2

def truncateFilename(other={}, filename=''):
    """ Truncate filenames when downloading images with large filenames """
    return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1]
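
# Illustrative example (not part of the original call sites): with the default
# other['filenamelimit'] of 100, a 150-character name such as 'a'*150 + '.jpg'
# becomes the first 100 characters + the md5 hexdigest of the full name + '.jpg'.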

def delay(config={}):
    """ Add a delay if configured for that """
    if config['delay'] > 0:
        print 'Sleeping... %d seconds...' % (config['delay'])
        time.sleep(config['delay'])

def cleanHTML(raw=''):
    """ Extract only the real wiki content and remove rubbish """
    """ This function is ONLY used to retrieve page titles and file names when no API is available """
    """ DO NOT use this function to extract page content """
    #different "tags" used by different MediaWiki versions to mark where starts and ends content
    if re.search('<!-- bodytext -->', raw):
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
        raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0]
    elif re.search('<!-- Begin Content Area -->', raw):
        raw = raw.split('<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
    elif re.search('<!-- content -->', raw):
        raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
    elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
    else:
        print raw[:250]
        print 'This wiki doesn\'t use marks to split content'
        sys.exit()
    return raw

def getNamespacesScraper(config={}):
    """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
    """ Function called if no API is available """
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent()})
        f = urllib2.urlopen(req)
        raw = f.read()
        f.close()
        delay(config=config)

        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)  # [^>]*? to include selected="selected"
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
        else:
            #check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  #uniques
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames

def getNamespacesAPI(config={}):
    """ Uses the API to get the list of namespaces names and ids """
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'xml'}), headers={'User-Agent': getUserAgent()})
        f = urllib2.urlopen(req)
        raw = f.read()
        f.close()
        delay(config=config)

        m = re.compile(r'<ns id="(?P<namespaceid>\d+)"[^>]*?/?>(?P<namespacename>[^<]+)?(</ns>)?').finditer(raw)  # [^>]*? to include case="first-letter" canonical= etc.
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
        else:
            #check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  #uniques
    print '%d namespaces found' % (len(namespaces))
    return namespaces, namespacenames
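
# For reference, the POST above is equivalent to requesting (illustrative URL):
#   api.php?action=query&meta=siteinfo&siprop=namespaces&format=xml
# and the <ns id="..."> elements of the reply are parsed with the regex above.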

def getPageTitlesAPI(config={}):
    """ Uses the API to get the list of page titles """
    titles = []
    namespaces, namespacenames = getNamespacesAPI(config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            print 'Skipping namespace =', namespace
            continue

        c = 0
        print 'Retrieving titles in the namespace %d' % (namespace)
        headers = {'User-Agent': getUserAgent()}
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  #progress
            params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom, 'format': 'xml', 'aplimit': 500}
            data = urllib.urlencode(params)
            req = urllib2.Request(url=config['api'], data=data, headers=headers)
            try:
                f = urllib2.urlopen(req)
            except:
                try:
                    print 'Server is slow... Waiting some seconds and retrying...'
                    time.sleep(10)
                    f = urllib2.urlopen(req)
                except:
                    print 'An error has occurred while retrieving page titles with API'
                    print 'Please, resume the dump, --resume'
                    sys.exit()
            xml = f.read()
            f.close()
            m = re.findall(r'<allpages (?:apfrom|apcontinue)="([^>]+)" />', xml)
            if m:
                apfrom = undoHTMLEntities(text=m[0])  #" = ", etc
            else:
                apfrom = ''
            m = re.findall(r'title="([^>]+)" />', xml)
            titles += [undoHTMLEntities(title) for title in m]
            c += len(m)
            delay(config=config)
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles
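
# Illustrative first request built by the loop above (assuming namespace 0):
#   api.php?action=query&list=allpages&apnamespace=0&apfrom=!&aplimit=500&format=xml
# The apfrom/apcontinue attribute of the <allpages ... /> element feeds the next iteration.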

def getPageTitlesScraper(config={}):
    """ Scrape the list of page titles from Special:Allpages """
    titles = []
    namespaces, namespacenames = getNamespacesScraper(config=config)
    for namespace in namespaces:
        print 'Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
        raw = urllib2.urlopen(req).read()
        raw = cleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        else:
            pass  #perhaps no subpages

        deep = 3  # 3 is the current depth of English Wikipedia for Special:Allpages, 3 levels
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            #load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')
                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['index'], namespace, fr, to)  #do not put urllib.quote in fr or to
                elif r_suballpages == r_suballpages2:  #fix: does this regexp miss some subpages? or does r_title fail on this kind of subpage? (wikiindex)
                    fr = fr.split('&namespace=')[0]  #clean &namespace=\d, sometimes happens
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (config['index'], name, namespace)

                if not name in checked_suballpages:
                    checked_suballpages.append(name)  #to avoid reloading dupe subpage links
                    delay(config=config)
                    req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent()})
                    raw2 = urllib2.urlopen(req2).read()
                    raw2 = cleanHTML(raw2)
                    rawacum += raw2  #merge it after junk is removed
                    print 'Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
                    delay(config=config)
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            if not i.group('title').startswith('Special:'):
                if not i.group('title') in titles:
                    titles.append(undoHTMLEntities(text=i.group('title')))
                    c += 1
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles

def getPageTitles(config={}):
    """ Get the list of page titles, from the API if available or by scraping Special:Allpages """
    #http://en.wikipedia.org/wiki/Special:AllPages
    #http://archiveteam.org/index.php?title=Special:AllPages
    #http://www.wikanda.es/wiki/Especial:Todas
    print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None')
    print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None')

    titles = []
    if config['api']:
        titles = getPageTitlesAPI(config=config)
    elif config['index']:
        titles = getPageTitlesScraper(config=config)

    titles = list(set(titles))  #removing dupes (e.g. in CZ appears Widget:AddThis two times (main namespace and widget namespace))
    titles.sort()  #sorting

    print '%d page titles loaded' % (len(titles))
    return titles

def getXMLHeader(config={}):
    """ Retrieve a random page to extract XML headers (namespace info, etc) """
    #get the header of a random page, to attach it in the complete XML backup
    #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x....
    randomtitle = 'Main_Page'  #previously AMF5LKE43MNFGHKSDMRTJ
    xml = getXMLPage(config=config, title=randomtitle, verbose=False)
    header = xml.split('</mediawiki>')[0]
    if not xml:
        print 'XML export on this wiki is broken, quitting.'
        sys.exit()
    return header

def getXMLFileDesc(config={}, title=''):
    """ Get XML for image description page """
    config['curonly'] = 1  #tricky to get only the most recent desc
    return getXMLPage(config=config, title=title, verbose=False)

def getUserAgent():
    """ Return a cool user-agent to hide Python user-agent """
    useragents = ['Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0']
    return useragents[0]

def logerror(config={}, text=''):
    """ Log error in file """
    if text:
        f = open('%s/errors.log' % (config['path']), 'a')
        f.write('%s: %s\n' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text))
        f.close()

def getXMLPageCore(headers={}, params={}, config={}):
    """ Request XML from Special:Export, with retries and a current-only fallback """
    #returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki>
    #if retrieving params['limit'] revisions fails, returns a current only version
    #if all fail, returns the empty string
    xml = ''
    c = 0
    maxseconds = 100  #max seconds to wait in a single sleeping
    maxretries = 5  # x retries and skip
    increment = 20  #increment every retry
    while not re.search(r'</mediawiki>', xml):
        if c > 0 and c < maxretries:
            wait = increment * c < maxseconds and increment * c or maxseconds  # incremental until maxseconds
            print 'XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait)
            time.sleep(wait)
            if params['limit'] > 1:  # reducing server load requesting smallest chunks (if curonly then limit = 1 from mother function)
                params['limit'] = params['limit'] / 2  # half
        if c >= maxretries:
            print 'We have retried %d times' % (c)
            print 'MediaWiki error for "%s", network error or whatever...' % (params['pages'])
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last revision;
            # params['curonly'] should mean that we've already tried this fallback, because it's set by the following if and passed to getXMLPageCore
            if not config['curonly']:
                print 'Trying to save only the last revision for this page...'
                params['curonly'] = 1
                logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (params['pages']))
                return getXMLPageCore(headers=headers, params=params, config=config)
            else:
                print 'Saving in the errors log, and skipping...'
                logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (params['pages']))
                return ''  # empty xml

        data = urllib.urlencode(params)
        req = urllib2.Request(url=config['index'], data=data, headers=headers)
        try:
            f = urllib2.urlopen(req)
        except:
            try:
                print 'Server is slow... Waiting some seconds and retrying...'
                time.sleep(15)
                f = urllib2.urlopen(req)
            except:
                print 'An error has occurred while retrieving "%s"' % (params['pages'])
                print 'Please, resume the dump, --resume'
                sys.exit()
                # The error is usually temporary, but we exit the dump altogether.
        xml = f.read()
        c += 1

    return xml
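
# Illustrative retry schedule with the defaults above (increment=20, maxretries=5,
# maxseconds=100): waits of 20, 40, 60 and 80 seconds, halving params['limit'] each
# time, and then the current-only / skip fallback once maxretries is reached.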

def getXMLPage(config={}, title='', verbose=True):
    """ Return the full history (or current only version) of a page """
    #if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partially truncated
    #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F

    limit = 1000
    truncated = False
    title_ = title
    title_ = re.sub(' ', '_', title_)
    #do not convert & into %26, title_ = re.sub('&', '%26', title_)
    headers = {'User-Agent': getUserAgent()}
    params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', }
    if config['curonly']:
        params['curonly'] = 1
        params['limit'] = 1
    else:
        params['offset'] = '1'  # 1 always < 2000s
        params['limit'] = limit
    if config.has_key('templates') and config['templates']:  #in other case, do not set params['templates']
        params['templates'] = 1

    xml = getXMLPageCore(headers=headers, params=params, config=config)

    #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
    #else, warning about Special:Export truncating large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
    if not config['curonly'] and re.search(r_timestamp, xml):  # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one
        while not truncated and params['offset']:  #next chunk
            params['offset'] = re.findall(r_timestamp, xml)[-1]  #get the last timestamp from the acum XML
            xml2 = getXMLPageCore(headers=headers, params=params, config=config)
            if re.findall(r_timestamp, xml2):  #are there more edits in this next XML chunk or no <page></page>?
                if re.findall(r_timestamp, xml2)[-1] == params['offset']:
                    #again the same XML, this wiki does not support params in Special:Export, offer complete XML up to X edits (usually 1000)
                    print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated'
                    truncated = True
                    break
                else:
                    """    </namespaces>
                      </siteinfo>
                      <page>
                        <title>Main Page</title>
                        <id>15580374</id>
                        <restrictions>edit=sysop:move=sysop</restrictions> (?)
                        <revision>
                          <id>418009832</id>
                          <timestamp>2011-03-09T19:57:06Z</timestamp>
                          <contributor>
                    """
                    #offset is OK in this wiki, merge with the previous chunk of this page history and continue
                    xml = xml.split('</page>')[0] + '    <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
            else:
                params['offset'] = ''  #no more edits in this page history

    if verbose:
        numberofedits = len(re.findall(r_timestamp, xml))
        if (numberofedits == 1):
            print '%s, 1 edit' % (title)
        else:
            print '%s, %d edits' % (title, numberofedits)

    return xml

def cleanXML(xml=''):
    """ Trim redundant info """
    #do not touch XML codification, leave AS IS
    if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml):
        xml = xml.split('</siteinfo>\n')[1]
        xml = xml.split('</mediawiki>')[0]
    return xml

def generateXMLDump(config={}, titles=[], start=''):
    """ Generate the XML dump, writing it page by page to a file """
    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
    header = getXMLHeader(config=config)
    footer = '</mediawiki>\n'  #new line at the end
    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True
    if start:
        #remove the last chunk of xml dump (it is probably incomplete)
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
        xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
        prev = ''
        c = 0
        for l in xmlfile:
            #removing <page>\n until end of file
            if c != 0:  #lock to avoid writing an empty line at the beginning of file
                if not re.search(r'<title>%s</title>' % (start), l):
                    xmlfile2.write(prev)
                else:
                    break
            c += 1
            prev = l
        xmlfile.close()
        xmlfile2.close()
        #subst xml with xml2
        os.remove('%s/%s' % (config['path'], xmlfilename))  #remove previous xml dump
        os.rename('%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename))  #move correctly truncated dump to its real name
    else:
        #requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.close()

    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for title in titles:
        if not title.strip():
            continue
        if title == start:  #start downloading from start, included
            lock = False
        if lock:
            continue
        delay(config=config)
        if c % 10 == 0:
            print 'Downloaded %d pages' % (c)
        xml = getXMLPage(config=config, title=title)
        xml = cleanXML(xml=xml)
        if not xml:
            logerror(config=config, text='The page "%s" was missing in the wiki (probably deleted)' % (title))
        #here, XML is a correct <page> </page> chunk or
        #an empty string due to a deleted page (logged in errors log) or
        #an empty string due to an error while retrieving the page from server (logged in errors log)
        xmlfile.write(xml)
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    print 'XML dump saved at...', xmlfilename

def saveTitles(config={}, titles=[]):
    """ Save title list in a file """
    #save titles in a txt for resume if needed
    titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
    titlesfile.write('\n'.join(titles))
    titlesfile.write('\n--END--')
    titlesfile.close()
    print 'Titles saved at...', titlesfilename

def saveImageFilenamesURL(config={}, images=[]):
    """ Save image list in a file """
    #save list of images and their urls
    imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
    imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
    imagesfile.write('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]))
    imagesfile.write('\n--END--')
    imagesfile.close()
    print 'Image filenames and URLs saved at...', imagesfilename

def getImageFilenamesURL(config={}):
    """ Retrieve file list: filename, url, uploader """
    print 'Retrieving image filenames'
    r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;'  # (?<! http://docs.python.org/library/re.html
    images = []
    offset = '29990101000000'  #january 1, 2999
    limit = 5000
    retries = 5
    while offset:
        #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent()})
        f = urllib2.urlopen(req)
        raw = f.read()
        f.close()
        delay(config=config)
        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):  # delicate wiki
            if limit > 10:
                print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
                limit = limit / 10
                continue
            elif retries > 0:  # waste retries, then exit
                retries -= 1
                print 'Retrying...'
                continue
            else:
                print 'No more retries, exit...'
                break

        raw = cleanHTML(raw)
        #archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        #wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a href="/w/index.php?title=Usuario:Fernandocg&action=edit&redlink=1" class="new" title="Usuario:Fernandocg (página no existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        #wikijuegos 1.9.5 http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        #gentoowiki 1.18 <tr><td class="TablePager_col_img_timestamp">18:15, 3 April 2011</td><td class="TablePager_col_img_name"><a href="/wiki/File:Asus_eeepc-1201nl.png" title="File:Asus eeepc-1201nl.png">Asus eeepc-1201nl.png</a> (<a href="/w/images/2/2b/Asus_eeepc-1201nl.png">file</a>)</td><td class="TablePager_col_thumb"><a href="/wiki/File:Asus_eeepc-1201nl.png" class="image"><img alt="" src="/w/images/thumb/2/2b/Asus_eeepc-1201nl.png/180px-Asus_eeepc-1201nl.png" width="180" height="225" /></a></td><td class="TablePager_col_img_size">37 KB</td><td class="TablePager_col_img_user_text"><a href="/w/index.php?title=User:Yannails&action=edit&redlink=1" class="new" title="User:Yannails (page does not exist)">Yannails</a></td><td class="TablePager_col_img_description"> </td><td class="TablePager_col_count">1</td></tr>
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        #http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        #(<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'

        m = []
        #different mediawiki versions
        if re.search(r_images1, raw):
            m = re.compile(r_images1).finditer(raw)
        elif re.search(r_images2, raw):
            m = re.compile(r_images2).finditer(raw)
        elif re.search(r_images3, raw):
            m = re.compile(r_images3).finditer(raw)
        elif re.search(r_images4, raw):
            m = re.compile(r_images4).finditer(raw)

        for i in m:
            url = i.group('url')
            if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')):  #is it a relative URL?
                if url[0] == '/':  #slash is added later
                    url = url[1:]
                domainalone = config['index'].split('://')[1].split('/')[0]  #remove from :// (http or https) until the first / after domain
                url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url)  # concat http(s) + domain + relative url
            url = undoHTMLEntities(text=url)
            #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars
            url = re.sub(' ', '_', url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = undoHTMLEntities(text=filename)
            filename = urllib.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = undoHTMLEntities(text=uploader)
            uploader = urllib.unquote(uploader)
            images.append([filename, url, uploader])
            #print filename, url

        if re.search(r_next, raw):
            offset = re.findall(r_next, raw)[0]
            retries += 5  # add more retries if we got a page with offset
        else:
            offset = ''

    if (len(images) == 1):
        print 'Found 1 image'
    else:
        print 'Found %d images' % (len(images))

    images.sort()
    return images

def getImageFilenamesURLAPI(config={}):
    """ Retrieve file list: filename, url, uploader """
    print 'Retrieving image filenames'
    headers = {'User-Agent': getUserAgent()}
    aifrom = '!'
    images = []
    while aifrom:
        sys.stderr.write('.')  #progress
        params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'xml', 'ailimit': 500}
        data = urllib.urlencode(params)
        req = urllib2.Request(url=config['api'], data=data, headers=headers)
        try:
            f = urllib2.urlopen(req)
        except:
            try:
                print 'Server is slow... Waiting some seconds and retrying...'
                time.sleep(10)
                f = urllib2.urlopen(req)
            except:
                print 'An error has occurred while retrieving page titles with API'
                print 'Please, resume the dump, --resume'
                sys.exit()
        xml = f.read()
        f.close()
        delay(config=config)
        # Match the query-continue, old and new format
        m = re.findall(r'<allimages (?:aicontinue|aifrom)="([^>]+)" />', xml)
        if m:
            aifrom = undoHTMLEntities(text=m[0])  #" = ", etc
        else:
            aifrom = ''

        m = re.compile(r'(?im)<img name="(?P<filename>[^"]+)"[^>]*user="(?P<uploader>[^"]+)"[^>]* url="(?P<url>[^"]+)"[^>]*/>').finditer(xml)  # Retrieves a filename, uploader, url triple from the name, user, url fields of the xml line; space before url needed to avoid getting the descriptionurl field instead.
        for i in m:
            url = i.group('url')
            if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')):  #is it a relative URL?
                if url[0] == '/':  #slash is added later
                    url = url[1:]
                domainalone = config['index'].split('://')[1].split('/')[0]  #remove from :// (http or https) until the first / after domain
                url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url)  # concat http(s) + domain + relative url
            url = undoHTMLEntities(text=url)
            #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars
            url = re.sub(' ', '_', url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = undoHTMLEntities(text=filename)
            filename = urllib.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = undoHTMLEntities(text=uploader)
            uploader = urllib.unquote(uploader)
            images.append([filename, url, uploader])

    if (len(images) == 1):
        print 'Found 1 image'
    else:
        print 'Found %d images' % (len(images))

    images.sort()
    return images

def undoHTMLEntities(text=''):
    """ Undo some HTML codes """
    text = re.sub('&lt;', '<', text)  # i guess only < > & " ' need conversion http://www.w3schools.com/html/html_entities.asp
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    text = re.sub('&#039;', '\'', text)
    return text
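
# Illustrative example: undoHTMLEntities(text='Tom &amp; Jerry &quot;poster&quot;')
# returns 'Tom & Jerry "poster"'.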

def generateImageDump(config={}, other={}, images=[], start=''):
    """ Save files and descriptions using a file list """
    #fix use subdirectories md5
    print 'Retrieving images from "%s"' % (start and start or 'start')
    imagepath = '%s/images' % (config['path'])
    if not os.path.isdir(imagepath):
        print 'Creating "%s" directory' % (imagepath)
        os.makedirs(imagepath)

    c = 0
    lock = True
    if not start:
        lock = False
    for filename, url, uploader in images:
        if filename == start:  #start downloading from start (included)
            lock = False
        if lock:
            continue
        delay(config=config)

        #saving file
        #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max)
        filename2 = filename
        if len(filename2) > other['filenamelimit']:
            # split last . (extension) and then merge
            filename2 = truncateFilename(other=other, filename=filename2)
            print 'Filename is too long, truncating. Now it is:', filename2
        # We need to set the user agent for urlretrieve but we can't do it in its call
        # so we just override the class here; all I know about this method comes from
        # http://docs.python.org/2/library/urllib.html#urllib._urlopener ,
        # http://docs.python.org/2/tutorial/classes.html#class-definition-syntax .
        # TODO: Override the user agent for all functions in a more sensible place.
        class URLopenerUserAgent(urllib.FancyURLopener):
            version = "%s" % getUserAgent()
        urllib._urlopener = URLopenerUserAgent()
        urllib.urlretrieve(url=url, filename='%s/%s' % (imagepath, filename2))
        # TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works?

        #saving description if any
        xmlfiledesc = getXMLFileDesc(config=config, title='Image:%s' % (filename))  # use Image: for backwards compatibility
        f = open('%s/%s.desc' % (imagepath, filename2), 'w')
        if not re.search(r'</mediawiki>', xmlfiledesc):  #<text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
            #failure when retrieving desc? then save it as empty .desc
            xmlfiledesc = ''
        f.write(xmlfiledesc)
        f.close()
        delay(config=config)
        c += 1
        if c % 10 == 0:
            print 'Downloaded %d images' % (c)

    print 'Downloaded %d images' % (c)

def saveLogs(config={}):
    """ Save Special:Log """
    #get all logs from Special:Log
    """parse
    <select name='type'>
    <option value="block">Bloqueos de usuarios</option>
    <option value="rights">Cambios de perfil de usuario</option>
    <option value="protect" selected="selected">Protecciones de páginas</option>
    <option value="delete">Registro de borrados</option>
    <option value="newusers">Registro de creación de usuarios</option>
    <option value="merge">Registro de fusiones</option>
    <option value="import">Registro de importaciones</option>
    <option value="patrol">Registro de revisiones</option>
    <option value="move">Registro de traslados</option>
    <option value="upload">Subidas de archivos</option>
    <option value="">Todos los registros</option>
    </select>
    """
    delay(config=config)
2011-04-09 09:12:58 +00:00
def domain2prefix ( config = { } ) :
2013-11-07 12:24:50 +00:00
""" Convert domain name to a valid prefix filename. """
# At this point, both api and index are supposed to be defined
2011-04-09 09:12:58 +00:00
domain = ' '
2013-11-07 12:24:50 +00:00
if config [ ' api ' ] :
2011-04-09 09:12:58 +00:00
domain = config [ ' api ' ]
2013-11-07 12:24:50 +00:00
elif config [ ' index ' ] :
2011-04-09 09:12:58 +00:00
domain = config [ ' index ' ]
2013-11-07 12:24:50 +00:00
2011-04-13 19:44:35 +00:00
domain = domain . lower ( )
2011-04-29 08:59:13 +00:00
domain = re . sub ( r ' (https?://|www \ .|/index \ .php|/api \ .php) ' , ' ' , domain )
2011-04-07 13:14:37 +00:00
domain = re . sub ( r ' / ' , ' _ ' , domain )
domain = re . sub ( r ' \ . ' , ' ' , domain )
domain = re . sub ( r ' [^A-Za-z0-9] ' , ' _ ' , domain )
return domain
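
# Illustrative example: domain2prefix(config={'api': 'http://archiveteam.org/api.php', 'index': ''})
# returns 'archiveteamorg'.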

def loadConfig(config={}, configfilename=''):
    """ Load config file """
    try:
        f = open('%s/%s' % (config['path'], configfilename), 'r')
    except:
        print 'There is no config file. we can\'t resume. Start a new dump.'
        sys.exit()
    config = cPickle.load(f)
    f.close()
    return config

def saveConfig(config={}, configfilename=''):
    """ Save config file """
    f = open('%s/%s' % (config['path'], configfilename), 'w')
    cPickle.dump(config, f)
    f.close()

def welcome():
    """ Opening message """
    print "#"*73
    print """# Welcome to DumpGenerator 0.2 by WikiTeam (GPL v3)                   #
# More info at: http://code.google.com/p/wikiteam/                    #"""
    print "#"*73
    print ''
    print "#"*73
    print """# Copyright (C) 2011-2013 WikiTeam                                     #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or    #
# (at your option) any later version.                                  #
#                                                                      #
# This program is distributed in the hope that it will be useful,      #
# but WITHOUT ANY WARRANTY; without even the implied warranty of       #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        #
# GNU General Public License for more details.                         #
#                                                                      #
# You should have received a copy of the GNU General Public License    #
# along with this program.  If not, see <http://www.gnu.org/licenses/>. #"""
    print "#"*73
    print ''

def bye():
    """ Closing message """
    print "---> Congratulations! Your dump is complete <---"
    print "If you found any bug, report a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list"
    print "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in http://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump or contact us at http://code.google.com/p/wikiteam"
    print "Good luck! Bye!"

def usage():
    """ Print usage message with the available parameters """
    print """Error. You forgot mandatory parameters:
    --api or --index: URL to api.php or to index.php, one of them. Examples: --api=http://archiveteam.org/api.php or --index=http://archiveteam.org/index.php

And one of these at least:
    --xml: It generates a XML dump. It retrieves full history of all pages (if you want only the current version use --xml --curonly)
           If you want to filter by namespace, use the parameter --namespaces=0,1,2,3...
    --images: It generates an image dump

You can resume previous incomplete dumps:
    --resume: It resumes previous incomplete dump. When using --resume, --path is mandatory (path to directory where incomplete dump is).

You can exclude namespaces:
    --exnamespaces: Write the number of the namespaces you want to exclude, split by commas.

You can use authentication cookies from a Mozilla cookies.txt file:
    --cookies: Path to a cookies.txt file. Example: --cookies=$HOME/.netscape/cookies.txt

You can be nice with servers using a delay:
    --delay: It adds a delay (in seconds, adding 5 seconds between requests: --delay=5)

Write --help for help."""
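
# Illustrative invocations (mirroring the usage text above):
#   python dumpgenerator.py --api=http://archiveteam.org/api.php --xml --images
#   python dumpgenerator.py --api=http://archiveteam.org/api.php --xml --curonly --delay=5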

def getParameters(params=[]):
    """ Parse command line parameters and build the config and other dicts """
    if not params:
        params = sys.argv[1:]
    config = {
        'curonly': False,
        'date': datetime.datetime.now().strftime('%Y%m%d'),
        'api': '',
        'index': '',
        'images': False,
        'logs': False,
        'xml': False,
        'namespaces': ['all'],
        'exnamespaces': [],
        'path': '',
        'cookies': '',
        'delay': 0,
    }
    other = {
        'resume': False,
        'filenamelimit': 100,  #do not change
        'force': False,
    }
    #console params
    try:
        opts, args = getopt.getopt(params, "", ["h", "help", "path=", "api=", "index=", "images", "logs", "xml", "curonly", "resume", "cookies=", "delay=", "namespaces=", "exnamespaces=", "force", ])
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err)  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("--path"):
            config["path"] = a
            while len(config["path"]) > 0:
                if config["path"][-1] == '/':  #will this cause problems with Windows paths?
                    config["path"] = config["path"][:-1]
                else:
                    break
        elif o in ("--api"):
            if not a.startswith('http://') and not a.startswith('https://'):
                print 'api.php must start with http:// or https://'
                sys.exit()
            config['api'] = a
        elif o in ("--index"):
            if not a.startswith('http://') and not a.startswith('https://'):
                print 'index.php must start with http:// or https://'
                sys.exit()
            config["index"] = a
        elif o in ("--images"):
            config["images"] = True
        elif o in ("--logs"):
            config["logs"] = True
        elif o in ("--xml"):
            config["xml"] = True
        elif o in ("--curonly"):
            if not config["xml"]:
                print "If you select --curonly, you must use --xml too"
                sys.exit()
            config["curonly"] = True
        elif o in ("--resume"):
            other["resume"] = True
        elif o in ("--cookies"):
            config["cookies"] = a
        elif o in ("--delay"):
            config["delay"] = int(a)
        elif o in ("--namespaces"):
            if re.search(r'[^\d, \-]', a) and a.lower() != 'all':  #fix, why - ? and... --namespaces= all with a space works?
                print "Invalid namespaces values.\nValid format is integer(s) splitted by commas"
                sys.exit()
            a = re.sub(' ', '', a)
            if a.lower() == 'all':
                config["namespaces"] = ['all']
            else:
                config["namespaces"] = [int(i) for i in a.split(',')]
        elif o in ("--exnamespaces"):
            if re.search(r'[^\d, \-]', a):
                print "Invalid exnamespaces values.\nValid format is integer(s) splitted by commas"
                sys.exit()
            a = re.sub(' ', '', a)
            if a.lower() == 'all':
                print 'You have excluded all namespaces. Error.'
                sys.exit()
            else:
                config["exnamespaces"] = [int(i) for i in a.split(',')]
        elif o in ("--force"):
            other["force"] = True
        else:
            assert False, "unhandled option"

    #missing mandatory params
    #(config['index'] and not re.search('/index\.php', config['index'])) or \ # in EditThis there is no index.php, it is empty editthis.info/mywiki/?title=...
    if (not config['api'] and not config['index']) or \
       (config['api'] and not re.search('/api\.php', config['api'])) or \
       not (config["xml"] or config["images"] or config["logs"]) or \
       (other['resume'] and not config['path']):
        usage()
        sys.exit()

    #user chose --api, but --index is necessary for Special:Export: we generate it
    if config['api'] and not config['index']:
        config['index'] = config['api'].split('api.php')[0] + 'index.php'
        # WARNING: remove index.php here for misconfigured sites like editthis.info, or provide --index directly
        print 'You didn\'t provide a path for index.php, we try this one:', config['index']

    if config['cookies']:
        cj = cookielib.MozillaCookieJar()
        cj.load(config['cookies'])
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        print 'Using cookies from %s' % config['cookies']

    if config['api']:
        #check api.php
        if checkAPI(config['api'], config):
            print 'api.php is OK'
        else:
            print 'Error in api.php, please, provide a correct path to api.php'
            sys.exit()

    if config['index']:
        #check index.php
        if checkIndexphp(config['index'], config):
            print 'index.php is OK'
        else:
            print 'Error in index.php, please, provide a correct path to index.php'
            sys.exit()

    #calculating path, if not defined by user with --path=
    if not config['path']:
        config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date'])

    return config, other
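
# Illustrative result of the defaults above: --api=http://archiveteam.org/api.php --xml
# with no --path yields config['path'] = './archiveteamorg-YYYYMMDD-wikidump',
# where YYYYMMDD is today's date as formatted in config['date'].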

def checkAPI(api, config={}):
    """ Checking API availability """
    req = urllib2.Request(url=api, headers={'User-Agent': getUserAgent()})
    f = urllib2.urlopen(req)
    raw = f.read()
    f.close()
    delay(config=config)
    print 'Checking api.php...', api
    if re.search(r'action=query', raw):
        return True
    return False

def checkIndexphp(indexphp, config={}):
    """ Checking index.php availability """
    req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
    f = urllib2.urlopen(req)
    raw = f.read()
    f.close()
    delay(config=config)
    print 'Checking index.php...', indexphp
    if re.search(r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)', raw) and not config['cookies']:  # Workaround for issue 71
        print "ERROR: This wiki requires login and we are not authenticated"
        return False
    if re.search(r'(This wiki is powered by|<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)', raw):
        return True
    return False

def removeIP(raw=''):
    """ Remove IP from HTML comments <!-- --> """
    raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
    #http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
    #weird cases as :: are not included
    raw = re.sub(r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', '0:0:0:0:0:0:0:0', raw)
    return raw

def checkXMLIntegrity(config={}):
    """ Check XML dump integrity, to detect broken XML chunks """
    return  # the check below is currently disabled and not reached

    print 'Verifying dump...'
    checktitles = 0
    checkpageopen = 0
    checkpageclose = 0
    checkrevisionopen = 0
    checkrevisionclose = 0
    for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines():
        if "<revision>" in line:
            checkrevisionopen += 1
        elif "</revision>" in line:
            checkrevisionclose += 1
        elif "<page>" in line:
            checkpageopen += 1
        elif "</page>" in line:
            checkpageclose += 1
        elif "<title>" in line:
            checktitles += 1
        else:
            continue
    if (checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose):
        pass
    else:
        print 'XML dump seems to be corrupted.'
        reply = ''
        while reply.lower() not in ['yes', 'y', 'no', 'n']:
            reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
        if reply.lower() in ['yes', 'y']:
            generateXMLDump(config=config, titles=titles)
        elif reply.lower() in ['no', 'n']:
            print 'Not generating a new dump.'
2013-03-27 21:26:20 +00:00
def createNewDump(config={}, other={}):
    """ Create a new dump from scratch """
    titles = []
    images = []
    print 'Trying generating a new dump into a new directory...'
    if config['xml']:
        titles += getPageTitles(config=config)
        saveTitles(config=config, titles=titles)
        generateXMLDump(config=config, titles=titles)
        checkXMLIntegrity(config=config)
    if config['images']:
        if config['api']:
            images += getImageFilenamesURLAPI(config=config)
        else:
            images += getImageFilenamesURL(config=config)
        saveImageFilenamesURL(config=config, images=images)
        generateImageDump(config=config, other=other, images=images)
    if config['logs']:
        saveLogs(config=config)

def resumePreviousDump(config={}, other={}):
    """ Resume an incomplete dump found in config['path'] """
    titles = []
    images = []
    print 'Resuming previous dump process...'
    if config['xml']:
        #load titles
        lasttitle = ''
        try:
            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
            raw = f.read()
            titles = raw.split('\n')
            lasttitle = titles[-1]
            if not lasttitle: #empty line at EOF?
                lasttitle = titles[-2]
            f.close()
        except:
            pass #probably the file does not exist
        if lasttitle == '--END--':
            #titles list is complete
            print 'Title list was completed in the previous session'
        else:
            print 'Title list is incomplete. Reloading...'
            #do not resume, reload, to avoid inconsistencies with deleted pages and the like
            titles = getPageTitles(config=config)
            saveTitles(config=config, titles=titles)
        #checking xml dump
        xmliscomplete = False
        lastxmltitle = ''
        try:
            f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
            for l in f:
                if re.findall('</mediawiki>', l):
                    #xml dump is complete
                    xmliscomplete = True
                    break
                xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if more than 1 is found, but maybe
                if xmltitles:
                    lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
            f.close()
        except:
            pass #probably the file does not exist
        #removing --END-- before getXMLs
        while titles and titles[-1] in ['', '--END--']:
            titles = titles[:-1]
        if xmliscomplete:
            print 'XML dump was completed in the previous session'
        elif lastxmltitle:
            #resuming...
            print 'Resuming XML dump from "%s"' % (lastxmltitle)
            generateXMLDump(config=config, titles=titles, start=lastxmltitle)
        else:
            #corrupt? only has the XML header?
            print 'XML is corrupt? Regenerating...'
            generateXMLDump(config=config, titles=titles)
    if config['images']:
        #load images
        lastimage = ''
        try:
            f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
            raw = f.read()
            lines = raw.split('\n')
            for l in lines:
                if re.search(r'\t', l):
                    images.append(l.split('\t'))
            lastimage = lines[-1]
            f.close()
        except:
            pass #probably the file does not exist
        if lastimage == '--END--':
            print 'Image list was completed in the previous session'
        else:
            print 'Image list is incomplete. Reloading...'
            #do not resume, reload, to avoid inconsistencies with deleted images and the like
            if config['api']:
                images = getImageFilenamesURLAPI(config=config)
            else:
                images = getImageFilenamesURL(config=config)
            saveImageFilenamesURL(config=config, images=images)
        #checking images directory
        listdir = []
        try:
            listdir = os.listdir('%s/images' % (config['path']))
        except:
            pass #probably the directory does not exist
        listdir.sort()
        complete = True
        lastfilename = ''
        lastfilename2 = ''
        c = 0
        for filename, url, uploader in images:
            lastfilename2 = lastfilename
            lastfilename = filename #keep always the complete filename, not the truncated one
            filename2 = filename
            if len(filename2) > other['filenamelimit']:
                filename2 = truncateFilename(other=other, filename=filename2)
            if filename2 not in listdir:
                complete = False
                break
            c += 1
        print '%d images were found in the directory from a previous session' % (c)
        if complete:
            #image dump is complete
            print 'Image dump was completed in the previous session'
        else:
            #resume from the previous image, which may be corrupted (or missing its .desc) because the previous session was aborted
            generateImageDump(config=config, other=other, images=images, start=lastfilename2)

    if config['logs']:
        #fix: resuming the logs download is not implemented yet
        pass

def saveSpecialVersion(config={}):
    """ Save Special:Version as .html, to preserve extension details """
    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        print 'Special:Version.html exists, do not overwrite'
    else:
        print 'Downloading Special:Version with extensions and other related info'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
        f = urllib2.urlopen(req)
        raw = f.read()
        f.close()
        delay(config=config)
        raw = removeIP(raw=raw)
        f = open('%s/Special:Version.html' % (config['path']), 'w')
        f.write(raw)
        f.close()

def saveIndexPHP(config={}):
    """ Save index.php as .html, to preserve the license details available at the bottom of the page """
    if os.path.exists('%s/index.html' % (config['path'])):
        print 'index.html exists, do not overwrite'
    else:
        print 'Downloading index.php (Main Page) as index.html'
        req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent()})
        f = urllib2.urlopen(req)
        raw = f.read()
        f.close()
        delay(config=config)
        raw = removeIP(raw=raw)
        f = open('%s/index.html' % (config['path']), 'w')
        f.write(raw)
        f.close()

def avoidWikimediaProjects(config={}, other={}):
    """ Skip Wikimedia projects and redirect to the dumps website """
    #notice about wikipedia dumps
    if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api'] + config['index']):
        print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
        print 'Download the dumps from http://dumps.wikimedia.org'
        #"other" is passed in so the force option can bypass this check
        if not other['force']:
            print 'Thanks!'
            sys.exit()

def main(params=[]):
    """ Main function """
    welcome()
    configfilename = 'config.txt'
    config, other = getParameters(params=params)
    avoidWikimediaProjects(config=config, other=other)

    print 'Analysing %s' % (config['api'] and config['api'] or config['index'])

    #creating path or resuming if desired
    c = 2
    originalpath = config['path'] # to avoid concat blabla-2, blabla-2-3, and so on...
    while not other['resume'] and os.path.isdir(config['path']): #do not enter if resume is requested from the beginning
        print '\nWarning!: "%s" path exists' % (config['path'])
        reply = ''
        while reply.lower() not in ['yes', 'y', 'no', 'n']:
            reply = raw_input('There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' % (config['path'], config['path'], configfilename))
        if reply.lower() in ['yes', 'y']:
            if not os.path.isfile('%s/%s' % (config['path'], configfilename)):
                print 'No config file found. I can\'t resume. Aborting.'
                sys.exit()
            print 'You have selected: YES'
            other['resume'] = True
            break
        elif reply.lower() in ['no', 'n']:
            print 'You have selected: NO'
            other['resume'] = False
        config['path'] = '%s-%d' % (originalpath, c)
        print 'Trying to use path "%s"...' % (config['path'])
        c += 1

    if other['resume']:
        print 'Loading config file...'
        config = loadConfig(config=config, configfilename=configfilename)
    else:
        os.mkdir(config['path'])
        saveConfig(config=config, configfilename=configfilename)

    if other['resume']:
        resumePreviousDump(config=config, other=other)
    else:
        createNewDump(config=config, other=other)

    saveIndexPHP(config=config)
    saveSpecialVersion(config=config)

    bye()

if __name__ == "__main__":
    main()
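
# Usage sketch (not from the original source; the exact flags are parsed in getParameters(),
# see the tutorial linked in the header for the authoritative list):
#   python dumpgenerator.py --api=http://wiki.example.org/w/api.php --xml --images
# Re-running the same command after an interruption offers to resume from the existing
# wikidump directory, reusing the config.txt saved there.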