# -*- coding: utf-8 -*-

# Copyright (C) 2011 WikiTeam
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import cPickle
import datetime
import getopt
import md5
import os
import re
import subprocess
import sys
import time
import urllib
import urllib2

# todo:
# curonly and full history (curonly can batch several requests into a single GET; for full history, request each page one by one)
# use the API, or parse the HTML when it is not available
# http://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export
# threads to download faster? request several pages at a time
# Special:Log? uploads, account creations, etc
# download Special:Version to save which extensions it uses
# save index.php (the main page) as index.html so the wiki license is visible at the very bottom
# fix use api when available

def truncateFilename(other={}, filename=''):
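    #keep the extension; cut the name to 'filenamelimit' chars and append the md5 of the full name to avoid collisions
    #e.g. a 150-char filename becomes its first 100 chars + 32 md5 hex chars + '.' + its original extension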
    return filename[:other['filenamelimit']] + md5.new(filename).hexdigest() + '.' + filename.split('.')[-1]

def delay(config={}):
    if config['delay'] > 0:
        print 'Sleeping... %d seconds...' % (config['delay'])
        time.sleep(config['delay'])

def cleanHTML(raw=''):
    #<!-- bodytext --> <!-- /bodytext -->
    #<!-- start content --> <!-- end content -->
    #<!-- Begin Content Area --> <!-- End Content Area -->
    if re.search('<!-- bodytext -->', raw):
        raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
    elif re.search('<!-- start content -->', raw):
        raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0]
    elif re.search('<!-- Begin Content Area -->', raw):
        raw = raw.split('<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
    else:
        print 'This wiki doesn\'t use marks to split the content'
        sys.exit()
    return raw

def getNamespaces(config={}):
    #namespace checks and stuff
    #fix: get namespaces from a random Special:Export page, it is better
    #also available from the API: http://wikiindex.org/api.php?action=query&meta=siteinfo&siprop=general|namespaces
    namespaces = config['namespaces']
    namespacenames = {0: ''} # main is 0, no prefix
    if namespaces:
        raw = urllib.urlopen('%s?title=Special:Allpages' % (config['index'])).read()
        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? to include selected="selected"
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
        else:
            #check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]
    #retrieve all titles from Special:Allpages; if the wiki is big, perhaps there are sub-Allpages to explore
    namespaces = [i for i in set(namespaces)] #uniques
    print '%d namespaces have been found' % (len(namespaces))
    return namespaces, namespacenames

def getPageTitlesAPI(config={}):
    titles = []
    namespaces, namespacenames = getNamespaces(config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            print 'Skipping namespace =', namespace
            continue
        c = 0
        print 'Retrieving titles in the namespace', namespace
        headers = {'User-Agent': getUserAgent()}
        apfrom = '!'
        while apfrom:
            params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom, 'format': 'xml', 'aplimit': 500}
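            #list=allpages returns at most 'aplimit' titles per request; the reply carries a new 'apfrom' value to continue from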
            data = urllib.urlencode(params)
            req = urllib2.Request(url=config['api'], data=data, headers=headers)
            try:
                f = urllib2.urlopen(req)
            except:
                try:
                    print 'Server is slow... Waiting some seconds and retrying...'
                    time.sleep(10)
                    f = urllib2.urlopen(req)
                except:
                    print 'An error has occurred while retrieving page titles with the API'
                    print 'Please, resume the dump, --resume'
                    sys.exit()
            xml = f.read()
            f.close()
            m = re.findall(r'<allpages apfrom="([^>]+)" />', xml)
            if m:
                apfrom = undoHTMLEntities(text=m[0]) #&quot; = ", etc
            else:
                apfrom = ''
            m = re.findall(r'title="([^>]+)" />', xml)
            titles += [undoHTMLEntities(title) for title in m]
            c += len(m)
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles

def getPageTitlesScrapper(config={}):
    titles = []
    namespaces, namespacenames = getNamespaces(config=config)
    for namespace in namespaces:
        print 'Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        raw = urllib.urlopen(url).read()
        raw = cleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        else:
            pass #perhaps no subpages

        deep = 3 # 3 is the current depth of English Wikipedia's Special:Allpages, 3 levels
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            #load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')
                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['index'], namespace, fr, to) #do not put urllib.quote in fr or to
                elif r_suballpages == r_suballpages2: #fix: does this regexp miss some subpages? or does r_title fail on this kind of subpage? (wikiindex)
                    fr = fr.split('&amp;namespace=')[0] #clean &amp;namespace=\d, sometimes happens
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (config['index'], name, namespace)

                if not name in checked_suballpages:
                    checked_suballpages.append(name) #to avoid reloading duplicate subpage links
                    raw2 = urllib.urlopen(url).read()
                    raw2 = cleanHTML(raw2)
                    rawacum += raw2 #merge it after removing the junk
                    print 'Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            if not i.group('title').startswith('Special:'):
                if not i.group('title') in titles:
                    titles.append(i.group('title'))
                    c += 1
        print '%d titles retrieved in the namespace %d' % (c, namespace)
    return titles

def getPageTitles(config={}):
    #Get page titles parsing Special:Allpages or using the API (fix)
    #http://en.wikipedia.org/wiki/Special:AllPages
    #http://archiveteam.org/index.php?title=Special:AllPages
    #http://www.wikanda.es/wiki/Especial:Todas
    print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None')
    print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None')

    titles = []
    if config['api']:
        titles = getPageTitlesAPI(config=config)
    elif config['index']:
        titles = getPageTitlesScrapper(config=config)
    print '%d page titles loaded' % (len(titles))
    return titles

def getXMLHeader(config={}):
    #get the header of a random page, to attach it in the complete XML backup
    #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x....
    randomtitle = 'AMF5LKE43MNFGHKSDMRTJ'
    xml = getXMLPage(config=config, title=randomtitle)
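    #a page with that random title almost certainly does not exist, so the export holds only the <mediawiki>/<siteinfo> preamble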
    header = xml.split('</mediawiki>')[0]
    return header

def getXMLFileDesc(config={}, title=''):
    config['curonly'] = 1 #tricky to get only the most recent desc
    return getXMLPage(config=config, title=title)

def getUserAgent():
    useragents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4']
    return useragents[0]

def getXMLPage(config={}, title=''):
    #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
    limit = 1000
    truncated = False
    title_ = title
    title_ = re.sub(' ', '_', title_)
    title_ = re.sub('&', '%26', title_) # titles with & need to be converted into %26
    headers = {'User-Agent': getUserAgent()}
    params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', }
    if config['curonly']:
        params['curonly'] = 1
    else:
        params['offset'] = '1'
        params['limit'] = limit
    data = urllib.urlencode(params)
    req = urllib2.Request(url=config['index'], data=data, headers=headers)
    try:
        f = urllib2.urlopen(req)
    except:
        try:
            print 'Server is slow... Waiting some seconds and retrying...'
            time.sleep(10)
            f = urllib2.urlopen(req)
        except:
            print 'An error has occurred while retrieving "%s"' % (title)
            print 'Please, resume the dump, --resume'
            sys.exit()
    xml = f.read()

    #if complete history, check if this page history has > limit edits; if so, retrieve them all using offset if available
    #else, warn that Special:Export truncates large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
    if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one
        while not truncated and params['offset']:
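            #Special:Export paginates by timestamp: sending the last timestamp seen as 'offset' requests the next chunk of revisions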
            params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the accumulated XML
            data = urllib.urlencode(params)
            req2 = urllib2.Request(url=config['index'], data=data, headers=headers)
            try:
                f2 = urllib2.urlopen(req2)
            except:
                try:
                    print 'Server is slow... Waiting some seconds and retrying...'
                    time.sleep(10)
                    f2 = urllib2.urlopen(req2)
                except:
                    print 'An error has occurred while retrieving', title
                    print 'Please, resume the dump, --resume'
                    sys.exit()
            xml2 = f2.read()
            if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk?
                if re.findall(r_timestamp, xml2)[-1] == params['offset']:
                    #the same XML again: this wiki does not support the params in Special:Export and offers the complete XML only up to X edits (usually 1000)
                    print 'ATTENTION: This wiki does not allow some parameters in Special:Export, so pages with large histories may be truncated'
                    truncated = True
                    break
                else:
                    #offset works in this wiki, merge with the previous chunk of this page history and continue
                    xml = xml.split('</page>')[0] + xml2.split('<page>\n')[1]
            else:
                params['offset'] = '' #no more edits in this page history
    print title, len(re.findall(r_timestamp, xml)), 'edits'
    return xml

def cleanXML(xml=''):
    #do not touch the XML encoding, keep it as is
    xml = xml.split('</siteinfo>\n')[1]
    xml = xml.split('</mediawiki>')[0]
    return xml

def generateXMLDump(config={}, titles=[], start=''):
    print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
    header = getXMLHeader(config=config)
    footer = '</mediawiki>\n' #new line at the end
    xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True
    if start:
        #remove the last chunk of the xml dump (it is probably incomplete)
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
        xml = xmlfile.read()
        xmlfile.close()
        xml = xml.split('<title>%s</title>' % (start))[0]
        xml = '\n'.join(xml.split('\n')[:-2]) # [:-1] removing <page>\n tag
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write('%s\n' % (xml))
        xmlfile.close()
    else:
        #requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.close()

    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for title in titles:
        if title == start: #start downloading from start, included
            lock = False
        if lock:
            continue
        delay(config=config)
        if c % 10 == 0:
            print 'Downloaded %d pages' % (c)
        xml = getXMLPage(config=config, title=title)
        xml = cleanXML(xml=xml)
        xmlfile.write(xml)
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    print 'XML dump saved at...', xmlfilename

def saveTitles(config={}, titles=[]):
    #save titles in a txt for resume if needed
    titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
    titlesfile.write('\n'.join(titles))
    titlesfile.write('\n--END--')
    titlesfile.close()
    print 'Titles saved at...', titlesfilename

def saveImageFilenamesURL(config={}, images=[]):
    #save list of images and their urls
    imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
    imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
    imagesfile.write('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]))
    imagesfile.write('\n--END--')
    imagesfile.close()
    print 'Image filenames and URLs saved at...', imagesfilename

def getImageFilenamesURL(config={}):
    #fix: 'start' is only available when parsing from the API; if not, reloading the whole list from Special:Imagelist is mandatory
    print 'Retrieving image filenames'
    r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;' # (?<! is a negative lookbehind, see http://docs.python.org/library/re.html
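    #it skips the offset inside the 'previous page' link (which carries &amp;dir=prev), so only the 'next page' offset is followed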
    images = []
    offset = '29990101000000' #january 1, 2999
    while offset:
        url = '%s?title=Special:Imagelist&limit=5000&offset=%s' % (config['index'], offset)
        raw = urllib.urlopen(url).read()
        raw = cleanHTML(raw)
        #archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        #wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a href="/w/index.php?title=Usuario:Fernandocg&action=edit&redlink=1" class="new" title="Usuario:Fernandocg (página no existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        #wikijuegos 1.9.5 http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        #gentoowiki 1.18 <tr><td class="TablePager_col_img_timestamp">18:15, 3 April 2011</td><td class="TablePager_col_img_name"><a href="/wiki/File:Asus_eeepc-1201nl.png" title="File:Asus eeepc-1201nl.png">Asus_eeepc-1201nl.png</a> (<a href="/w/images/2/2b/Asus_eeepc-1201nl.png">file</a>)</td><td class="TablePager_col_thumb"><a href="/wiki/File:Asus_eeepc-1201nl.png" class="image"><img alt="" src="/w/images/thumb/2/2b/Asus_eeepc-1201nl.png/180px-Asus_eeepc-1201nl.png" width="180" height="225" /></a></td><td class="TablePager_col_img_size">37 KB</td><td class="TablePager_col_img_user_text"><a href="/w/index.php?title=User:Yannails&action=edit&redlink=1" class="new" title="User:Yannails (page does not exist)">Yannails</a></td><td class="TablePager_col_img_description"> </td><td class="TablePager_col_count">1</td></tr>
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'

        m = []
        #different mediawiki versions
        if re.search(r_images1, raw):
            m = re.compile(r_images1).finditer(raw)
        elif re.search(r_images2, raw):
            m = re.compile(r_images2).finditer(raw)
        elif re.search(r_images3, raw):
            m = re.compile(r_images3).finditer(raw)

        for i in m:
            url = i.group('url')
            if url[0] == '/' or not url.startswith('http://'): #relative URL
                if url[0] == '/': #it is added later
                    url = url[1:]
                domainalone = config['index'].split('http://')[1].split('/')[0]
                url = 'http://%s/%s' % (domainalone, url)
            url = undoHTMLEntities(text=url)
            #url = urllib.unquote(url) #do not use unquote with url, it breaks some urls with odd chars
            url = re.sub(' ', '_', url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = undoHTMLEntities(text=filename)
            filename = urllib.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = undoHTMLEntities(text=uploader)
            uploader = urllib.unquote(uploader)
            images.append([filename, url, uploader])
            #print filename, url

        if re.search(r_next, raw):
            offset = re.findall(r_next, raw)[0]
        else:
            offset = ''

    print 'Found %d images' % (len(images))
    images.sort()
    return images

def undoHTMLEntities(text=''):
    text = re.sub('&lt;', '<', text) # i guess only < > & " need conversion http://www.w3schools.com/html/html_entities.asp
    text = re.sub('&gt;', '>', text)
    text = re.sub('&amp;', '&', text)
    text = re.sub('&quot;', '"', text)
    return text

def generateImageDump(config={}, other={}, images=[], start=''):
    #slurp all the images
    #save in a .tar?
    #keep in mind http://www.mediawiki.org/wiki/Manual:ImportImages.php
    #fix, download .desc ? YEP!
    #fix download the upload log too, for uploader info and date
    print 'Retrieving images from "%s"' % (start and start or 'start')
    imagepath = '%s/images' % (config['path'])
    if os.path.isdir(imagepath):
        print 'An images directory already exists for this dump' #fix, resume?
    else:
        os.makedirs(imagepath)

    c = 0
    lock = True
    if not start:
        lock = False
    for filename, url, uploader in images:
        if filename == start: #start downloading from start, included
            lock = False
        if lock:
            continue
        delay(config=config)

        #saving file
        #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit); .desc is appended to the filename later, so 100 is a safer max)
        filename2 = filename
        if len(filename2) > other['filenamelimit']:
            # split off the last . (extension) and then merge
            filename2 = truncateFilename(other=other, filename=filename2)
            print 'Truncating filename, it is too long. Now it is called:', filename2
        urllib.urlretrieve(url, '%s/%s' % (imagepath, filename2))

        #saving description if any
        xmlfiledesc = getXMLFileDesc(config=config, title='Image:%s' % (filename))
        f = open('%s/%s.desc' % (imagepath, filename2), 'w')
        if re.search(r'<text xml:space="preserve" />', xmlfiledesc):
            #empty desc
            xmlfiledesc = ''
        elif re.search(r'<text xml:space="preserve">', xmlfiledesc):
            xmlfiledesc = xmlfiledesc.split('<text xml:space="preserve">')[1].split('</text>')[0]
            xmlfiledesc = undoHTMLEntities(text=xmlfiledesc)
        else: #failure when retrieving desc?
            xmlfiledesc = ''
        f.write(xmlfiledesc)
        f.close()
        c += 1
        if c % 10 == 0:
            print 'Downloaded %d images' % (c)

    print 'Downloaded %d images' % (c)

def saveLogs(config={}):
    #get all logs from Special:Log
    """parse
    <select name='type'>
    <option value="block">Bloqueos de usuarios</option>
    <option value="rights">Cambios de perfil de usuario</option>
    <option value="protect" selected="selected">Protecciones de páginas</option>
    <option value="delete">Registro de borrados</option>
    <option value="newusers">Registro de creación de usuarios</option>
    <option value="merge">Registro de fusiones</option>
    <option value="import">Registro de importaciones</option>
    <option value="patrol">Registro de revisiones</option>
    <option value="move">Registro de traslados</option>
    <option value="upload">Subidas de archivos</option>
    <option value="">Todos los registros</option>
    </select>
    """
    delay(config=config)

def domain2prefix(config={}):
    domain = ''
    if config['api']:
        domain = config['api']
    elif config['index']:
        domain = config['index']
    domain = domain.lower()
    domain = re.sub(r'(http://|www\.|/index\.php|/api\.php)', '', domain)
    domain = re.sub(r'/', '_', domain)
    domain = re.sub(r'\.', '', domain)
    domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
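    #e.g. 'http://www.archiveteam.org/api.php' -> 'archiveteamorg'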
    return domain

def loadConfig(config={}, configfilename=''):
    f = open('%s/%s' % (config['path'], configfilename), 'r')
    config = cPickle.load(f)
    f.close()
    return config

def saveConfig(config={}, configfilename=''):
    f = open('%s/%s' % (config['path'], configfilename), 'w')
    cPickle.dump(config, f)
    f.close()

def welcome():
    print "#"*73
    print """# Welcome to DumpGenerator 0.1 by WikiTeam (GPL v3)                     #
# More info at: http://code.google.com/p/wikiteam/                      #"""
    print "#"*73
    print ''
    print "#"*73
    print """# Copyright (C) 2011 WikiTeam                                           #
# This program is free software: you can redistribute it and/or modify  #
# it under the terms of the GNU General Public License as published by  #
# the Free Software Foundation, either version 3 of the License, or     #
# (at your option) any later version.                                   #
#                                                                       #
# This program is distributed in the hope that it will be useful,       #
# but WITHOUT ANY WARRANTY; without even the implied warranty of        #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          #
# GNU General Public License for more details.                          #
#                                                                       #
# You should have received a copy of the GNU General Public License     #
# along with this program. If not, see <http://www.gnu.org/licenses/>.  #"""
    print "#"*73
    print ''

def bye():
    print "Your dump is complete"
    print "If you found any bug, report a new issue here (Gmail account required): http://code.google.com/p/wikiteam/issues/list"
    print "Good luck! Bye!"

def usage():
    print "Write a complete help"

def getParameters():
    config = {
        'curonly': False,
        'date': datetime.datetime.now().strftime('%Y%m%d'),
        'api': '',
        'index': '',
        'images': False,
        'logs': False,
        'xml': False,
        'namespaces': ['all'],
        'exnamespaces': [],
        'path': '',
        'threads': 1, #fix not coded yet
        'delay': 0,
    }
    other = {
        'resume': False,
        'filenamelimit': 100, #do not change
    }
    #console params
    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["h", "help", "path=", "api=", "index=", "images", "logs", "xml", "curonly", "threads=", "resume", "delay=", "namespaces=", "exnamespaces=", ])
    except getopt.GetoptError, err:
        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
        elif o in ("--path"):
            config["path"] = a
            while len(config["path"]) > 0:
                if config["path"][-1] == '/': #will this cause problems with windows paths?
                    config["path"] = config["path"][:-1]
                else:
                    break
        elif o in ("--api"):
            config['api'] = a
        elif o in ("--index"):
            config["index"] = a
        elif o in ("--images"):
            config["images"] = True
        elif o in ("--logs"):
            config["logs"] = True
        elif o in ("--xml"):
            config["xml"] = True
        elif o in ("--curonly"):
            if not config["xml"]:
                print "If you select --curonly, you must use --xml too"
                sys.exit()
            config["curonly"] = True
        elif o in ("--threads"):
            config["threads"] = int(a)
        elif o in ("--resume"):
            other["resume"] = True
        elif o in ("--delay"):
            config["delay"] = int(a)
        elif o in ("--namespaces"):
            if re.search(r'[^\d,\-]', a) and a.lower() != 'all':
                print "Invalid namespaces values.\nValid format is integer(s) separated by commas"
                sys.exit()
            a = re.sub(' ', '', a)
            if a.lower() == 'all':
                config["namespaces"] = ['all']
            else:
                config["namespaces"] = [int(i) for i in a.split(',')]
        elif o in ("--exnamespaces"):
            if re.search(r'[^\d,\-]', a):
                print "Invalid exnamespaces values.\nValid format is integer(s) separated by commas"
                sys.exit()
            a = re.sub(' ', '', a)
            if a.lower() == 'all':
                print 'You have excluded all namespaces. Error.'
                sys.exit()
            else:
                config["exnamespaces"] = [int(i) for i in a.split(',')]
        else:
            assert False, "unhandled option"

    #missing mandatory params
    #(config['index'] and not re.search('/index\.php', config['index'])) or \ # in EditThis there is no index.php, it is empty editthis.info/mywiki/?title=...
    if (not config['api'] and not config['index']) or \
       (config['api'] and not re.search('/api\.php', config['api'])) or \
       not (config["xml"] or config["images"] or config["logs"]) or \
       (other['resume'] and not config['path']):
        print """Error. You forgot some mandatory parameters:

    --api or --index: URL to api.php or to index.php, one of them. If the wiki has api.php, please use --api instead of --index. Examples: --api=http://archiveteam.org/api.php or --index=http://archiveteam.org/index.php

And one of these, or two or three:
    --xml: generates an XML dump. It retrieves the full history of pages located in namespace = 0 (articles)
           If you want more namespaces, use the parameter --namespaces=0,1,2,3... or --namespaces=all
    --images: generates an image dump
    --logs: generates a log dump

You can resume previous incomplete dumps:
    --resume: resumes a previous incomplete dump. When using --resume, --path is mandatory (path to the directory where the incomplete dump is).

You can exclude namespaces:
    --exnamespaces: write the numbers of the namespaces you want to exclude, separated by commas.

Write --help for help."""
        sys.exit()
    #usage()

    #the user chose --api; --index is necessary for Special:Export, so we generate it
    if config['api'] and not config['index']:
        config['index'] = config['api'].split('api.php')[0] + 'index.php'
        #print 'You didn\'t provide a path for index.php, trying to guess one:', config['index']

    if config['api']:
        #fix: add here an api.php existence check
        f = urllib.urlopen(config['api'])
        raw = f.read()
        f.close()
        print 'Checking api.php...'
        if re.search(r'action=query', raw):
            print 'api.php is OK'
        else:
            print 'Error in api.php, please, provide a correct path to api.php'
            sys.exit()

    if config['index']:
        #check index.php
        f = urllib.urlopen('%s?title=Special:Version' % (config['index']))
        raw = f.read()
        f.close()
        print 'Checking index.php...'
        if re.search(r'This wiki is powered by', raw):
            print 'index.php is OK'
        else:
            print 'Error in index.php, please, provide a correct path to index.php'
            sys.exit()

    #adding http:// if missing
    if not config['index'] and not config['api'].startswith('http://'):
        config['api'] = 'http://' + config['api']
    if not config['api'] and not config['index'].startswith('http://'):
        config['index'] = 'http://' + config['index']

    #calculating path, if not defined by user with --path=
    if not config['path']:
        config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date'])

    return config, other

def removeIP(raw=''):
    raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
    #http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
    #weird cases like :: are not covered
    raw = re.sub(r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', '0:0:0:0:0:0:0:0', raw)
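    #e.g. 'edited by 81.2.69.142' becomes 'edited by 0.0.0.0'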
    return raw

def main():
    welcome()
    configfilename = 'config.txt'
    config, other = getParameters()

    #notice about wikipedia dumps
    if re.findall(r'(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews)\.org', config['api'] + config['index']):
        print 'DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\nDownload the dumps from http://download.wikimedia.org\nThanks!'
        sys.exit()

    print 'Analysing %s' % (config['api'] and config['api'] or config['index'])
    #creating path or resuming if desired
    c = 2
    originalpath = config['path'] # to avoid concat blabla-2, blabla-2-3, and so on...
    while not other['resume'] and os.path.isdir(config['path']): #do not enter if resume is requested from the beginning
        print '\nWarning!: "%s" path exists' % (config['path'])
        reply = ''
        while reply not in ['yes', 'y', 'no', 'n']:
            reply = raw_input('There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' % (config['path'], config['path'], configfilename))
        if reply.lower() in ['yes', 'y']:
            if not os.path.isfile('%s/%s' % (config['path'], configfilename)):
                print 'No config file found. I can\'t resume. Aborting.'
                sys.exit()
            print 'You have selected YES'
            other['resume'] = True
            break
        elif reply.lower() in ['no', 'n']:
            print 'You have selected NO'
            other['resume'] = False
        config['path'] = '%s-%d' % (originalpath, c)
        print 'Trying "%s"...' % (config['path'])
        c += 1

    if other['resume']:
        print 'Loading config file...'
        config = loadConfig(config=config, configfilename=configfilename)
    else:
        os.mkdir(config['path'])
        saveConfig(config=config, configfilename=configfilename)

    titles = []
    images = []
    if other['resume']:
        print 'Resuming previous dump process...'
        if config['xml']:
            #load titles
            lasttitle = ''
            try:
                f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
                raw = f.read()
                titles = raw.split('\n')
                lasttitle = titles[-1]
                if not lasttitle: #empty line at EOF ?
                    lasttitle = titles[-2]
                f.close()
            except:
                pass #probably the file does not exist
            if lasttitle == '--END--':
                #titles list is complete
                print 'Title list was completed in the previous session'
            else:
                print 'Title list is incomplete. Reloading...'
                #do not resume, reload, to avoid inconsistencies from deleted pages and the like
                titles = getPageTitles(config=config)
                saveTitles(config=config, titles=titles)
            #checking xml dump
            xmliscomplete = False
            lastxmltitle = ''
            try:
                f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r')
                for l in f:
                    if re.findall('</mediawiki>', l):
                        #xml dump is complete
                        xmliscomplete = True
                        break
                    xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if it finds more than one, but maybe
                    if xmltitles:
                        lastxmltitle = xmltitles[-1]
                f.close()
            except:
                pass #probably the file does not exist
            if xmliscomplete:
                print 'XML dump was completed in the previous session'
            elif lastxmltitle:
                #resuming...
                print 'Resuming XML dump from "%s"' % (lastxmltitle)
                generateXMLDump(config=config, titles=titles, start=lastxmltitle)
            else:
                #corrupt? does it only have the XML header?
                print 'XML is corrupt? Regenerating...'
                generateXMLDump(config=config, titles=titles)

        if config['images']:
            #load images
            lastimage = ''
            try:
                f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
                raw = f.read()
                lines = raw.split('\n')
                for l in lines:
                    if re.search(r'\t', l):
                        images.append(l.split('\t'))
                lastimage = lines[-1]
                f.close()
            except:
                pass #probably the file does not exist
            if lastimage == '--END--':
                print 'Image list was completed in the previous session'
            else:
                print 'Image list is incomplete. Reloading...'
                #do not resume, reload, to avoid inconsistencies from deleted images and the like
                images = getImageFilenamesURL(config=config)
                saveImageFilenamesURL(config=config, images=images)
            #checking images directory
            listdir = []
            try:
                listdir = os.listdir('%s/images' % (config['path']))
            except:
                pass #probably the directory does not exist
            listdir.sort()
            complete = True
            lastfilename = ''
            lastfilename2 = ''
            c = 0
            for filename, url, uploader in images:
                filename2 = filename
                if len(filename2) > other['filenamelimit']:
                    filename2 = truncateFilename(other=other, filename=filename2)
                if filename2 not in listdir:
                    complete = False
                    lastfilename2 = lastfilename
                    lastfilename = filename #always keep the complete filename, not the truncated one
                    break
                c += 1
            print '%d images were found in the directory from a previous session' % (c)
            lastfilename = lastfilename2 # we resume from the previous image, which may have been corrupted when the previous session was aborted or ctrl-c'ed
            if complete:
                #image dump is complete
                print 'Image dump was completed in the previous session'
            else:
                generateImageDump(config=config, other=other, images=images, start=lastfilename)

        if config['logs']:
            #fix
            pass
    else:
        print 'Trying to generate a new dump into a new directory...'
        if config['xml']:
            titles += getPageTitles(config=config)
            saveTitles(config=config, titles=titles)
            generateXMLDump(config=config, titles=titles)
        if config['images']:
            images += getImageFilenamesURL(config=config) #fix add start like above
            saveImageFilenamesURL(config=config, images=images)
            generateImageDump(config=config, other=other, images=images)
        if config['logs']:
            saveLogs(config=config)

    #save index.php as html, for the license details at the bottom of the page
    print 'Downloading index.php (Main Page)'
    f = urllib.urlopen(config['index'])
    raw = f.read()
    raw = removeIP(raw=raw)
    f = open('%s/index.html' % (config['path']), 'w')
    f.write(raw)
    f.close()

    #save Special:Version as html, for the extension details
    print 'Downloading Special:Version with extensions and other related info'
    f = urllib.urlopen('%s?title=Special:Version' % (config['index']))
    raw = f.read()
    raw = removeIP(raw=raw)
    f = open('%s/Special:Version.html' % (config['path']), 'w')
    f.write(raw)
    f.close()

    bye()

if __name__ == "__main__":
    main()