|
|
@ -6,12 +6,12 @@
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
|
|
# (at your option) any later version.
|
|
|
|
# (at your option) any later version.
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
|
@ -30,11 +30,11 @@ def main():
|
|
|
|
if len(sys.argv) < 2:
|
|
|
|
if len(sys.argv) < 2:
|
|
|
|
print 'python script.py file-with-apis.txt'
|
|
|
|
print 'python script.py file-with-apis.txt'
|
|
|
|
sys.exit()
|
|
|
|
sys.exit()
|
|
|
|
|
|
|
|
|
|
|
|
print 'Reading list of APIs from', sys.argv[1]
|
|
|
|
print 'Reading list of APIs from', sys.argv[1]
|
|
|
|
wikis = open(sys.argv[1], 'r').read().splitlines()
|
|
|
|
wikis = open(sys.argv[1], 'r').read().splitlines()
|
|
|
|
print '%d APIs found' % (len(wikis))
|
|
|
|
print '%d APIs found' % (len(wikis))
|
|
|
|
|
|
|
|
|
|
|
|
for wiki in wikis:
|
|
|
|
for wiki in wikis:
|
|
|
|
print "#"*73
|
|
|
|
print "#"*73
|
|
|
|
print "# Downloading", wiki
|
|
|
|
print "# Downloading", wiki
|
|
|
@ -42,7 +42,7 @@ def main():
|
|
|
|
wiki = wiki.lower()
|
|
|
|
wiki = wiki.lower()
|
|
|
|
# Make the prefix in standard way; api and index must be defined, not important which is which
|
|
|
|
# Make the prefix in standard way; api and index must be defined, not important which is which
|
|
|
|
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
|
|
|
|
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
|
|
|
|
|
|
|
|
|
|
|
|
#check if compressed, in that case dump was finished previously
|
|
|
|
#check if compressed, in that case dump was finished previously
|
|
|
|
compressed = False
|
|
|
|
compressed = False
|
|
|
|
for f in os.listdir('.'):
|
|
|
|
for f in os.listdir('.'):
|
|
|
@ -50,7 +50,7 @@ def main():
|
|
|
|
compressed = True
|
|
|
|
compressed = True
|
|
|
|
zipfilename = f
|
|
|
|
zipfilename = f
|
|
|
|
break #stop searching, do not explore subdirectories
|
|
|
|
break #stop searching, do not explore subdirectories
|
|
|
|
|
|
|
|
|
|
|
|
if compressed:
|
|
|
|
if compressed:
|
|
|
|
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
|
|
|
|
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
|
|
|
|
# Get the archive's file list.
|
|
|
|
# Get the archive's file list.
|
|
|
@ -65,17 +65,17 @@ def main():
|
|
|
|
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
|
|
|
|
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
|
|
|
|
# TODO: Find a way like grep -q below without doing a 7z l multiple times?
|
|
|
|
# TODO: Find a way like grep -q below without doing a 7z l multiple times?
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
#download
|
|
|
|
#download
|
|
|
|
started = False #was this wiki download started before? then resume
|
|
|
|
started = False #was this wiki download started before? then resume
|
|
|
|
wikidir = ''
|
|
|
|
wikidir = ''
|
|
|
|
for f in os.listdir('.'):
|
|
|
|
for f in os.listdir('.'):
|
|
|
|
# Does not find numbered wikidumps nor verify directories
|
|
|
|
# Does not find numbered wikidumps nor verify directories
|
|
|
|
if d.startswith(prefix) and d.endswith('wikidump'):
|
|
|
|
if f.startswith(prefix) and f.endswith('wikidump'):
|
|
|
|
wikidir = d
|
|
|
|
wikidir = f
|
|
|
|
started = True
|
|
|
|
started = True
|
|
|
|
break #stop searching, do not explore subdirectories
|
|
|
|
break #stop searching, do not explore subdirectories
|
|
|
|
|
|
|
|
|
|
|
|
# time.sleep(60)
|
|
|
|
# time.sleep(60)
|
|
|
|
# Uncomment the line above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms
|
|
|
|
# Uncomment the line above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms
|
|
|
|
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
|
|
|
|
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
|
|
|
@ -89,12 +89,12 @@ def main():
|
|
|
|
#save wikidir now
|
|
|
|
#save wikidir now
|
|
|
|
for f in os.listdir('.'):
|
|
|
|
for f in os.listdir('.'):
|
|
|
|
# Does not find numbered wikidumps nor verify directories
|
|
|
|
# Does not find numbered wikidumps nor verify directories
|
|
|
|
if d.startswith(prefix) and d.endswith('wikidump'):
|
|
|
|
if f.startswith(prefix) and f.endswith('wikidump'):
|
|
|
|
wikidir = d
|
|
|
|
wikidir = f
|
|
|
|
break #stop searching, do not explore subdirectories
|
|
|
|
break #stop searching, do not explore subdirectories
|
|
|
|
|
|
|
|
|
|
|
|
prefix = wikidir.split('-wikidump')[0]
|
|
|
|
prefix = wikidir.split('-wikidump')[0]
|
|
|
|
|
|
|
|
|
|
|
|
finished = False
|
|
|
|
finished = False
|
|
|
|
if started and wikidir and prefix:
|
|
|
|
if started and wikidir and prefix:
|
|
|
|
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
|
|
|
|
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
|
|
|
@ -103,7 +103,7 @@ def main():
|
|
|
|
finished = True
|
|
|
|
finished = True
|
|
|
|
# You can also issue this on your working directory to find all incomplete dumps:
|
|
|
|
# You can also issue this on your working directory to find all incomplete dumps:
|
|
|
|
# tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"
|
|
|
|
# tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"
|
|
|
|
|
|
|
|
|
|
|
|
#compress
|
|
|
|
#compress
|
|
|
|
if finished:
|
|
|
|
if finished:
|
|
|
|
time.sleep(1)
|
|
|
|
time.sleep(1)
|
|
|
|