Merge branch '2015/iterators' of https://github.com/nemobis/wikiteam into nemobis-2015/iterators

pull/228/head
Federico Leva 9 years ago
commit 632b99ea53

@@ -19,6 +19,10 @@
# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki
try:
from kitchen.text.converters import getwriter
except ImportError:
print "Please install the kitchen module."
import cookielib
import cPickle
import datetime
@@ -42,6 +46,8 @@ except ImportError:
sys.exit(1)
import time
import urllib
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
__VERSION__ = '0.3.0-alpha' # major, minor, micro: semver.org
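A note on the new kitchen import above: wrapping sys.stdout means later code can print unicode titles directly (see the title.strip() changes further down) instead of encoding at every call site. A minimal Python 2 sketch of the behaviour, assuming kitchen is installed:

from kitchen.text.converters import getwriter
import sys

# Without the wrapper, printing a unicode object to a piped or redirected
# stdout can raise UnicodeEncodeError; with it, encoding is transparent.
sys.stdout = getwriter('utf8')(sys.stdout)
print u'\u010cesk\xe1 republika'  # safe even when stdout has no encoding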
@@ -52,6 +58,12 @@ class PageMissingError(Exception):
def __str__(self):
return "page '%s' not found" % self.title
class ExportAbortedError(Exception):
def __init__(self, index):
self.index = index
def __str__(self):
return "Export from '%s' did not return anything." % self.index
def getVersion():
return(__VERSION__)
@@ -251,19 +263,19 @@ def getPageTitlesAPI(config={}, session=None):
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
titles += [page['title']
for page in allpages]
for page in allpages:
yield page['title']
c += len(allpages)
if len(titles) != len(set(titles)):
# probably we are in a loop, server returning dupe titles, stop
# it
print 'Probably a loop, finishing'
titles = list(set(titles))
apfrom = ''
c += len(allpages)
delay(config=config, session=session)
print ' %d titles retrieved in the namespace %d' % (c, namespace)
return titles
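The hunk above turns getPageTitlesAPI into a generator: each batch of titles is yielded as it arrives instead of being accumulated (and de-duplicated) in one list. A rough sketch of the tradeoff, where fetch_batches is a hypothetical stand-in for the paginated allpages requests:

def titles_as_list(fetch_batches):
    # old behaviour: the whole title list sits in memory until the end
    titles = []
    for batch in fetch_batches():
        titles += [page['title'] for page in batch]
    return titles

def titles_as_generator(fetch_batches):
    # new behaviour: one title at a time, so the caller can stream
    # titles straight to disk, as getPageTitles now does below
    for batch in fetch_batches():
        for page in batch:
            yield page['title']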
def getPageTitlesScraper(config={}, session=None):
""" """
@@ -362,13 +374,21 @@ def getPageTitles(config={}, session=None):
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
# removing dupes (e.g. in CZ appears Widget:AddThis two times (main
# namespace and widget namespace))
titles = sorted(set(titles))
print '%d page titles loaded' % (len(titles))
return titles
titlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'a')
c = 0
for title in titles:
titlesfile.write(title.encode('utf-8') + "\n")
c += 1
# TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
# main namespace and widget namespace.
# We can use sort -u in UNIX, but is it worth it?
titlesfile.write(u'--END--\n')
titlesfile.close()
print 'Titles saved at...', titlesfilename
print '%d page titles loaded' % (c)
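Note the '--END--' sentinel written above: it is what later lets resumePreviousDump tell a finished title list from a truncated one. A hypothetical helper sketching that check, built on the reverse_readline generator added further down:

def title_list_is_complete(path):
    # Walk the file from the tail; the first non-empty line should be
    # the sentinel if getPageTitles ran to completion.
    for line in reverse_readline(path):
        if line.strip():
            return line.strip() == '--END--'
    return False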
def getImageNames(config={}, session=None):
""" Get list of image names """
@@ -396,13 +416,36 @@ def getXMLHeader(config={}, session=None):
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem if we get the <siteinfo>.
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
header = xml.split('</mediawiki>')[0]
if not re.match("<mediawiki", xml):
print 'XML export on this wiki is broken, quitting.'
sys.exit()
return header
return header, config
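The Issue 26 fallback above asks the API for the local name of namespace -1 (Special) and rebuilds the export title from it. As a standalone sketch, the lookup is roughly:

import json
import requests

def local_export_title(api):
    # Namespace -1 is 'Special'; its localized name is under the '*' key.
    r = requests.post(url=api, data={
        'action': 'query',
        'meta': 'siteinfo',
        'siprop': 'namespaces',
        'format': 'json',
    })
    return json.loads(r.text)['query']['namespaces']['-1']['*'] + ':Export'

# e.g. local_export_title('http://albens73.fr/wiki/api.php') would return
# u'Sp\xe9cial:Export' on a French wiki.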
def getXMLFileDesc(config={}, title='', session=None):
@@ -454,11 +497,11 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save nonly the last
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
# fallback, because it's set by the following if and passed to
# getXMLPageCore
if not config['curonly']:
if not config['curonly'] and 'curonly' not in params:
print ' Trying to save only the last revision for this page...'
params['curonly'] = 1
logerror(
@@ -478,6 +521,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
config=config,
text='Error while retrieving the last revision of "%s". Skipping.' %
(params['pages']))
raise ExportAbortedError(config['index'])
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
@@ -485,6 +529,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
handleStatusCode(r)
xml = r.text
except requests.exceptions.ConnectionError as e:
raise ExportAbortedError(config['index'])
xml = ''
c += 1
@@ -502,7 +547,10 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
title_ = title
title_ = re.sub(' ', '_', title_)
# do not convert & into %26, title_ = re.sub('&', '%26', title_)
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
try:
params = {'title': config['export'], 'pages': title_, 'action': 'submit'}
except KeyError:
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
if config['curonly']:
params['curonly'] = 1
params['limit'] = 1
@@ -514,6 +562,8 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
params['templates'] = 1
xml = getXMLPageCore(params=params, config=config, session=session)
if xml == "":
raise ExportAbortedError(config['index'])
if not "</page>" in xml:
raise PageMissingError(params['title'], xml)
else:
@@ -574,9 +624,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if verbose:
if (numberofedits == 1):
print ' %s, 1 edit' % (title.encode('utf-8'))
print ' %s, 1 edit' % (title.strip())
else:
print ' %s, %d edits' % (title.encode('utf-8'), numberofedits)
print ' %s, %d edits' % (title.strip(), numberofedits)
def cleanXML(xml=''):
@@ -589,11 +639,12 @@ def cleanXML(xml=''):
return xml
def generateXMLDump(config={}, titles=[], start='', session=None):
def generateXMLDump(config={}, titles=[], start=None, session=None):
""" Generates a XML dump for a list of titles """
# TODO: titles is now unused.
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header = getXMLHeader(config=config, session=session)
header, config = getXMLHeader(config=config, session=session)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
config['date'],
@@ -601,32 +652,21 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile = ''
lock = True
if start:
# remove the last chunk of xml dump (it is probably incomplete)
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
prev = ''
print "Removing the last chunk of past XML dump: it is probably incomplete"
xmlfile = reverse_readline('%s/%s' % (config['path'], xmlfilename))
c = 0
for l in xmlfile:
# removing <page>\n until end of file
# lock to avoid write an empty line at the begining of file
if c != 0:
if not re.search(r'<title>%s</title>' % (start), l):
xmlfile2.write(prev)
else:
break
c += 1
prev = l
xmlfile.close()
xmlfile2.close()
# subst xml with xml2
# remove previous xml dump
os.remove('%s/%s' % (config['path'], xmlfilename))
# move correctly truncated dump to its real name
os.rename(
'%s/%s2' %
(config['path'], xmlfilename), '%s/%s' %
(config['path'], xmlfilename)
)
if re.search(r'<title>%s</title>' % (start), l):
# Done searching. We try to truncate the file at this point:
# everything should be removed from the line before <title>,
# that is the last c+1 lines AKA lines from EOF - c to EOF.
# TODO: do something for users without GNU ed; replace os.system.
# Try file.seek and file.truncate in the generator again?
os.system("(echo '$-%d,$d'; echo wq ) | ed %s/%s" \
% (c, config['path'], xmlfilename) )
print "Last %d lines removed." % (c+1)
break
else:
# requested complete xml dump
lock = False
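On the TODO above: a hedged sketch of the same truncation in pure Python with file.seek and file.truncate, so users without GNU ed would be covered. truncate_before_title is hypothetical, not part of this commit, and assumes the resume title occurs only once in the dump:

import re

def truncate_before_title(path, start):
    # Cut the file at the start of the line *before* the matching
    # <title> line, mirroring what the ed one-liner above deletes.
    pattern = '<title>%s</title>' % re.escape(start)
    with open(path, 'r+') as f:
        prev_offset = 0    # byte offset where the previous line starts
        line_offset = 0    # byte offset where the current line starts
        while True:
            line = f.readline()
            if not line:
                return     # title not found; leave the file untouched
            if pattern in line:
                f.seek(prev_offset)
                f.truncate()
                return
            prev_offset = line_offset
            line_offset = f.tell()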
@@ -636,7 +676,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in titles:
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
@@ -665,19 +705,59 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def saveTitles(config={}, titles=[]):
""" Save title list in a file """
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
titlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
output = u"%s\n--END--" % ('\n'.join(titles))
titlesfile.write(output.encode('utf-8'))
titlesfile.close()
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')
print 'Titles saved at...', titlesfilename
seeking = False
if start:
seeking = True
with titlesfile as f:
for line in f:
if line.strip() == '--END--':
break
elif seeking and line.strip() != start:
continue
elif seeking and line.strip() == start:
seeking = False
yield line.strip()
else:
yield line.strip()
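readTitles pairs with the '--END--' sentinel written by getPageTitles: it streams titles lazily and can seek to a resume point first, yielding the start title itself. A self-contained sketch of the same seek-then-stream pattern over an in-memory list:

def stream_from(lines, start=None):
    seeking = start is not None
    for line in lines:
        line = line.strip()
        if line == '--END--':
            break
        if seeking:
            if line != start:
                continue
            seeking = False    # the start title itself is yielded too
        yield line

# list(stream_from(['A', 'B', 'C', '--END--'], start='B')) == ['B', 'C']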
def reverse_readline(filename, buf_size=8192):
"""a generator that returns the lines of a file in reverse order"""
# Original code by srohde, abdus_salam: cc by-sa 3.0
# http://stackoverflow.com/a/23646049/718903
with open(filename) as fh:
segment = None
offset = 0
fh.seek(0, os.SEEK_END)
total_size = remaining_size = fh.tell()
while remaining_size > 0:
offset = min(total_size, offset + buf_size)
fh.seek(-offset, os.SEEK_END)
buffer = fh.read(min(remaining_size, buf_size))
remaining_size -= buf_size
lines = buffer.split('\n')
# the first line of the buffer is probably not a complete line so
# we'll save it and append it to the last line of the next buffer
# we read
if segment is not None:
# if the previous chunk starts right from the beginning of line
# do not concat the segment to the last line of new chunk
# instead, yield the segment first
if buffer[-1] != '\n':
lines[-1] += segment
else:
yield segment
segment = lines[0]
for index in range(len(lines) - 1, 0, -1):
yield lines[index]
yield segment
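Typical use of reverse_readline, as in resumePreviousDump below: peek at a file's tail without reading it from the front. The file name here is hypothetical:

lines = reverse_readline('wikidump-20150101-titles.txt')
last = lines.next()      # Python 2 generator protocol
if last == '':           # a file ending in '\n' yields an empty line first
    last = lines.next()
print 'Title list complete:', last == '--END--'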
def saveImageNames(config={}, images=[], session=None):
""" Save image list in a file, including filename, url and uploader """
@@ -1489,12 +1569,11 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
def createNewDump(config={}, other={}):
titles = []
images = []
print 'Trying to generate a new dump into a new directory...'
if config['xml']:
titles += getPageTitles(config=config, session=other['session'])
saveTitles(config=config, titles=titles)
getPageTitles(config=config, session=other['session'])
titles = readTitles(config)
generateXMLDump(config=config, titles=titles, session=other['session'])
checkXMLIntegrity(
config=config,
@@ -1513,23 +1592,21 @@ def resumePreviousDump(config={}, other={}):
def resumePreviousDump(config={}, other={}):
titles = []
images = []
print 'Resuming previous dump process...'
if config['xml']:
# load titles
lasttitle = ''
titles = readTitles(config)
try:
f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(
config=config, session=other['session']), config['date']), 'r')
raw = unicode(f.read(), 'utf-8')
titles = raw.split('\n')
lasttitle = titles[-1]
if not lasttitle: # empty line at EOF ?
lasttitle = titles[-2]
f.close()
lasttitles = reverse_readline('%s/%s-%s-titles.txt' %
( config['path'],
domain2prefix( config=config, session=other['session'] ),
config['date'])
)
lasttitle = lasttitles.next()
if lasttitle == '':
lasttitle = lasttitles.next()
except:
pass # probably file doesnot exists
pass # probably file does not exist
if lasttitle == '--END--':
# titles list is complete
print 'Title list was completed in the previous session'
@@ -1537,13 +1614,13 @@ def resumePreviousDump(config={}, other={}):
print 'Title list is incomplete. Reloading...'
# do not resume; reload to avoid inconsistencies, e.g. deleted pages
titles = getPageTitles(config=config, session=other['session'])
saveTitles(config=config, titles=titles)
getPageTitles(config=config, session=other['session'])
# checking xml dump
xmliscomplete = False
lastxmltitle = ''
lastxmltitle = None
try:
f = open(
f = reverse_readline(
'%s/%s-%s-%s.xml' %
(config['path'],
domain2prefix(
@@ -1551,27 +1628,26 @@ def resumePreviousDump(config={}, other={}):
session=other['session']),
config['date'],
config['curonly'] and 'current' or 'history'),
'r')
)
for l in f:
if re.findall('</mediawiki>', l):
if l == '</mediawiki>':
# xml dump is complete
xmliscomplete = True
break
# weird if found more than 1, but maybe
xmltitles = re.findall(r'<title>([^<]+)</title>', l)
if xmltitles:
lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
f.close()
xmltitle = re.search(r'<title>([^<]+)</title>', l)
if xmltitle:
lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
break
except:
pass # probably file doesnot exists
# removing --END-- before getXMLs
while titles and titles[-1] in ['', '--END--']:
titles = titles[:-1]
pass # probably file does not exist
if xmliscomplete:
print 'XML dump was completed in the previous session'
elif lastxmltitle:
# resuming...
print 'Resuming XML dump from "%s"' % (lastxmltitle)
titles = readTitles(config, start=lastxmltitle)
generateXMLDump(
config=config,
titles=titles,
@@ -1580,6 +1656,7 @@ def resumePreviousDump(config={}, other={}):
else:
# corrupt? only has XML header?
print 'XML is corrupt? Regenerating...'
titles = readTitles(config)
generateXMLDump(
config=config, titles=titles, session=other['session'])
