Merge pull request #378 from nemobis/wikia

More efficient Wikia download and launcher.py
nemobis 4 years ago committed by GitHub
commit 6e85afca82

dumpgenerator.py
@@ -751,7 +751,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
if not title:
continue
if title == start: # start downloading from start, included
lock = False
@@ -767,8 +767,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
text=u'The page "%s" was missing in the wiki (probably deleted)' % title
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
@@ -906,7 +905,8 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# End of continuation. We are done with this namespace.
break
except KeyError:
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
print "Warning. Could not use allrevisions. Wiki too old?"
if config['curonly']:
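For reference, a minimal sketch of the fallback probe shown in this hunk: try the allrevisions API through mwclient and fall back when the wiki is too old (KeyError on the missing list) or returns a non-JSON page (mwclient.errors.InvalidResponse). The host and path below are placeholders, not taken from the PR.

    # Sketch only; host and path are placeholders.
    import mwclient

    site = mwclient.Site('wiki.example.org', path='/w/')
    try:
        arvrequest = site.api(action='query', list='allrevisions',
                              arvlimit=50, arvprop='ids|timestamp')
        revisions = arvrequest['query']['allrevisions']
    except (KeyError, mwclient.errors.InvalidResponse) as e:
        print(e)
        # Old MediaWiki without the allrevisions module, or a broken response.
        print("Warning. Could not use allrevisions. Wiki too old?")
        revisions = None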
@@ -916,7 +916,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
c = 0
for title in readTitles(config, start=start):
# TODO: respect verbose flag, reuse output from getXMLPage
print(' {}'.format(title.strip()))
print(u' {}'.format(title))
# TODO: as we're doing one page and revision at a time, we might
# as well use xml format and exportnowrap=1 to use the string of,
# XML as is, but need to check how well the library handles it.
@@ -948,18 +948,23 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# refuses to return an arbitrary number of revisions (see above).
print("Getting titles to export all the revisions of each")
c = 0
for title in readTitles(config, start=start):
print(' {}'.format(title.strip()))
titlelist = []
# TODO: Decide a suitable number of titles per batched request. Careful:
# batched responses may not return all revisions.
for titlelist in readTitles(config, start=start, batch=False):
if type(titlelist) is not list:
titlelist = [titlelist]
for title in titlelist:
print(u' {}'.format(title))
# Try and ask for everything. At least on MediaWiki 1.16, unknown props are discarded:
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
pparams = {
'action': 'query',
'titles': title,
'titles': '|'.join(titlelist),
'prop': 'revisions',
'rvlimit': 50,
#'rvlimit': 50,
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
}
# TODO: we could actually batch titles a bit here if desired. How many?
try:
prequest = site.api(http_method=config['http_method'], **pparams)
except requests.exceptions.HTTPError as e:
@@ -967,6 +972,12 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
print("POST request to the API failed, retrying with GET")
config['http_method'] = "GET"
exportrequest = site.api(http_method=config['http_method'], **exportparams)
except mwclient.errors.InvalidResponse:
logerror(
config=config,
text=u'Error: page inaccessible? Could not export page: %s' % ("; ".join(titlelist))
)
continue
# Be ready to iterate if there is continuation.
while True:
@@ -978,7 +989,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
except KeyError:
logerror(
config=config,
text=u'Error: page inaccessible? Could not export page: %s' % (title.decode('utf-8'))
text=u'Error: page inaccessible? Could not export page: %s' % ("; ".join(titlelist))
)
break
# Go through the data we got to build the XML.
@@ -989,14 +1000,15 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
except PageMissingError:
logerror(
config=config,
text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
text=u'Error: empty revision from API. Could not export page: %s' % ("; ".join(titlelist))
)
continue
# Get next batch of revisions if there's more.
if 'continue' in prequest.keys():
print("Getting more revisions for page {}".format(title))
pparams['rvcontinue'] = prequest['continue']['rvcontinue']
print("Getting more revisions for the page")
for key, value in prequest['continue'].items():
pparams[key] = value
elif 'query-continue' in prequest.keys():
rvstartid = prequest['query-continue']['revisions']['rvstartid']
pparams['rvstartid'] = rvstartid
@@ -1011,8 +1023,10 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
config['http_method'] = "GET"
prequest = site.api(http_method=config['http_method'], **pparams)
# We're done iterating for this title.
c += 1
# We're done iterating for this title or titles.
c += len(titlelist)
# Reset for the next batch.
titlelist = []
if c % 10 == 0:
print('Downloaded {} pages'.format(c))
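The batched request and the continuation handling above are spread across several hunks; below is a condensed sketch of the same pattern under assumed placeholder values (host, path and the two titles are invented, and the rvprop list is trimmed). It is not the PR's exact code: a batch of titles is joined with '|' into one prop=revisions query, and the API's 'continue' (or legacy 'query-continue') tokens are followed until exhausted.

    # Sketch only: batched prop=revisions query with API continuation.
    import mwclient

    site = mwclient.Site('wiki.example.org', path='/w/')   # placeholder host
    titlelist = ['Main Page', 'Help:Contents']              # placeholder batch
    pparams = {
        'action': 'query',
        'titles': '|'.join(titlelist),   # one request covers the whole batch
        'prop': 'revisions',
        'rvprop': 'ids|timestamp|user|comment|content',
    }
    prequest = site.api(http_method='GET', **pparams)
    while True:
        # ... build <page>/<revision> XML from prequest['query']['pages'] here ...
        if 'continue' in prequest:
            # Modern MediaWiki: copy every continuation token into the next request.
            for key, value in prequest['continue'].items():
                pparams[key] = value
        elif 'query-continue' in prequest:
            # Pre-1.26 continuation style.
            pparams['rvstartid'] = prequest['query-continue']['revisions']['rvstartid']
        else:
            break
        prequest = site.api(http_method='GET', **pparams)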
@@ -1042,7 +1056,6 @@ def makeXmlFromPage(page):
size = 0
revision = E.revision(
E.id(to_unicode(rev['revid'])),
E.parentid(to_unicode(rev['parentid'])),
E.timestamp(rev['timestamp']),
E.text(to_unicode(rev['*']), space="preserve", bytes=to_unicode(size)),
)
@@ -1058,6 +1071,9 @@ def makeXmlFromPage(page):
revision.append(E.comment(to_unicode(rev['comment'])))
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
# Sometimes a missing parentid is not replaced with a 0 as it should be.
if 'parentid' in rev:
revision.append(E.parentid(to_unicode(rev['parentid'])))
# The sha1 may not have been backfilled on older wikis, or may be missing for other reasons (Wikia).
if 'sha1' in rev:
revision.append(E.sha1(rev['sha1']))
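The parentid move above follows the pattern already used for sha1 and comment: append optional children only when the API actually returned them. A small self-contained lxml sketch of that pattern, with an invented revision dict:

    # Sketch: optional <revision> children are appended only when present.
    from lxml import etree
    from lxml.builder import E

    rev = {'revid': 42, 'timestamp': '2020-01-01T00:00:00Z', '*': 'wikitext here'}
    revision = E.revision(
        E.id(str(rev['revid'])),
        E.timestamp(rev['timestamp']),
        E.text(rev['*'], space="preserve"),
    )
    if 'parentid' in rev:    # first revisions and some APIs have no parentid
        revision.append(E.parentid(str(rev['parentid'])))
    if 'sha1' in rev:        # may not be backfilled on old wikis
        revision.append(E.sha1(rev['sha1']))
    print(etree.tostring(revision, pretty_print=True, encoding='unicode'))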
@@ -1067,28 +1083,37 @@ def makeXmlFromPage(page):
raise PageMissingError(page['title'], e)
return etree.tostring(p, pretty_print=True, encoding='unicode')
def readTitles(config={}, start=None):
def readTitles(config={}, start=None, batch=False):
""" Read title list from a file, from the title "start" """
titlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')
titlelist = []
seeking = False
if start:
seeking = True
with titlesfile as f:
for line in f:
if line.strip() == '--END--':
title = line.decode("utf-8").strip()
if title == '--END--':
break
elif seeking and line.strip() != start:
elif seeking and title != start:
continue
elif seeking and line.strip() == start:
elif seeking and title == start:
seeking = False
yield line.strip()
if not batch:
yield title
else:
yield line.strip()
titlelist.append(title)
if len(titlelist) < batch:
continue
else:
yield titlelist
titlelist = []
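readTitles can now also yield batches of titles. A simplified, self-contained sketch of the same generator shape, reading from an in-memory list instead of the titles file (names and the batch size are illustrative, not the PR's code):

    # Sketch of a generator yielding single titles or batches of them.
    def read_titles(lines, start=None, batch=False):
        titlelist = []
        seeking = start is not None
        for line in lines:
            title = line.strip()
            if title == '--END--':
                break
            if seeking and title != start:
                continue
            seeking = False   # start found; it is included in the output
            if not batch:
                yield title
            else:
                titlelist.append(title)
                if len(titlelist) >= batch:   # batch is an int when batching
                    yield titlelist
                    titlelist = []
        if batch and titlelist:
            yield titlelist   # flush the last, possibly short, batch

    for group in read_titles(['A', 'B', 'C', 'D', '--END--'], batch=3):
        print(group)   # ['A', 'B', 'C'] then ['D']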
def reverse_readline(filename, buf_size=8192, truncate=False):
"""a generator that returns the lines of a file in reverse order"""
@@ -1953,7 +1978,7 @@ def checkIndex(index=None, cookies=None, session=None):
""" Checking index.php availability """
r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
if r.status_code >= 400:
print("ERROR: The wiki returned status code HTTP {}".format({}))
print("ERROR: The wiki returned status code HTTP {}".format(r.status_code))
return False
raw = r.text
print 'Checking index.php...', index
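The one-line fix above passes the actual status code into format() instead of an empty dict. A tiny requests sketch of the corrected check, with a placeholder URL:

    # Sketch only; the URL is a placeholder.
    import requests

    r = requests.post('https://wiki.example.org/index.php',
                      data={'title': 'Special:Version'}, timeout=30)
    if r.status_code >= 400:
        print("ERROR: The wiki returned status code HTTP {}".format(r.status_code))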
@@ -2164,7 +2189,7 @@ def resumePreviousDump(config={}, other={}):
lastimage = lines[-1]
f.close()
except:
pass # probably file doesnot exists
pass # probably the file does not exist
if lastimage == u'--END--':
print 'Image list was completed in the previous session'
else:

launcher.py
@@ -46,7 +46,7 @@ def main():
#check if compressed, in that case dump was finished previously
compressed = False
for f in os.listdir('.'):
if f.startswith(prefix) and f.endswith('.7z'):
if f.endswith('.7z') and f.split("-")[0] == prefix:
compressed = True
zipfilename = f
break #stop searching, do not explore subdirectories
@@ -71,7 +71,7 @@ def main():
wikidir = ''
for f in os.listdir('.'):
# Does not find numbered wikidumps nor verify directories
if f.startswith(prefix) and f.endswith('wikidump'):
if f.endswith('wikidump') and f.split("-")[0] == prefix:
wikidir = f
started = True
break #stop searching, do not explore subdirectories
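The prefix test in launcher.py changes from startswith() to comparing the first dash-separated token, because startswith() would also match a different wiki whose prefix merely begins with the same characters. A quick illustration with invented filenames:

    # Sketch: exact-token matching vs. startswith() for dump prefixes.
    prefix = 'examplewiki'
    files = ['examplewiki-20200101-wikidump.7z',
             'examplewikifr-20200101-wikidump.7z']   # a different wiki

    print([f for f in files if f.startswith(prefix)])        # matches both
    print([f for f in files if f.split("-")[0] == prefix])   # matches only the first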
@@ -82,14 +82,14 @@ def main():
# typically they don't provide any crawl-delay value in their robots.txt).
if started and wikidir: #then resume
print 'Resuming download, using directory', wikidir
subprocess.call('./dumpgenerator.py --api=%s --xml --images --resume --path=%s' % (wiki, wikidir), shell=True)
subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--path={}'.format(wikidir)], shell=False)
else: #download from scratch
subprocess.call('./dumpgenerator.py --api=%s --xml --images --delay=1' % wiki, shell=True)
subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images'], shell=False)
started = True
#save wikidir now
for f in os.listdir('.'):
# Does not find numbered wikidumps nor verify directories
if f.startswith(prefix) and f.endswith('wikidump'):
if f.endswith('wikidump') and f.split("-")[0] == prefix:
wikidir = f
break #stop searching, do not explore subdirectories
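launcher.py now invokes dumpgenerator.py with an argument list and shell=False, so the wiki URL reaches the child process as a single argv entry and never goes through shell quoting. A minimal sketch with a placeholder URL and interpreter:

    # Sketch only; interpreter and URL are placeholders.
    import subprocess

    wiki = 'https://example.fandom.com/api.php'
    subprocess.call(['python2', 'dumpgenerator.py',
                     '--api={}'.format(wiki), '--xml', '--images'],
                    shell=False)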
