Enhance the stability of the image dump and make it resumable (#88)

---

- Introduce `sha1File`
- save more metadata (`size`, `sha1`) into `images.txt`
- feat: better file dump:
  - validate each image's size and sha1
  - show progress
  - better resume
    > Improved the resume mechanism (fixes #15).
    > First check whether the `file` and `file.desc` exist, then check
    > whether the file's `size` and `sha1` match the records in
    > `images.txt`. If any check fails, the file and its `.desc` are
    > downloaded again; if all checks pass, the download of this file is
    > skipped. You can even delete random images and `.desc` files and
    > resume again. (See the sketch after this list.)
  - pre-work for incremental image dump
  - remove `start` param from `generateImageDump()`
    > The image resume mechanism has changed; we no longer need `start`
    > for resuming.
- other minor improvements
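
A minimal sketch of the per-file resume check described above. `already_downloaded` is a hypothetical helper name; the real check is inlined in `generateImageDump`, and `sha1File` is the utility this commit introduces:

```python
import os

from wikiteam3.utils import sha1File  # introduced by this commit


def already_downloaded(path: str, size: int, sha1: str) -> bool:
    """True only if the image and its .desc both exist and the image
    matches the size and sha1 recorded in images.txt."""
    if not (os.path.isfile(path) and os.path.isfile(path + ".desc")):
        return False
    if os.path.getsize(path) != int(size):
        return False
    return sha1File(path) == sha1
```

Records that fail any check are re-downloaded together with their `.desc`; records that pass are skipped, which is why deleting arbitrary images or `.desc` files and resuming again is safe.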

@@ -214,7 +214,7 @@ class DumpGenerator:
generateXMLDump(config=config, titles=titles, session=other["session"])
if config.images:
# load images
# load images list
lastimage = ""
try:
f = open(
@@ -222,7 +222,7 @@ class DumpGenerator:
% (config.path, domain2prefix(config=config), config.date),
encoding="utf-8",
)
lines = f.readlines()
lines = f.read().splitlines()
for l in lines:
if re.search(r"\t", l):
images.append(l.split("\t"))
@@ -232,6 +232,12 @@ class DumpGenerator:
f.close()
except FileNotFoundError:
pass # probably file does not exists
if len(images)>0 and len(images[0]) < 5:
print(
"Warning: Detected old images list (images.txt) format.\n"+
"You can delete 'images.txt' manually and restart the script."
)
sys.exit(1)
if lastimage == "--END--":
print("Image list was completed in the previous session")
else:
@@ -247,22 +253,31 @@ class DumpGenerator:
except OSError:
pass # probably directory does not exist
listdir.sort()
complete = True
lastfilename = ""
lastfilename2 = ""
c = 0
for filename, url, uploader in images:
lastfilename2 = lastfilename
c_desc = 0
c_images = 0
for filename, url, uploader, size, sha1 in images:
# return always the complete filename, not the truncated
lastfilename = filename
filename2 = filename
if len(filename2) > other["filenamelimit"]:
filename2 = truncateFilename(other=other, filename=filename2)
if filename2 not in listdir:
complete = False
break
c += 1
print("%d images were found in the directory from a previous session" % (c))
if filename2 in listdir:
c_images += 1
if filename2+".desc" in listdir:
c_desc += 1
print(f"{len(images)} records in images.txt, {c_images} images and {c_desc} .desc were saved in the previous session")
if c_desc < len(images):
complete = False
elif c_images < len(images):
complete = False
print("WARNING: Some images were not saved. You may want to delete their \n"
+".desc files and re-run the script to redownload the missing images.\n"
+"(If images URL are unavailable, you can ignore this warning.)\n"
+"(In most cases, if the number of .desc files equals the number of \n"
+ "images.txt records, you can ignore this warning, images dump was completed.)")
sys.exit()
else: # c_desc == c_images == len(images)
complete = True
if complete:
# image dump is complete
print("Image dump was completed in the previous session")
@@ -273,7 +288,6 @@ class DumpGenerator:
config=config,
other=other,
images=images,
start=lastfilename2,
session=other["session"],
)

@@ -6,12 +6,12 @@ from typing import *
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSha1Error, FileSizeError
from wikiteam3.dumpgenerator.api import getJSON
from wikiteam3.dumpgenerator.api import handleStatusCode
from wikiteam3.dumpgenerator.log import logerror
from .page_xml import getXMLPage
from wikiteam3.utils import truncateFilename
from wikiteam3.utils import truncateFilename, sha1File
from wikiteam3.utils import cleanHTML, undoHTMLEntities
from wikiteam3.dumpgenerator.config import Config
@@ -28,26 +28,21 @@ class Image:
]
)
def generateImageDump(config: Config=None, other: Dict=None, images: Iterable[str]=None, start="", session=None):
"""Save files and descriptions using a file list"""
def generateImageDump(config: Config=None, other: Dict=None, images: Iterable[str]=None, session=None):
"""Save files and descriptions using a file list\n
Deprecated: `start` is not used anymore."""
# fix use subdirectories md5
print('Retrieving images from "%s"' % (start and start or "start"))
print("Retrieving images...")
imagepath = "%s/images" % (config.path)
if not os.path.isdir(imagepath):
print('Creating "%s" directory' % (imagepath))
os.makedirs(imagepath)
c = 0
lock = True
if not start:
lock = False
for filename, url, uploader in images:
if filename == start: # start downloading from start (included)
lock = False
if lock:
continue
Delay(config=config, session=session)
c_savedImageFiles = 0
c_savedImageDescs = 0
for filename, url, uploader, size, sha1 in images:
# saving file
# truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash
@@ -58,7 +53,17 @@ class Image:
filename2 = truncateFilename(other=other, filename=filename2)
print("Filename is too long, truncating. Now it is:", filename2)
filename3 = f"{imagepath}/{filename2}"
# check if file already exists and has the same size and sha1
if (os.path.isfile(filename3) and os.path.isfile(filename3+".desc")
and os.path.getsize(filename3) == int(size)):
if sha1File(filename3) == sha1:
c_savedImageFiles += 1
print_msg=f" {c_savedImageFiles}|sha1 matched: {filename2}"
print(print_msg[0:70], end="\r")
continue
Delay(config=config, session=session)
original_url = url
r = session.head(url=url, allow_redirects=True)
original_url_redirected = len(r.history) > 0
@@ -82,13 +87,22 @@ class Image:
if r.status_code == 200:
try:
with open(filename3, "wb") as imagefile:
imagefile.write(r.content)
if len(r.content) == int(size):
with open(filename3, "wb") as imagefile:
imagefile.write(r.content)
c_savedImageFiles += 1
else:
raise FileSizeError(file=filename3, size=size)
except OSError:
logerror(
config=config, to_stdout=True,
text=f"File '{filename3}' could not be created by OS",
)
except FileSizeError as e:
logerror(
config=config, to_stdout=True,
text=f"File '{e.file}' size is not match '{e.size}', skipping",
)
else:
logerror(
config=config, to_stdout=True,
@@ -134,6 +148,7 @@ class Image:
with open(f"{imagepath}/{filename2}.desc", "w", encoding="utf-8") as f:
f.write(xmlfiledesc)
c_savedImageDescs += 1
if xmlfiledesc == "":
logerror(
@@ -147,11 +162,10 @@ class Image:
text=f"File {imagepath}/{filename2}.desc could not be created by OS",
)
c += 1
if c % 10 == 0:
print(f"\n-> Downloaded {c} images\n")
print_msg = f" {(len(images)-c_savedImageFiles)}: {filename2[0:30]}"
print(print_msg, " "*(70 - len(print_msg)), end="\r")
print(f"\n-> Downloaded {c} images\n")
print(f"Downloaded {c_savedImageFiles} images and {c_savedImageDescs} .desc files.")
def getImageNames(config: Config=None, session=None):
"""Get list of image names"""
@@ -300,7 +314,7 @@ class Image:
params = {
"action": "query",
"list": "allimages",
"aiprop": "url|user",
"aiprop": "url|user|size|sha1",
"aifrom": aifrom,
"format": "json",
"ailimit": 50,
@@ -362,7 +376,9 @@ class Image:
+ " contains unicode. Please file an issue with WikiTeam."
)
uploader = re.sub("_", " ", image["user"])
images.append([filename, url, uploader])
size = image["size"]
sha1 = image["sha1"]
images.append([filename, url, uploader, size, sha1])
else:
oldAPI = True
break
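
With `aiprop=url|user|size|sha1`, each `allimages` item carries the byte size and hex SHA-1 digest alongside the URL and uploader. A minimal standalone query; the endpoint is a placeholder, not part of this commit:

```python
import requests

params = {
    "action": "query",
    "list": "allimages",
    "aiprop": "url|user|size|sha1",
    "ailimit": 50,
    "format": "json",
}
# wiki.example.org is an assumed endpoint for illustration
r = requests.get("https://wiki.example.org/w/api.php", params=params)
for image in r.json()["query"]["allimages"]:
    # each item now includes "size" (bytes) and "sha1" (hex digest)
    print(image["name"], image["size"], image["sha1"])
```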
@@ -386,7 +402,7 @@ class Image:
# TODO: Is it OK to set it higher, for speed?
"gapfrom": gapfrom,
"prop": "imageinfo",
"iiprop": "user|url",
"iiprop": "url|user|size|sha1",
"format": "json",
}
# FIXME Handle HTTP Errors HERE
@@ -430,7 +446,9 @@ class Image:
filename = re.sub("_", " ", tmp_filename)
uploader = re.sub("_", " ", props["imageinfo"][0]["user"])
images.append([filename, url, uploader])
size = props["imageinfo"][0]["size"]
sha1 = props["imageinfo"][0]["sha1"]
images.append([filename, url, uploader, size, sha1])
else:
# if the API doesn't return query data, then we're done
break
@@ -443,7 +461,7 @@ class Image:
return images
def saveImageNames(config: Config=None, images: Iterable[str]=None, session=None):
"""Save image list in a file, including filename, url and uploader"""
"""Save image list in a file, including filename, url, uploader, size and sha1"""
imagesfilename = "{}-{}-images.txt".format(
domain2prefix(config=config), config.date
@@ -454,8 +472,8 @@ class Image:
imagesfile.write(
"\n".join(
[
filename + "\t" + url + "\t" + uploader
for filename, url, uploader in images
filename + "\t" + url + "\t" + uploader + "\t" + str(size) + "\t" + sha1
for filename, url, uploader, size, sha1 in images
]
)
)
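
Each record in the new `images.txt` is therefore a single tab-separated line with five fields; an illustrative row (fields separated by tabs):

```
Example.png	https://wiki.example.org/images/a/ab/Example.png	SomeUploader	1234	da39a3ee5e6b4b0d3255bfef95601890afd80709
```

The `--END--` sentinel checked earlier in the dump generator still marks a fully saved list.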

@@ -52,8 +52,8 @@ def getPageTitlesScraper(config: Config=None, session=None):
config.index, namespace
)
r = session.get(url=url, timeout=30)
raw = str(r.text)
raw = str(cleanHTML(raw))
raw = r.text
raw = cleanHTML(raw)
r_title = 'title="(?P<title>[^>]+)">'
r_suballpages = ""
@@ -224,7 +224,7 @@ def readTitles(config: Config=None, start=None, batch=False):
with titlesfile as f:
for line in f:
title = str(line).strip()
title = line.strip()
if title == "--END--":
break
elif seeking and title != start:

@@ -25,7 +25,7 @@ def getXMLPageCore(headers: Dict=None, params: Dict=None, config: Config=None, s
maxretries = config.retries # x retries and skip
increment = 20 # increment every retry
while not re.search(r"</mediawiki>", str(xml)):
while not re.search(r"</mediawiki>", xml):
if c > 0 and c < maxretries:
wait = (
increment * c < maxseconds and increment * c or maxseconds

@@ -27,7 +27,7 @@ def getXMLHeader(config: Config=None, session=None) -> Tuple[str, Config]:
+ "?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1",
timeout=10,
)
xml = str(r.text)
xml: str = r.text
# Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
if not re.match(r"\s*<mediawiki", xml):
r = session.get(

@@ -13,3 +13,21 @@ class ExportAbortedError(Exception):
def __str__(self):
return "Export from '%s' did not return anything." % self.index
class FileSizeError(Exception):
def __init__(self, file, size):
self.file = file
self.size = size
def __str__(self):
return "File '%s' size is not match '%s'." % (self.file, self.size)
class FileSha1Error(Exception):
def __init__(self, file, sha1):
self.file = file
self.sha1 = sha1
def __str__(self):
return "File '%s' sha1 is not match '%s'." % (self.file, self.sha1)

@@ -1,5 +1,5 @@
from .uprint import uprint
from .util import removeIP, cleanXML, cleanHTML, undoHTMLEntities
from .util import removeIP, cleanXML, cleanHTML, undoHTMLEntities, sha1File
from .user_agent import getUserAgent
from .domain import domain2prefix

@@ -1,3 +1,4 @@
import hashlib
import re
import sys
@@ -74,3 +75,15 @@ def cleanXML(xml: str = "") -> str:
if re.search(r"</mediawiki>", xml):
xml = xml.split("</mediawiki>")[0]
return xml
def sha1File(filename: str = "") -> str:
"""Return the SHA1 hash of a file"""
sha1 = hashlib.sha1()
with open(filename, "rb") as f:
while True:
data = f.read(65536)
if not data:
break
sha1.update(data)
return sha1.hexdigest()
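
For example, to verify an already-downloaded file against the digest recorded in `images.txt` (path and digest below are illustrative):

```python
from wikiteam3.utils import sha1File

recorded_sha1 = "da39a3ee5e6b4b0d3255bfef95601890afd80709"  # from images.txt
if sha1File("images/Example.png") == recorded_sha1:
    print("sha1 matched, skipping download")
```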