Enhance the stability of the image dump and make it resumable (#88)

---

- Introduce `sha1File`
- save more metadata (`size`, `sha1`) into `images.txt`
- feat: better file dump:
  - validate each image's size and sha1
  - show progress
  - better resume
    > Improved the resume mechanism (fixes #15).
    > First check whether the `file` and `file.desc` exist, then check
    > whether the file's `size` and `sha1` match the records in
    > `images.txt`. If any check fails, the file and its `.desc` are
    > downloaded again; if all checks pass, the download of this file is
    > skipped. You can even delete random images and `.desc` files and
    > resume again. (See the sketch after this list.)
  - pre-work for incremental image dump
  - remove `start` param from `generateImageDump()`
    > The image resume mechanism has changed; we no longer need `start`
    > for resuming.
- other minor improvements
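
A minimal sketch of the per-file resume check described above. `already_downloaded` is a hypothetical helper name; the real check is inlined in `generateImageDump`, and `sha1File` is the utility this commit introduces:

```python
import os

from wikiteam3.utils import sha1File  # introduced by this commit


def already_downloaded(path: str, size: int, sha1: str) -> bool:
    """True only if the image and its .desc both exist and the image
    matches the size and sha1 recorded in images.txt."""
    if not (os.path.isfile(path) and os.path.isfile(path + ".desc")):
        return False
    if os.path.getsize(path) != int(size):
        return False
    return sha1File(path) == sha1
```

Records that fail any check are re-downloaded together with their `.desc`; records that pass are skipped, which is why deleting arbitrary images or `.desc` files and resuming again is safe.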

@@ -214,7 +214,7 @@ class DumpGenerator:
generateXMLDump(config=config, titles=titles, session=other["session"])
if config.images:
# load images
# load images list
lastimage = ""
try:
f = open(
@@ -222,7 +222,7 @@ class DumpGenerator:
% (config.path, domain2prefix(config=config), config.date),
encoding="utf-8",
)
lines = f.readlines()
lines = f.read().splitlines()
for l in lines:
if re.search(r"\t", l):
images.append(l.split("\t"))
@@ -232,6 +232,12 @@ class DumpGenerator:
f.close()
except FileNotFoundError:
pass # probably file does not exists
if len(images)>0 and len(images[0]) < 5:
print(
"Warning: Detected old images list (images.txt) format.\n"+
"You can delete 'images.txt' manually and restart the script."
)
sys.exit(1)
if lastimage == "--END--":
print("Image list was completed in the previous session")
else:
@@ -247,22 +253,31 @@ class DumpGenerator:
except OSError:
pass # probably directory does not exist
listdir.sort()
complete = True
lastfilename = ""
lastfilename2 = ""
c = 0
for filename, url, uploader in images:
lastfilename2 = lastfilename
c_desc = 0
c_images = 0
for filename, url, uploader, size, sha1 in images:
# return always the complete filename, not the truncated
lastfilename = filename
filename2 = filename
if len(filename2) > other["filenamelimit"]:
filename2 = truncateFilename(other=other, filename=filename2)
if filename2 not in listdir:
complete = False
break
c += 1
print("%d images were found in the directory from a previous session" % (c))
if filename2 in listdir:
c_images += 1
if filename2+".desc" in listdir:
c_desc += 1
print(f"{len(images)} records in images.txt, {c_images} images and {c_desc} .desc were saved in the previous session")
if c_desc < len(images):
complete = False
elif c_images < len(images):
complete = False
print("WARNING: Some images were not saved. You may want to delete their \n"
+".desc files and re-run the script to redownload the missing images.\n"
+"(If images URL are unavailable, you can ignore this warning.)\n"
+"(In most cases, if the number of .desc files equals the number of \n"
+ "images.txt records, you can ignore this warning, images dump was completed.)")
sys.exit()
else: # c_desc == c_images == len(images)
complete = True
if complete:
# image dump is complete
print("Image dump was completed in the previous session")
@@ -273,7 +288,6 @@ class DumpGenerator:
config=config,
other=other,
images=images,
start=lastfilename2,
session=other["session"],
)

@@ -6,12 +6,12 @@ from typing import *
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.exceptions import PageMissingError, FileSha1Error, FileSizeError
from wikiteam3.dumpgenerator.api import getJSON
from wikiteam3.dumpgenerator.api import handleStatusCode
from wikiteam3.dumpgenerator.log import logerror
from .page_xml import getXMLPage
from wikiteam3.utils import truncateFilename
from wikiteam3.utils import truncateFilename, sha1File
from wikiteam3.utils import cleanHTML, undoHTMLEntities
from wikiteam3.dumpgenerator.config import Config
@@ -28,26 +28,21 @@ class Image:
]
)
def generateImageDump(config: Config=None, other: Dict=None, images: Iterable[str]=None, start="", session=None):
"""Save files and descriptions using a file list"""
def generateImageDump(config: Config=None, other: Dict=None, images: Iterable[str]=None, session=None):
"""Save files and descriptions using a file list\n
Deprecated: `start` is not used anymore."""
# fix use subdirectories md5
print('Retrieving images from "%s"' % (start and start or "start"))
print("Retrieving images...")
imagepath = "%s/images" % (config.path)
if not os.path.isdir(imagepath):
print('Creating "%s" directory' % (imagepath))
os.makedirs(imagepath)
c = 0
lock = True
if not start:
lock = False
for filename, url, uploader in images:
if filename == start: # start downloading from start (included)
lock = False
if lock:
continue
Delay(config=config, session=session)
c_savedImageFiles = 0
c_savedImageDescs = 0
for filename, url, uploader, size, sha1 in images:
# saving file
# truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash
@@ -58,7 +53,17 @@ class Image:
filename2 = truncateFilename(other=other, filename=filename2)
print("Filename is too long, truncating. Now it is:", filename2)
filename3 = f"{imagepath}/{filename2}"
# check if file already exists and has the same size and sha1
if (os.path.isfile(filename3) and os.path.isfile(filename3+".desc")
and os.path.getsize(filename3) == int(size)):
if sha1File(filename3) == sha1:
c_savedImageFiles += 1
print_msg=f" {c_savedImageFiles}|sha1 matched: {filename2}"
print(print_msg[0:70], end="\r")
continue
Delay(config=config, session=session)
original_url = url
r = session.head(url=url, allow_redirects=True)
original_url_redirected = len(r.history) > 0
@@ -82,13 +87,22 @@ class Image:
if r.status_code == 200:
try:
with open(filename3, "wb") as imagefile:
imagefile.write(r.content)
if len(r.content) == int(size):
with open(filename3, "wb") as imagefile:
imagefile.write(r.content)
c_savedImageFiles += 1
else:
raise FileSizeError(file=filename3, size=size)
except OSError:
logerror(
config=config, to_stdout=True,
text=f"File '{filename3}' could not be created by OS",
)
except FileSizeError as e:
logerror(
config=config, to_stdout=True,
text=f"File '{e.file}' size is not match '{e.size}', skipping",
)
else:
logerror(
config=config, to_stdout=True,
@@ -134,6 +148,7 @@ class Image:
with open(f"{imagepath}/{filename2}.desc", "w", encoding="utf-8") as f:
f.write(xmlfiledesc)
c_savedImageDescs += 1
if xmlfiledesc == "":
logerror(
@@ -147,11 +162,10 @@ class Image:
text=f"File {imagepath}/{filename2}.desc could not be created by OS",
)
c += 1
if c % 10 == 0:
print(f"\n-> Downloaded {c} images\n")
print_msg = f" {(len(images)-c_savedImageFiles)}: {filename2[0:30]}"
print(print_msg, " "*(70 - len(print_msg)), end="\r")
print(f"\n-> Downloaded {c} images\n")
print(f"Downloaded {c_savedImageFiles} images and {c_savedImageDescs} .desc files.")
def getImageNames(config: Config=None, session=None):
"""Get list of image names"""
@@ -300,7 +314,7 @@ class Image:
params = {
"action": "query",
"list": "allimages",
"aiprop": "url|user",
"aiprop": "url|user|size|sha1",
"aifrom": aifrom,
"format": "json",
"ailimit": 50,
@@ -362,7 +376,9 @@ class Image:
+ " contains unicode. Please file an issue with WikiTeam."
)
uploader = re.sub("_", " ", image["user"])
images.append([filename, url, uploader])
size = image["size"]
sha1 = image["sha1"]
images.append([filename, url, uploader, size, sha1])
else:
oldAPI = True
break
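
With `aiprop=url|user|size|sha1`, each `allimages` item carries the byte size and hex SHA-1 digest alongside the URL and uploader. A minimal standalone query; the endpoint is a placeholder, not part of this commit:

```python
import requests

params = {
    "action": "query",
    "list": "allimages",
    "aiprop": "url|user|size|sha1",
    "ailimit": 50,
    "format": "json",
}
# wiki.example.org is an assumed endpoint for illustration
r = requests.get("https://wiki.example.org/w/api.php", params=params)
for image in r.json()["query"]["allimages"]:
    # each item now includes "size" (bytes) and "sha1" (hex digest)
    print(image["name"], image["size"], image["sha1"])
```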
@@ -386,7 +402,7 @@ class Image:
# TODO: Is it OK to set it higher, for speed?
"gapfrom": gapfrom,
"prop": "imageinfo",
"iiprop": "user|url",
"iiprop": "url|user|size|sha1",
"format": "json",
}
# FIXME Handle HTTP Errors HERE
@@ -430,7 +446,9 @@ class Image:
filename = re.sub("_", " ", tmp_filename)
uploader = re.sub("_", " ", props["imageinfo"][0]["user"])
images.append([filename, url, uploader])
size = props["imageinfo"][0]["size"]
sha1 = props["imageinfo"][0]["sha1"]
images.append([filename, url, uploader, size, sha1])
else:
# if the API doesn't return query data, then we're done
break
@@ -443,7 +461,7 @@ class Image:
return images
def saveImageNames(config: Config=None, images: Iterable[str]=None, session=None):
"""Save image list in a file, including filename, url and uploader"""
"""Save image list in a file, including filename, url, uploader, size and sha1"""
imagesfilename = "{}-{}-images.txt".format(
domain2prefix(config=config), config.date
@@ -454,8 +472,8 @@ class Image:
imagesfile.write(
"\n".join(
[
filename + "\t" + url + "\t" + uploader
for filename, url, uploader in images
filename + "\t" + url + "\t" + uploader + "\t" + str(size) + "\t" + sha1
for filename, url, uploader, size, sha1 in images
]
)
)
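
Each record in the new `images.txt` is therefore a single tab-separated line with five fields; an illustrative row (fields separated by tabs):

```
Example.png	https://wiki.example.org/images/a/ab/Example.png	SomeUploader	1234	da39a3ee5e6b4b0d3255bfef95601890afd80709
```

The `--END--` sentinel checked earlier in the dump generator still marks a fully saved list.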

@@ -52,8 +52,8 @@ def getPageTitlesScraper(config: Config=None, session=None):
config.index, namespace
)
r = session.get(url=url, timeout=30)
raw = str(r.text)
raw = str(cleanHTML(raw))
raw = r.text
raw = cleanHTML(raw)
r_title = 'title="(?P<title>[^>]+)">'
r_suballpages = ""
@@ -224,7 +224,7 @@ def readTitles(config: Config=None, start=None, batch=False):
with titlesfile as f:
for line in f:
title = str(line).strip()
title = line.strip()
if title == "--END--":
break
elif seeking and title != start:

@@ -25,7 +25,7 @@ def getXMLPageCore(headers: Dict=None, params: Dict=None, config: Config=None, s
maxretries = config.retries # x retries and skip
increment = 20 # increment every retry
while not re.search(r"</mediawiki>", str(xml)):
while not re.search(r"</mediawiki>", xml):
if c > 0 and c < maxretries:
wait = (
increment * c < maxseconds and increment * c or maxseconds

@@ -27,7 +27,7 @@ def getXMLHeader(config: Config=None, session=None) -> Tuple[str, Config]:
+ "?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1",
timeout=10,
)
xml = str(r.text)
xml: str = r.text
# Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
if not re.match(r"\s*<mediawiki", xml):
r = session.get(

@@ -13,3 +13,21 @@ class ExportAbortedError(Exception):
def __str__(self):
return "Export from '%s' did not return anything." % self.index
class FileSizeError(Exception):
def __init__(self, file, size):
self.file = file
self.size = size
def __str__(self):
return "File '%s' size is not match '%s'." % (self.file, self.size)
class FileSha1Error(Exception):
def __init__(self, file, sha1):
self.file = file
self.sha1 = sha1
def __str__(self):
return "File '%s' sha1 is not match '%s'." % (self.file, self.sha1)

@@ -1,5 +1,5 @@
from .uprint import uprint
from .util import removeIP, cleanXML, cleanHTML, undoHTMLEntities
from .util import removeIP, cleanXML, cleanHTML, undoHTMLEntities, sha1File
from .user_agent import getUserAgent
from .domain import domain2prefix

@@ -1,3 +1,4 @@
import hashlib
import re
import sys
@@ -74,3 +75,15 @@ def cleanXML(xml: str = "") -> str:
if re.search(r"</mediawiki>", xml):
xml = xml.split("</mediawiki>")[0]
return xml
def sha1File(filename: str = "") -> str:
"""Return the SHA1 hash of a file"""
sha1 = hashlib.sha1()
with open(filename, "rb") as f:
while True:
data = f.read(65536)
if not data:
break
sha1.update(data)
return sha1.hexdigest()
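
For example, to verify an already-downloaded file against the digest recorded in `images.txt` (path and digest below are illustrative):

```python
from wikiteam3.utils import sha1File

recorded_sha1 = "da39a3ee5e6b4b0d3255bfef95601890afd80709"  # from images.txt
if sha1File("images/Example.png") == recorded_sha1:
    print("sha1 matched, skipping download")
```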