You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
wikiteam/wikiteam3/dumpgenerator/dump/generator.py

282 lines
11 KiB
Python

try:
import contextlib
import http.cookiejar
import os
import re
import sys
import traceback
from file_read_backwards import FileReadBackwards
except ImportError:
print(
"""
Please install poetry with:
$ pip install poetry.
Then rerun py with:
$ poetry run python py
"""
)
sys.exit(1)
from typing import *
from wikiteam3.dumpgenerator.cli import bye, getParameters, welcome
from wikiteam3.dumpgenerator.config import Config, loadConfig, saveConfig
from wikiteam3.dumpgenerator.dump.image.image import Image
from wikiteam3.dumpgenerator.dump.misc.index_php import saveIndexPHP
from wikiteam3.dumpgenerator.dump.misc.site_info import saveSiteInfo
from wikiteam3.dumpgenerator.dump.misc.special_logs import saveLogs
from wikiteam3.dumpgenerator.dump.misc.special_version import saveSpecialVersion
from wikiteam3.dumpgenerator.dump.xmldump.xml_dump import generateXMLDump
from wikiteam3.dumpgenerator.dump.xmldump.xml_integrity import checkXMLIntegrity
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.utils import avoidWikimediaProjects, domain2prefix, undoHTMLEntities
# From https://stackoverflow.com/a/57008707
class Tee:
    """Context manager that duplicates ``sys.stdout`` output into a log file.

    While the ``with`` block is active, ``sys.stdout`` is replaced by this
    object, so everything written to it goes both to the log file and to the
    stream that was installed as ``sys.stdout`` when the object was created.
    On exit the previous stream is restored, the traceback of an in-flight
    exception (if any) is appended to the log, and the file is closed. The
    exception itself is not suppressed.
    """

    def __init__(self, filename):
        """Open *filename* for writing as UTF-8 and remember the current stdout."""
        self.file = open(filename, "w", encoding="utf-8")
        self.stdout = sys.stdout

    def __enter__(self):
        """Install this object as ``sys.stdout`` and return it."""
        sys.stdout = self
        # Returning self makes ``with Tee(path) as tee:`` bind the object.
        return self

    def __exit__(self, exc_type, exc_value, tb):
        """Restore stdout, log a pending exception's traceback, close the file."""
        sys.stdout = self.stdout
        try:
            if exc_type is not None:
                # Format the exception handed to us rather than relying on
                # the interpreter's implicit "current exception" state.
                self.file.write(
                    "".join(traceback.format_exception(exc_type, exc_value, tb))
                )
        finally:
            # Close the log even if writing the traceback failed.
            self.file.close()

    def write(self, data):
        """Write *data* to the log file and to the original stdout."""
        self.file.write(data)
        self.stdout.write(data)

    def flush(self):
        """Flush both the log file and the original stdout."""
        self.file.flush()
        self.stdout.flush()
class DumpGenerator:
    """Drives a wiki dump: either a brand new one or the resumption of an old one.

    All methods are static; the class is used as a namespace.
    """

    # Name of the configuration file looked up inside the dump directory.
    configfilename = "config.json"

    # NOTE(review): ``__init__`` is declared as a static method, so it takes no
    # implicit ``self``; how callers invoke it is not visible here - confirm
    # before changing the signature.
    @staticmethod
    def __init__(params=None):
        """Main function.

        Parses the parameters, decides whether to resume an existing dump or
        start a new one (asking the user when the target path already exists),
        runs the dump and finally saves index.php, Special:Version and the
        site info.

        Args:
            params: forwarded unchanged to ``getParameters``.
        """
        configfilename = DumpGenerator.configfilename
        config, other = getParameters(params=params)
        avoidWikimediaProjects(config=config, other=other)

        with (
            Tee(other["stdout_log_path"])
            if other["stdout_log_path"] is not None
            else contextlib.nullcontext()
        ):
            print(welcome())
            print(f"Analysing {config.api if config.api else config.index}")

            # creating path or resuming if desired
            c = 2
            # to avoid concat blabla-2, blabla-2-3, and so on...
            originalpath = config.path
            # do not enter if resume is requested from the beginning
            while not other["resume"] and os.path.isdir(config.path):
                print('\nWarning!: "%s" path exists' % (config.path))
                reply = ""
                if config.failfast:
                    # fail-fast mode answers the question without prompting
                    reply = "yes"
                while reply.lower() not in ["yes", "y", "no", "n"]:
                    reply = input(
                        'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? '
                        % (config.path, config.path, configfilename)
                    )
                if reply.lower() in ["yes", "y"]:
                    if not os.path.isfile(f"{config.path}/{configfilename}"):
                        print("No config file found. I can't resume. Aborting.")
                        sys.exit()
                    print("You have selected: YES")
                    other["resume"] = True
                    break
                elif reply.lower() in ["no", "n"]:
                    print("You have selected: NO")
                    other["resume"] = False
                # try the next numbered sibling directory: path-2, path-3, ...
                config.path = "%s-%d" % (originalpath, c)
                print(f'Trying to use path "{config.path}"...')
                c += 1

            if other["resume"]:
                print("Loading config file...")
                config = loadConfig(config=config, configfilename=configfilename)
            else:
                os.mkdir(config.path)
                saveConfig(config=config, configfilename=configfilename)

            if other["resume"]:
                DumpGenerator.resumePreviousDump(config=config, other=other)
            else:
                DumpGenerator.createNewDump(config=config, other=other)

            saveIndexPHP(config=config, session=other["session"])
            saveSpecialVersion(config=config, session=other["session"])
            saveSiteInfo(config=config, session=other["session"])
            bye()

    @staticmethod
    def createNewDump(config: Config = None, other: Dict = None):
        """Generate a new dump: XML, images and logs, each only if enabled.

        Args:
            config: dump configuration; ``xml``, ``images`` and ``logs`` select
                which parts are produced.
            other: run-time options; ``other["session"]`` is passed to every
                helper.
        """
        # we do lazy title dumping here :)
        images = []
        print("Trying generating a new dump into a new directory...")
        if config.xml:
            generateXMLDump(config=config, session=other["session"])
            checkXMLIntegrity(config=config, session=other["session"])
        if config.images:
            images += Image.getImageNames(config=config, session=other["session"])
            Image.saveImageNames(
                config=config, images=images, session=other["session"]
            )
            Image.generateImageDump(
                config=config, other=other, images=images, session=other["session"]
            )
        if config.logs:
            saveLogs(config=config, session=other["session"])

    @staticmethod
    def resumePreviousDump(config: Config = None, other: Dict = None):
        """Continue a dump that was started in a previous session.

        For the XML dump, the existing file is read backwards to find out
        whether it is complete or, otherwise, which title was written last.
        For images, the saved image list is reloaded (or fetched again when it
        is incomplete) and compared against the contents of the ``images``
        directory before the image dump is resumed.

        Args:
            config: dump configuration loaded for the dump being resumed.
            other: run-time options; uses ``other["session"]`` and
                ``other["filenamelimit"]``.
        """
        images = []
        print("Resuming previous dump process...")
        if config.xml:
            # checking xml dump
            xmliscomplete = False
            lastxmltitle = None
            lastxmlrevid = None
            try:
                with FileReadBackwards(
                    "%s/%s-%s-%s.xml"
                    % (
                        config.path,
                        domain2prefix(config=config, session=other["session"]),
                        config.date,
                        "current" if config.curonly else "history",
                    ),
                    encoding="utf-8",
                ) as frb:
                    for line in frb:
                        if line.strip() == "</mediawiki>":
                            # xml dump is complete
                            xmliscomplete = True
                            break
                        if xmlrevid := re.search(r" <id>([^<]+)</id>", line):
                            lastxmlrevid = int(xmlrevid.group(1))
                        if xmltitle := re.search(r"<title>([^<]+)</title>", line):
                            lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
                            break
            except (OSError, ValueError):
                # OSError: probably the file does not exist.
                # ValueError: undecodable bytes or a non-numeric <id>, e.g. in
                # a truncated file. Either way we fall through to the
                # "corrupt" branch below; Ctrl-C is no longer swallowed here.
                pass

            if xmliscomplete:
                print("XML dump was completed in the previous session")
            elif lastxmltitle:
                # resuming...
                print(
                    f'Resuming XML dump from "{lastxmltitle}" (revision id {lastxmlrevid})'
                )
                generateXMLDump(
                    config=config,
                    session=other["session"],
                    resume=True,
                )
            else:
                # corrupt? only has XML header?
                print("XML is corrupt? Regenerating...")
                generateXMLDump(config=config, session=other["session"])

        if config.images:
            # load images list
            lastimage = ""
            imagesFilePath = "{}/{}-{}-images.txt".format(
                config.path,
                # same arguments as for the XML file name above
                domain2prefix(config=config, session=other["session"]),
                config.date,
            )
            if os.path.exists(imagesFilePath):
                # NOTE(review): assumes the list is stored as UTF-8, like the
                # other files handled in this module - confirm against
                # Image.saveImageNames.
                with open(imagesFilePath, encoding="utf-8") as f:
                    lines = f.read().splitlines()
                images.extend(
                    line.split("\t") for line in lines if re.search(r"\t", line)
                )
                if not lines:  # empty file
                    lastimage = "--EMPTY--"
                else:
                    lastimage = lines[-1].strip()
                    # tolerate one trailing blank line, but only when there
                    # is a previous line to fall back to
                    if lastimage == "" and len(lines) >= 2:
                        lastimage = lines[-2].strip()

            if images and len(images[0]) < 5:
                print(
                    "Warning: Detected old images list (images.txt) format.\n"
                    + "You can delete 'images.txt' manually and restart the script."
                )
                sys.exit(1)

            if lastimage == "--END--":
                print("Image list was completed in the previous session")
            else:
                print("Image list is incomplete. Reloading...")
                # do not resume, reload, to avoid inconsistencies, deleted
                # images or so
                images = Image.getImageNames(config=config, session=other["session"])
                Image.saveImageNames(
                    config=config, images=images, session=other["session"]
                )

            # checking images directory
            try:
                listdir = set(os.listdir(f"{config.path}/images"))
            except OSError:
                listdir = set()  # probably directory does not exist

            c_desc = 0
            c_images = 0
            c_checked = 0
            for filename, _url, _uploader, _size, _sha1 in images:
                if other["filenamelimit"] < len(filename.encode("utf-8")):
                    logerror(
                        config=config,
                        to_stdout=True,
                        text=f"Filename too long(>240 bytes), skipping: {filename}",
                    )
                    continue
                if filename in listdir:
                    c_images += 1
                if f"{filename}.desc" in listdir:
                    c_desc += 1
                c_checked += 1
                if c_checked % 100000 == 0:
                    print(f"checked {c_checked}/{len(images)} records", end="\r")
            print(
                f"{len(images)} records in images.txt, {c_images} images and {c_desc} .desc were saved in the previous session"
            )

            if c_desc < len(images):
                complete = False
            elif c_images < len(images):
                # every .desc is present but some image files are missing
                print(
                    "WARNING: Some images were not saved. You may want to delete their \n"
                    + ".desc files and re-run the script to redownload the missing images.\n"
                    + "(If images URL are unavailable, you can ignore this warning.)\n"
                    + "(In most cases, if the number of .desc files equals the number of \n"
                    + "images.txt records, you can ignore this warning, images dump was completed.)"
                )
                sys.exit()
            else:  # c_desc == c_images == len(images)
                complete = True

            if complete:
                # image dump is complete
                print("Image dump was completed in the previous session")
            else:
                # we resume from previous image, which may be corrupted (or
                # missing .desc) by the previous session ctrl-c or abort
                Image.generateImageDump(
                    config=config,
                    other=other,
                    images=images,
                    session=other["session"],
                )