From 43945c467f5912c39bef43e43154fc0b92b51929 Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Sun, 11 Sep 2022 16:41:17 -0700 Subject: [PATCH 1/2] Work around unicode titles not working with resuming Before, you would get UnicodeWarning: Unicode unequal comparison failed to convert both arguments to Unicode - interpreting them as being unequal. The %s versus {} change was needed because otherwise you would get UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-5: ordinal not in range(128). There is probably a better way of solving that, but this one does work. --- dumpgenerator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index bd27ff1..7c671ef 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -730,7 +730,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None): if config['xmlrevisions']: if start: - print("WARNING: will try to start the download from title: {}".format(start)) + print("WARNING: will try to start the download from title: %s" % start) xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a') else: print 'Retrieving the XML for every page from the beginning' @@ -1160,7 +1160,7 @@ def reverse_readline(filename, buf_size=8192, truncate=False): fh.truncate raise StopIteration else: - yield segment + yield segment.decode('utf-8') segment = lines[0] for index in range(len(lines) - 1, 0, -1): if truncate and '' in segment: @@ -1169,8 +1169,8 @@ def reverse_readline(filename, buf_size=8192, truncate=False): fh.truncate raise StopIteration else: - yield lines[index] - yield segment + yield lines[index].decode('utf-8') + yield segment.decode('utf-8') def saveImageNames(config={}, images=[], session=None): """ Save image list in a file, including filename, url and uploader """ From 9b2c6e40ae5ed33f8821365ea64946da4b6fc6ee Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Fri, 16 Sep 2022 22:19:59 -0700 Subject: [PATCH 2/2] Fix truncation when resuming 
There already was code that looks like it was supposed to truncate files, but it calculated the index wrong and didn't properly check all lines. It worked out, though, because it didn't actually call the truncate function. Now, truncation occurs to the last `</page>` tag. If the XML file ends with a `</page>` tag, then nothing gets truncated. The page is added after that; if nothing was truncated, this will result in the same page being listed twice (which already happened with the missing truncation), but if truncation did happen then the file should no longer be invalid. --- dumpgenerator.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 7c671ef..c5d109c 100755 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -1155,21 +1155,19 @@ def reverse_readline(filename, buf_size=8192, truncate=False): lines[-1] += segment else: if truncate and '' in segment: - pages = buffer.split('') - fh.seek(-offset+buf_size-len(pages[-1]), os.SEEK_END) - fh.truncate + fh.seek(-offset+buffer.rindex('')+len('\n'), os.SEEK_END) + fh.truncate() raise StopIteration else: yield segment.decode('utf-8') - segment = lines[0] for index in range(len(lines) - 1, 0, -1): + segment = lines[index] if truncate and '' in segment: - pages = buffer.split('') - fh.seek(-offset-len(pages[-1]), os.SEEK_END) - fh.truncate + fh.seek(-offset+buffer.rindex('\n')+len('\n'), os.SEEK_END) + fh.truncate() raise StopIteration else: - yield lines[index].decode('utf-8') + yield segment.decode('utf-8') yield segment.decode('utf-8') def saveImageNames(config={}, images=[], session=None):