Force str() everywhere (also misc stdout formatting)

Signed-off-by: Elsie Hupp <github@elsiehupp.com>
pull/446/head
Elsie Hupp 2 years ago
parent a436ea191c
commit efcde3006e

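Two changes recur throughout the hunks below: text pulled from a requests response (or returned by helpers such as cleanHTML() and getXMLPageCore()) is now defensively coerced with str() before it is regex-searched, concatenated, or written out, and the old module-level delay() helper is replaced by a Delay class whose constructor sleeps while animating an ellipsis on stdout. A minimal illustration of the str() pattern follows; check_mediawiki() is a hypothetical helper invented for this sketch, but the coercion and the regex mirror the call sites changed below.

# Sketch only, not part of the commit: force str() before matching,
# the same way the changed call sites below now do.
import re

def check_mediawiki(body) -> bool:
    """Return True if the (possibly non-str) body looks like a MediaWiki export."""
    text = str(body)
    return bool(re.match(r"\s*<mediawiki", text))

print(check_mediawiki('<mediawiki xml:lang="en">'))  # True
print(check_mediawiki(None))                         # False ("None" does not match)
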
@@ -306,4 +306,13 @@ def getParameters(params=[]):
print("Which expands to:")
print(" " + config["path"])
if config["delay"] == 0.5:
print("--delay is the default value of 0.5")
print(
"There will be a 0.5 second delay between HTTP calls in order to keep the server from timing you out."
)
print(
"If you know that this is unnecessary, you can manually specify '--delay 0.0'."
)
return config, other

@@ -1,8 +1,37 @@
import itertools
import threading
import time
import sys
def delay(config={}, session=None):
"""Add a delay if configured for that"""
if config["delay"] > 0:
print("Sleeping... %.2f seconds..." % (config["delay"]))
time.sleep(config["delay"])
class Delay:
done: bool = True
ellipses: str = "."
def animate(self):
try:
while not self.done:
sys.stdout.write("\r " + self.ellipses)
sys.stdout.flush()
self.ellipses += "."
time.sleep(0.1)
except KeyboardInterrupt:
sys.exit()
def __init__(self, config={}, session=None):
"""Add a delay if configured for that"""
if config["delay"] > 0:
self.done = False
ellipses_animation = threading.Thread(target=self.animate)
ellipses_animation.start()
# sys.stdout.write("\rSleeping %.2f seconds..." % (config["delay"]))
# sys.stdout.flush()
time.sleep(config["delay"])
self.done = True
sys.stdout.write("\r \r")
sys.stdout.flush()

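For reference, here is a self-contained sketch of the spinner pattern the new Delay class implements. It is simplified: the real constructor takes config and session and reads config["delay"], while this sketch takes the number of seconds directly, names the class EllipsisDelay, and adds a daemon flag and a fixed-width line clear that are not in the commit.

import sys
import threading
import time

class EllipsisDelay:
    """Sleep for `seconds` while a background thread animates dots on stdout."""

    def __init__(self, seconds: float):
        self.done = seconds <= 0
        if self.done:
            return
        spinner = threading.Thread(target=self.animate, daemon=True)
        spinner.start()
        time.sleep(seconds)
        self.done = True
        sys.stdout.write("\r" + " " * 40 + "\r")  # erase the dots
        sys.stdout.flush()

    def animate(self):
        dots = "."
        while not self.done:
            sys.stdout.write("\r " + dots)
            sys.stdout.flush()
            dots += "."
            time.sleep(0.1)

if __name__ == "__main__":
    EllipsisDelay(1.5)  # shows a growing "..." for 1.5 s, then clears the line

Because the sleep happens in the constructor, call sites only need to swap delay(config=config, session=session) for Delay(config=config, session=session), which is exactly what the remaining hunks do.
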
@@ -28,7 +28,7 @@ def welcome():
"# Copyright (C) 2011-%d WikiTeam developers #\n"
% (datetime.datetime.now().year)
)
message += """
message += """# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #

@@ -3,7 +3,7 @@ import re
import sys
import urllib
from .delay import delay
from .delay import Delay
from .domain import domain2prefix
from .exceptions import PageMissingError
from .get_json import getJSON
@@ -46,7 +46,7 @@ class Image:
lock = False
if lock:
continue
delay(config=config, session=session)
Delay(config=config, session=session)
# saving file
# truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash
@@ -162,12 +162,12 @@ class Image:
text=f"File {imagepath}/{filename2}.desc could not be created by OS",
)
delay(config=config, session=session)
Delay(config=config, session=session)
c += 1
if c % 10 == 0:
print(" Downloaded %d images" % (c))
print(f"\n-> Downloaded {c} images\n")
print("Downloaded %d images" % (c))
print(f"\n-> Downloaded {c} images\n")
def getImageNames(config={}, session=None):
"""Get list of image names"""
@@ -203,8 +203,8 @@ class Image:
params={"title": "Special:Imagelist", "limit": limit, "offset": offset},
timeout=30,
)
raw = r.text
delay(config=config, session=session)
raw = str(r.text)
Delay(config=config, session=session)
# delicate wiki
if re.search(
r"(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)",
@@ -225,7 +225,7 @@ class Image:
print("No more retries, exit...")
break
raw = cleanHTML(raw)
raw = str(cleanHTML(raw))
# archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
# wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
# href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
@@ -311,7 +311,7 @@ class Image:
r = session.get(url=config["api"], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
if "query" in jsonimages:
aifrom = ""
@@ -386,7 +386,7 @@ class Image:
r = session.get(url=config["api"], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
if "query" in jsonimages:
gapfrom = ""

@@ -7,7 +7,7 @@ def checkIndex(index=None, cookies=None, session=None):
if r.status_code >= 400:
print(f"ERROR: The wiki returned status code HTTP {r.status_code}")
return False
raw = r.text
raw = str(r.text)
print("Checking index.php...", index)
# Workaround for issue 71
if (

@@ -1,6 +1,6 @@
import os
from .delay import delay
from .delay import Delay
from .util import removeIP
@@ -12,8 +12,8 @@ def saveIndexPHP(config={}, session=None):
else:
print("Downloading index.php (Main Page) as index.html")
r = session.post(url=config["index"], params={}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
raw = str(r.text)
Delay(config=config, session=session)
raw = str(removeIP(raw=raw))
with open("%s/index.html" % (config["path"]), "w", encoding="utf-8") as outfile:
outfile.write(str(raw))

@@ -1,4 +1,4 @@
from .delay import delay
from .delay import Delay
def saveLogs(config={}, session=None):
@@ -19,4 +19,4 @@ def saveLogs(config={}, session=None):
<option value="">Todos los registros</option>
</select>
"""
delay(config=config, session=session)
Delay(config=config, session=session)

@@ -1,6 +1,6 @@
import re
from .delay import delay
from .delay import Delay
from .get_json import getJSON
@@ -13,8 +13,8 @@ def getNamespacesScraper(config={}, session=None):
r = session.post(
url=config["index"], params={"title": "Special:Allpages"}, timeout=30
)
raw = r.text
delay(config=config, session=session)
raw = str(r.text)
Delay(config=config, session=session)
# [^>]*? to include selected="selected"
m = re.compile(
@@ -59,7 +59,7 @@ def getNamespacesAPI(config={}, session=None):
timeout=30,
)
result = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
try:
nsquery = result["query"]["namespaces"]
except KeyError:

@@ -1,6 +1,6 @@
import os
from .delay import delay
from .delay import Delay
from .util import removeIP
@@ -14,9 +14,9 @@ def saveSpecialVersion(config={}, session=None):
r = session.post(
url=config["index"], params={"title": "Special:Version"}, timeout=10
)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
raw = str(r.text)
Delay(config=config, session=session)
raw = str(removeIP(raw=raw))
with open(
"%s/Special:Version.html" % (config["path"]), "w", encoding="utf-8"
) as outfile:

@@ -1,9 +1,10 @@
import re
import sys
from urllib.parse import urlparse
import mwclient
from .delay import delay
from .delay import Delay
from .domain import domain2prefix
from .namespaces import getNamespacesAPI, getNamespacesScraper
from .util import cleanHTML, undoHTMLEntities
@@ -19,7 +20,7 @@ def getPageTitlesAPI(config={}, session=None):
continue
c = 0
print(" Retrieving titles in the namespace %d" % (namespace))
sys.stdout.write(" Retrieving titles in the namespace %d" % (namespace))
apiurl = urlparse(config["api"])
site = mwclient.Site(
apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme
@@ -34,8 +35,11 @@ def getPageTitlesAPI(config={}, session=None):
print("Probably a loop, switching to next namespace")
titles = list(set(titles))
print(" %d titles retrieved in the namespace %d" % (c, namespace))
delay(config=config, session=session)
sys.stdout.write(
"\r %d titles retrieved in the namespace %d\n" % (c, namespace)
)
sys.stdout.flush()
Delay(config=config, session=session)
def getPageTitlesScraper(config={}, session=None):
@@ -48,8 +52,8 @@ def getPageTitlesScraper(config={}, session=None):
config["index"], namespace
)
r = session.get(url=url, timeout=30)
raw = r.text
raw = cleanHTML(raw)
raw = str(r.text)
raw = str(cleanHTML(raw))
r_title = 'title="(?P<title>[^>]+)">'
r_suballpages = ""
@@ -114,10 +118,10 @@ def getPageTitlesScraper(config={}, session=None):
if name not in checked_suballpages:
# to avoid reload dupe subpages links
checked_suballpages.append(name)
delay(config=config, session=session)
Delay(config=config, session=session)
r = session.get(url=url, timeout=10)
# print ('Fetching URL: ', url)
raw = r.text
raw = str(r.text)
raw = cleanHTML(raw)
rawacum += raw # merge it after removed junk
print(
@@ -131,7 +135,7 @@ def getPageTitlesScraper(config={}, session=None):
"pages",
)
delay(config=config, session=session)
Delay(config=config, session=session)
oldfr = currfr
c += 1

@@ -23,7 +23,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
maxretries = config["retries"] # x retries and skip
increment = 20 # increment every retry
while not re.search(r"</mediawiki>", xml):
while not re.search(r"</mediawiki>", str(xml)):
if c > 0 and c < maxretries:
wait = (
increment * c < maxseconds and increment * c or maxseconds
@@ -86,7 +86,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
xml = ""
c += 1
return xml
return str(xml)
def getXMLPage(config={}, title="", verbose=True, session=None):
@@ -114,7 +114,7 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
if "templates" in config and config["templates"]:
params["templates"] = 1
xml = getXMLPageCore(params=params, config=config, session=session)
xml = str(getXMLPageCore(params=params, config=config, session=session))
if xml == "":
raise ExportAbortedError(config["index"])
if "</page>" not in xml:
@@ -132,8 +132,8 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
# else, warning about Special:Export truncating large page histories
r_timestamp = "<timestamp>([^<]+)</timestamp>"
numberofedits = 0
numberofedits += len(re.findall(r_timestamp, xml))
edit_count = 0
edit_count += len(re.findall(r_timestamp, xml))
# search for timestamps in xml to avoid analysing empty pages like
# Special:Allpages and the random one
@@ -183,16 +183,16 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
params["limit"] = params["limit"] / 2
continue
xml = xml2
numberofedits += len(re.findall(r_timestamp, xml))
edit_count += len(re.findall(r_timestamp, xml))
else:
params["offset"] = "" # no more edits in this page history
yield "</page>\n"
if verbose:
if numberofedits == 1:
if edit_count == 1:
uprint(" %s, 1 edit" % (title.strip()))
else:
uprint(" %s, %d edits" % (title.strip(), numberofedits))
uprint(" %s, %d edits" % (title.strip(), edit_count))
def makeXmlPageFromRaw(xml):
@@ -252,11 +252,11 @@ def makeXmlFromPage(page):
except KeyError as e:
print(e)
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")
return str(etree.tostring(p, pretty_print=True, encoding="utf-8"))
def fixBOM(request):
"""Strip Unicode BOM"""
if request.text.startswith("\ufeff"):
request.encoding = "utf-8-sig"
return request.text
return str(request.text)

@@ -1,7 +1,7 @@
import json
import os
from .delay import delay
from .delay import Delay
from .get_json import getJSON
@@ -51,7 +51,7 @@ def saveSiteInfo(config={}, session=None):
timeout=10,
)
result = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
with open(
"%s/siteinfo.json" % (config["path"]), "w", encoding="utf-8"
) as outfile:

@@ -29,7 +29,7 @@ def cleanHTML(raw=""):
print(raw[:250])
print("This wiki doesn't use marks to split content")
sys.exit()
return raw
return str(raw)
def undoHTMLEntities(text=""):
@@ -68,8 +68,9 @@ def cleanXML(xml=""):
# for Windows compatibility.
# If the encoding has to stay as is, we'll have
# to change all the file encodings, as well.
if re.search(rb"</siteinfo>\n", xml):
xml = xml.split("</siteinfo>\n")[1].encode("utf-8")
if re.search(rb"</mediawiki>", xml):
xml = xml.split("</mediawiki>")[0].encode("utf-8")
xml = str(xml)
if re.search(r"</siteinfo>\n", str(xml)):
xml = xml.split("</siteinfo>\n")[1]
if re.search(r"</mediawiki>", str(xml)):
xml = xml.split("</mediawiki>")[0]
return xml

@@ -13,7 +13,7 @@ def getWikiEngine(url=""):
r = session.post(url=url, timeout=30)
if r.status_code == 405 or r.text == "":
r = session.get(url=url, timeout=120)
result = r.text
result = str(r.text)
wikiengine = "Unknown"
if re.search(

@@ -1,7 +1,7 @@
import re
import sys
from .delay import delay
from .delay import Delay
from .domain import domain2prefix
from .exceptions import PageMissingError
from .log_error import logerror
@@ -34,7 +34,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
"{}/{}".format(config["path"], xmlfilename), "a", encoding="utf-8"
)
else:
print("Retrieving the XML for every page from the beginning")
print("\nRetrieving the XML for every page from the beginning\n")
xmlfile = open(
"{}/{}".format(config["path"], xmlfilename), "w", encoding="utf-8"
)
@@ -46,7 +46,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
# Due to how generators work, it's expected this may be less
# TODO: get the page title and reuse the usual format "X title, y edits"
print(" %d more revisions exported" % numrevs)
xml = cleanXML(xml=xml)
xml = str(cleanXML(xml=xml))
xmlfile.write(str(xml))
except AttributeError as e:
print(e)
@@ -57,7 +57,8 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
else:
print(
'Retrieving the XML for every page from "%s"' % (start and start or "start")
'\nRetrieving the XML for every page from "%s"\n'
% (start and start or "start")
)
if start:
print(
@@ -84,9 +85,9 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
lock = False
if lock:
continue
delay(config=config, session=session)
Delay(config=config, session=session)
if c % 10 == 0:
print("Downloaded %d pages" % (c))
print(f"\n-> Downloaded {c} pages\n")
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)

@@ -26,7 +26,7 @@ def getXMLHeader(config={}, session=None):
+ "?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1",
timeout=10,
)
xml = r.text
xml = str(r.text)
# Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
if not re.match(r"\s*<mediawiki", xml):
r = requests.get(
@@ -35,7 +35,7 @@ def getXMLHeader(config={}, session=None):
timeout=10,
)
try:
xml = r.json()["query"]["export"]["*"]
xml = str(r.json()["query"]["export"]["*"])
except KeyError:
pass
if not re.match(r"\s*<mediawiki", xml):
@@ -46,7 +46,7 @@ def getXMLHeader(config={}, session=None):
+ randomtitle,
timeout=10,
)
xml = r.text
xml = str(r.text)
# Again try without exportnowrap
if not re.match(r"\s*<mediawiki", xml):
r = requests.get(
@@ -56,7 +56,7 @@ def getXMLHeader(config={}, session=None):
timeout=10,
)
try:
xml = r.json()["query"]["export"]["*"]
xml = str(r.json()["query"]["export"]["*"])
except KeyError:
pass
except requests.exceptions.RetryError:
@@ -74,7 +74,7 @@ def getXMLHeader(config={}, session=None):
)
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
xml = str(pme.xml)
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
@@ -107,10 +107,11 @@ def getXMLHeader(config={}, session=None):
]
)
except PageMissingError as pme:
xml = pme.xml
xml = str(pme.xml)
except ExportAbortedError:
pass
xml = str(xml)
header = xml.split("</mediawiki>")[0]
if not re.match(r"\s*<mediawiki", xml):
if config["xmlrevisions"]:
@@ -121,7 +122,8 @@ def getXMLHeader(config={}, session=None):
config["xmlrevisions"] = False
header, config = getXMLHeader(config=config, session=session)
else:
print(xml)
print("XML export on this wiki is broken, quitting.")
logerror("XML export on this wiki is broken, quitting.")
sys.exit()
return header, config
return str(header), config

@@ -141,7 +141,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# but we only need the inner <page>: we can live with
# duplication and non-ordering of page titles, but the
# repeated header is confusing and would not even be valid
xml = exportrequest["query"]["export"]["*"]
xml = str(exportrequest["query"]["export"]["*"])
yield makeXmlPageFromRaw(xml)
if "continue" in arvrequest:
@@ -210,10 +210,10 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
http_method=config["http_method"], **exportparams
)
xml = exportrequest["query"]["export"]["*"]
xml = str(exportrequest["query"]["export"]["*"])
c += 1
if c % 10 == 0:
print(f"Downloaded {c} pages")
print(f"\n-> Downloaded {c} pages\n")
# Because we got the fancy XML from the JSON format, clean it:
yield makeXmlPageFromRaw(xml)
else:
@@ -279,8 +279,8 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# Go through the data we got to build the XML.
for pageid in pages:
try:
xml = makeXmlFromPage(pages[pageid])
yield xml
xml = str(makeXmlFromPage(pages[pageid]))
yield str(xml)
except PageMissingError:
logerror(
config=config,
@@ -320,7 +320,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# Reset for the next batch.
titlelist = []
if c % 10 == 0:
print(f"Downloaded {c} pages")
print(f"\n-> Downloaded {c} pages\n")
except mwclient.errors.MwClientError as e:
print(e)

@@ -14,5 +14,16 @@ def truncateXMLDump(filename: str) -> None:
xml_line = frb.readline()
incomplete_segment_size = len(incomplete_segment.encode("utf-8"))
file_size = os.path.getsize(filename)
with open(filename, "r+", encoding="utf-8") as fh:
fh.truncate(file_size - incomplete_segment_size)
if file_size > incomplete_segment_size:
with open(filename, "r+", encoding="utf-8") as fh:
fh.truncate(file_size - incomplete_segment_size)
else:
print(
'len(incomplete_segment.encode("utf-8")) returned '
+ str(incomplete_segment_size)
+ ", while os.path.getsize(filename) returned "
+ str(file_size)
+ ", so fh.truncate() would be fh.truncate("
+ str(file_size - incomplete_segment_size)
+ "), which would be illegal. Something is seriously wrong here!"
)

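The new guard above exists because truncating a file to a negative size is invalid; when the measured incomplete segment is larger than the file itself, the function now reports the sizes instead of calling fh.truncate(). A minimal sketch of the same check against a throwaway file (the tempfile setup is illustrative, not part of the commit):

import os
import tempfile

# Stand-in for an XML dump whose trailing segment was measured as "incomplete".
with tempfile.NamedTemporaryFile("w", suffix=".xml", delete=False) as tmp:
    tmp.write("<page>partial")
    filename = tmp.name

file_size = os.path.getsize(filename)
incomplete_segment_size = file_size + 10  # pretend the tail is larger than the file

if file_size > incomplete_segment_size:
    with open(filename, "r+", encoding="utf-8") as fh:
        fh.truncate(file_size - incomplete_segment_size)
else:
    # Mirrors the new error path: report the sizes instead of truncating.
    print(f"incomplete segment is {incomplete_segment_size} bytes but the file is "
          f"only {file_size} bytes; refusing to truncate to a negative size")

os.remove(filename)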