Force str() everywhere (also misc stdout formatting)

Signed-off-by: Elsie Hupp <github@elsiehupp.com>
pull/446/head
Elsie Hupp 2 years ago
parent a436ea191c
commit efcde3006e

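Two changes recur throughout the hunks below: text pulled from a requests response (or returned by helpers such as cleanHTML() and getXMLPageCore()) is now defensively coerced with str() before it is regex-searched, concatenated, or written out, and the old module-level delay() helper is replaced by a Delay class whose constructor sleeps while animating an ellipsis on stdout. A minimal illustration of the str() pattern follows; check_mediawiki() is a hypothetical helper invented for this sketch, but the coercion and the regex mirror the call sites changed below.

# Sketch only, not part of the commit: force str() before matching,
# the same way the changed call sites below now do.
import re

def check_mediawiki(body) -> bool:
    """Return True if the (possibly non-str) body looks like a MediaWiki export."""
    text = str(body)
    return bool(re.match(r"\s*<mediawiki", text))

print(check_mediawiki('<mediawiki xml:lang="en">'))  # True
print(check_mediawiki(None))                         # False ("None" does not match)
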
@@ -306,4 +306,13 @@ def getParameters(params=[]):
print("Which expands to:")
print(" " + config["path"])
if config["delay"] == 0.5:
print("--delay is the default value of 0.5")
print(
"There will be a 0.5 second delay between HTTP calls in order to keep the server from timing you out."
)
print(
"If you know that this is unnecessary, you can manually specify '--delay 0.0'."
)
return config, other

@@ -1,8 +1,37 @@
import itertools
import threading
import time
import sys
def delay(config={}, session=None):
"""Add a delay if configured for that"""
if config["delay"] > 0:
print("Sleeping... %.2f seconds..." % (config["delay"]))
time.sleep(config["delay"])
class Delay:
done: bool = True
ellipses: str = "."
def animate(self):
try:
while not self.done:
sys.stdout.write("\r " + self.ellipses)
sys.stdout.flush()
self.ellipses += "."
time.sleep(0.1)
except KeyboardInterrupt:
sys.exit()
def __init__(self, config={}, session=None):
"""Add a delay if configured for that"""
if config["delay"] > 0:
self.done = False
ellipses_animation = threading.Thread(target=self.animate)
ellipses_animation.start()
# sys.stdout.write("\rSleeping %.2f seconds..." % (config["delay"]))
# sys.stdout.flush()
time.sleep(config["delay"])
self.done = True
sys.stdout.write("\r \r")
sys.stdout.flush()

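For reference, here is a self-contained sketch of the spinner pattern the new Delay class implements. It is simplified: the real constructor takes config and session and reads config["delay"], while this sketch takes the number of seconds directly, names the class EllipsisDelay, and adds a daemon flag and a fixed-width line clear that are not in the commit.

import sys
import threading
import time

class EllipsisDelay:
    """Sleep for `seconds` while a background thread animates dots on stdout."""

    def __init__(self, seconds: float):
        self.done = seconds <= 0
        if self.done:
            return
        spinner = threading.Thread(target=self.animate, daemon=True)
        spinner.start()
        time.sleep(seconds)
        self.done = True
        sys.stdout.write("\r" + " " * 40 + "\r")  # erase the dots
        sys.stdout.flush()

    def animate(self):
        dots = "."
        while not self.done:
            sys.stdout.write("\r " + dots)
            sys.stdout.flush()
            dots += "."
            time.sleep(0.1)

if __name__ == "__main__":
    EllipsisDelay(1.5)  # shows a growing "..." for 1.5 s, then clears the line

Because the sleep happens in the constructor, call sites only need to swap delay(config=config, session=session) for Delay(config=config, session=session), which is exactly what the remaining hunks do.
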
@@ -28,7 +28,7 @@ def welcome():
"# Copyright (C) 2011-%d WikiTeam developers #\n"
% (datetime.datetime.now().year)
)
message += """
message += """# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #

@@ -3,7 +3,7 @@ import re
import sys
import urllib
from .delay import delay
from .delay import Delay
from .domain import domain2prefix
from .exceptions import PageMissingError
from .get_json import getJSON
@@ -46,7 +46,7 @@ class Image:
lock = False
if lock:
continue
delay(config=config, session=session)
Delay(config=config, session=session)
# saving file
# truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash
@@ -162,12 +162,12 @@ class Image:
text=f"File {imagepath}/{filename2}.desc could not be created by OS",
)
delay(config=config, session=session)
Delay(config=config, session=session)
c += 1
if c % 10 == 0:
print(" Downloaded %d images" % (c))
print(f"\n-> Downloaded {c} images\n")
print("Downloaded %d images" % (c))
print(f"\n-> Downloaded {c} images\n")
def getImageNames(config={}, session=None):
"""Get list of image names"""
@@ -203,8 +203,8 @@ class Image:
params={"title": "Special:Imagelist", "limit": limit, "offset": offset},
timeout=30,
)
raw = r.text
delay(config=config, session=session)
raw = str(r.text)
Delay(config=config, session=session)
# delicate wiki
if re.search(
r"(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)",
@@ -225,7 +225,7 @@ class Image:
print("No more retries, exit...")
break
raw = cleanHTML(raw)
raw = str(cleanHTML(raw))
# archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
# wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
# href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
@@ -311,7 +311,7 @@ class Image:
r = session.get(url=config["api"], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
if "query" in jsonimages:
aifrom = ""
@@ -386,7 +386,7 @@ class Image:
r = session.get(url=config["api"], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
if "query" in jsonimages:
gapfrom = ""

@@ -7,7 +7,7 @@ def checkIndex(index=None, cookies=None, session=None):
if r.status_code >= 400:
print(f"ERROR: The wiki returned status code HTTP {r.status_code}")
return False
raw = r.text
raw = str(r.text)
print("Checking index.php...", index)
# Workaround for issue 71
if (

@@ -1,6 +1,6 @@
import os
from .delay import delay
from .delay import Delay
from .util import removeIP
@@ -12,8 +12,8 @@ def saveIndexPHP(config={}, session=None):
else:
print("Downloading index.php (Main Page) as index.html")
r = session.post(url=config["index"], params={}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
raw = str(r.text)
Delay(config=config, session=session)
raw = str(removeIP(raw=raw))
with open("%s/index.html" % (config["path"]), "w", encoding="utf-8") as outfile:
outfile.write(str(raw))

@@ -1,4 +1,4 @@
from .delay import delay
from .delay import Delay
def saveLogs(config={}, session=None):
@@ -19,4 +19,4 @@ def saveLogs(config={}, session=None):
<option value="">Todos los registros</option>
</select>
"""
delay(config=config, session=session)
Delay(config=config, session=session)

@@ -1,6 +1,6 @@
import re
from .delay import delay
from .delay import Delay
from .get_json import getJSON
@@ -13,8 +13,8 @@ def getNamespacesScraper(config={}, session=None):
r = session.post(
url=config["index"], params={"title": "Special:Allpages"}, timeout=30
)
raw = r.text
delay(config=config, session=session)
raw = str(r.text)
Delay(config=config, session=session)
# [^>]*? to include selected="selected"
m = re.compile(
@@ -59,7 +59,7 @@ def getNamespacesAPI(config={}, session=None):
timeout=30,
)
result = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
try:
nsquery = result["query"]["namespaces"]
except KeyError:

@@ -1,6 +1,6 @@
import os
from .delay import delay
from .delay import Delay
from .util import removeIP
@@ -14,9 +14,9 @@ def saveSpecialVersion(config={}, session=None):
r = session.post(
url=config["index"], params={"title": "Special:Version"}, timeout=10
)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
raw = str(r.text)
Delay(config=config, session=session)
raw = str(removeIP(raw=raw))
with open(
"%s/Special:Version.html" % (config["path"]), "w", encoding="utf-8"
) as outfile:

@@ -1,9 +1,10 @@
import re
import sys
from urllib.parse import urlparse
import mwclient
from .delay import delay
from .delay import Delay
from .domain import domain2prefix
from .namespaces import getNamespacesAPI, getNamespacesScraper
from .util import cleanHTML, undoHTMLEntities
@@ -19,7 +20,7 @@ def getPageTitlesAPI(config={}, session=None):
continue
c = 0
print(" Retrieving titles in the namespace %d" % (namespace))
sys.stdout.write(" Retrieving titles in the namespace %d" % (namespace))
apiurl = urlparse(config["api"])
site = mwclient.Site(
apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme
@@ -34,8 +35,11 @@ def getPageTitlesAPI(config={}, session=None):
print("Probably a loop, switching to next namespace")
titles = list(set(titles))
print(" %d titles retrieved in the namespace %d" % (c, namespace))
delay(config=config, session=session)
sys.stdout.write(
"\r %d titles retrieved in the namespace %d\n" % (c, namespace)
)
sys.stdout.flush()
Delay(config=config, session=session)
def getPageTitlesScraper(config={}, session=None):
@@ -48,8 +52,8 @@ def getPageTitlesScraper(config={}, session=None):
config["index"], namespace
)
r = session.get(url=url, timeout=30)
raw = r.text
raw = cleanHTML(raw)
raw = str(r.text)
raw = str(cleanHTML(raw))
r_title = 'title="(?P<title>[^>]+)">'
r_suballpages = ""
@@ -114,10 +118,10 @@ def getPageTitlesScraper(config={}, session=None):
if name not in checked_suballpages:
# to avoid reload dupe subpages links
checked_suballpages.append(name)
delay(config=config, session=session)
Delay(config=config, session=session)
r = session.get(url=url, timeout=10)
# print ('Fetching URL: ', url)
raw = r.text
raw = str(r.text)
raw = cleanHTML(raw)
rawacum += raw # merge it after removed junk
print(
@@ -131,7 +135,7 @@ def getPageTitlesScraper(config={}, session=None):
"pages",
)
delay(config=config, session=session)
Delay(config=config, session=session)
oldfr = currfr
c += 1

@@ -23,7 +23,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
maxretries = config["retries"] # x retries and skip
increment = 20 # increment every retry
while not re.search(r"</mediawiki>", xml):
while not re.search(r"</mediawiki>", str(xml)):
if c > 0 and c < maxretries:
wait = (
increment * c < maxseconds and increment * c or maxseconds
@@ -86,7 +86,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
xml = ""
c += 1
return xml
return str(xml)
def getXMLPage(config={}, title="", verbose=True, session=None):
@@ -114,7 +114,7 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
if "templates" in config and config["templates"]:
params["templates"] = 1
xml = getXMLPageCore(params=params, config=config, session=session)
xml = str(getXMLPageCore(params=params, config=config, session=session))
if xml == "":
raise ExportAbortedError(config["index"])
if "</page>" not in xml:
@@ -132,8 +132,8 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
# else, warning about Special:Export truncating large page histories
r_timestamp = "<timestamp>([^<]+)</timestamp>"
numberofedits = 0
numberofedits += len(re.findall(r_timestamp, xml))
edit_count = 0
edit_count += len(re.findall(r_timestamp, xml))
# search for timestamps in xml to avoid analysing empty pages like
# Special:Allpages and the random one
@@ -183,16 +183,16 @@ def getXMLPage(config={}, title="", verbose=True, session=None):
params["limit"] = params["limit"] / 2
continue
xml = xml2
numberofedits += len(re.findall(r_timestamp, xml))
edit_count += len(re.findall(r_timestamp, xml))
else:
params["offset"] = "" # no more edits in this page history
yield "</page>\n"
if verbose:
if numberofedits == 1:
if edit_count == 1:
uprint(" %s, 1 edit" % (title.strip()))
else:
uprint(" %s, %d edits" % (title.strip(), numberofedits))
uprint(" %s, %d edits" % (title.strip(), edit_count))
def makeXmlPageFromRaw(xml):
@@ -252,11 +252,11 @@ def makeXmlFromPage(page):
except KeyError as e:
print(e)
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode")
return str(etree.tostring(p, pretty_print=True, encoding="utf-8"))
def fixBOM(request):
"""Strip Unicode BOM"""
if request.text.startswith("\ufeff"):
request.encoding = "utf-8-sig"
return request.text
return str(request.text)

@@ -1,7 +1,7 @@
import json
import os
from .delay import delay
from .delay import Delay
from .get_json import getJSON
@@ -51,7 +51,7 @@ def saveSiteInfo(config={}, session=None):
timeout=10,
)
result = getJSON(r)
delay(config=config, session=session)
Delay(config=config, session=session)
with open(
"%s/siteinfo.json" % (config["path"]), "w", encoding="utf-8"
) as outfile:

@@ -29,7 +29,7 @@ def cleanHTML(raw=""):
print(raw[:250])
print("This wiki doesn't use marks to split content")
sys.exit()
return raw
return str(raw)
def undoHTMLEntities(text=""):
@@ -68,8 +68,9 @@ def cleanXML(xml=""):
# for Windows compatibility.
# If the encoding has to stay as is, we'll have
# to change all the file encodings, as well.
if re.search(rb"</siteinfo>\n", xml):
xml = xml.split("</siteinfo>\n")[1].encode("utf-8")
if re.search(rb"</mediawiki>", xml):
xml = xml.split("</mediawiki>")[0].encode("utf-8")
xml = str(xml)
if re.search(r"</siteinfo>\n", str(xml)):
xml = xml.split("</siteinfo>\n")[1]
if re.search(r"</mediawiki>", str(xml)):
xml = xml.split("</mediawiki>")[0]
return xml

@@ -13,7 +13,7 @@ def getWikiEngine(url=""):
r = session.post(url=url, timeout=30)
if r.status_code == 405 or r.text == "":
r = session.get(url=url, timeout=120)
result = r.text
result = str(r.text)
wikiengine = "Unknown"
if re.search(

@@ -1,7 +1,7 @@
import re
import sys
from .delay import delay
from .delay import Delay
from .domain import domain2prefix
from .exceptions import PageMissingError
from .log_error import logerror
@@ -34,7 +34,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
"{}/{}".format(config["path"], xmlfilename), "a", encoding="utf-8"
)
else:
print("Retrieving the XML for every page from the beginning")
print("\nRetrieving the XML for every page from the beginning\n")
xmlfile = open(
"{}/{}".format(config["path"], xmlfilename), "w", encoding="utf-8"
)
@@ -46,7 +46,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
# Due to how generators work, it's expected this may be less
# TODO: get the page title and reuse the usual format "X title, y edits"
print(" %d more revisions exported" % numrevs)
xml = cleanXML(xml=xml)
xml = str(cleanXML(xml=xml))
xmlfile.write(str(xml))
except AttributeError as e:
print(e)
@@ -57,7 +57,8 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
else:
print(
'Retrieving the XML for every page from "%s"' % (start and start or "start")
'\nRetrieving the XML for every page from "%s"\n'
% (start and start or "start")
)
if start:
print(
@@ -84,9 +85,9 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
lock = False
if lock:
continue
delay(config=config, session=session)
Delay(config=config, session=session)
if c % 10 == 0:
print("Downloaded %d pages" % (c))
print(f"\n-> Downloaded {c} pages\n")
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)

@@ -26,7 +26,7 @@ def getXMLHeader(config={}, session=None):
+ "?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1",
timeout=10,
)
xml = r.text
xml = str(r.text)
# Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
if not re.match(r"\s*<mediawiki", xml):
r = requests.get(
@@ -35,7 +35,7 @@ def getXMLHeader(config={}, session=None):
timeout=10,
)
try:
xml = r.json()["query"]["export"]["*"]
xml = str(r.json()["query"]["export"]["*"])
except KeyError:
pass
if not re.match(r"\s*<mediawiki", xml):
@@ -46,7 +46,7 @@ def getXMLHeader(config={}, session=None):
+ randomtitle,
timeout=10,
)
xml = r.text
xml = str(r.text)
# Again try without exportnowrap
if not re.match(r"\s*<mediawiki", xml):
r = requests.get(
@@ -56,7 +56,7 @@ def getXMLHeader(config={}, session=None):
timeout=10,
)
try:
xml = r.json()["query"]["export"]["*"]
xml = str(r.json()["query"]["export"]["*"])
except KeyError:
pass
except requests.exceptions.RetryError:
@@ -74,7 +74,7 @@ def getXMLHeader(config={}, session=None):
)
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
xml = str(pme.xml)
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
@@ -107,10 +107,11 @@ def getXMLHeader(config={}, session=None):
]
)
except PageMissingError as pme:
xml = pme.xml
xml = str(pme.xml)
except ExportAbortedError:
pass
xml = str(xml)
header = xml.split("</mediawiki>")[0]
if not re.match(r"\s*<mediawiki", xml):
if config["xmlrevisions"]:
@@ -121,7 +122,8 @@ def getXMLHeader(config={}, session=None):
config["xmlrevisions"] = False
header, config = getXMLHeader(config=config, session=session)
else:
print(xml)
print("XML export on this wiki is broken, quitting.")
logerror("XML export on this wiki is broken, quitting.")
sys.exit()
return header, config
return str(header), config

@@ -141,7 +141,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# but we only need the inner <page>: we can live with
# duplication and non-ordering of page titles, but the
# repeated header is confusing and would not even be valid
xml = exportrequest["query"]["export"]["*"]
xml = str(exportrequest["query"]["export"]["*"])
yield makeXmlPageFromRaw(xml)
if "continue" in arvrequest:
@@ -210,10 +210,10 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
http_method=config["http_method"], **exportparams
)
xml = exportrequest["query"]["export"]["*"]
xml = str(exportrequest["query"]["export"]["*"])
c += 1
if c % 10 == 0:
print(f"Downloaded {c} pages")
print(f"\n-> Downloaded {c} pages\n")
# Because we got the fancy XML from the JSON format, clean it:
yield makeXmlPageFromRaw(xml)
else:
@@ -279,8 +279,8 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# Go through the data we got to build the XML.
for pageid in pages:
try:
xml = makeXmlFromPage(pages[pageid])
yield xml
xml = str(makeXmlFromPage(pages[pageid]))
yield str(xml)
except PageMissingError:
logerror(
config=config,
@@ -320,7 +320,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# Reset for the next batch.
titlelist = []
if c % 10 == 0:
print(f"Downloaded {c} pages")
print(f"\n-> Downloaded {c} pages\n")
except mwclient.errors.MwClientError as e:
print(e)

@@ -14,5 +14,16 @@ def truncateXMLDump(filename: str) -> None:
xml_line = frb.readline()
incomplete_segment_size = len(incomplete_segment.encode("utf-8"))
file_size = os.path.getsize(filename)
with open(filename, "r+", encoding="utf-8") as fh:
fh.truncate(file_size - incomplete_segment_size)
if file_size > incomplete_segment_size:
with open(filename, "r+", encoding="utf-8") as fh:
fh.truncate(file_size - incomplete_segment_size)
else:
print(
'len(incomplete_segment.encode("utf-8")) returned '
+ str(incomplete_segment_size)
+ ", while os.path.getsize(filename) returned "
+ str(file_size)
+ ", so fh.truncate() would be fh.truncate("
+ str(file_size - incomplete_segment_size)
+ "), which would be illegal. Something is seriously wrong here!"
)

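The new guard above exists because truncating a file to a negative size is invalid; when the measured incomplete segment is larger than the file itself, the function now reports the sizes instead of calling fh.truncate(). A minimal sketch of the same check against a throwaway file (the tempfile setup is illustrative, not part of the commit):

import os
import tempfile

# Stand-in for an XML dump whose trailing segment was measured as "incomplete".
with tempfile.NamedTemporaryFile("w", suffix=".xml", delete=False) as tmp:
    tmp.write("<page>partial")
    filename = tmp.name

file_size = os.path.getsize(filename)
incomplete_segment_size = file_size + 10  # pretend the tail is larger than the file

if file_size > incomplete_segment_size:
    with open(filename, "r+", encoding="utf-8") as fh:
        fh.truncate(file_size - incomplete_segment_size)
else:
    # Mirrors the new error path: report the sizes instead of truncating.
    print(f"incomplete segment is {incomplete_segment_size} bytes but the file is "
          f"only {file_size} bytes; refusing to truncate to a negative size")

os.remove(filename)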