Version 20230611.01. Extra very simple check on validity of old.reddit.com returned body.

pull/17/head
arkiver 12 months ago
parent 15a0a1a6f5
commit a974b81618

@@ -59,7 +59,7 @@ if not WGET_AT:
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
-VERSION = '20230607.06'
+VERSION = '20230611.01'
TRACKER_ID = 'reddit'
TRACKER_HOST = 'legacy-api.arpa.li'
MULTI_ITEM_SIZE = 20

@@ -686,9 +686,10 @@ wget.callbacks.write_to_warc = function(url, http_stat)
return false
end
end
-if string.match(url["url"], "^https?://www%.reddit%.com/") then
+if string.match(url["url"], "^https?://www%.reddit%.com/")
+or string.match(url["url"], "^https?://old%.reddit%.com/") then
local html = read_file(http_stat["local_file"])
-if (
+if status_code == 200 and (
string.match(url["url"], "^https?://[^/]+/r/")
and (
not string.match(html, "<title>")
@@ -697,6 +698,9 @@ wget.callbacks.write_to_warc = function(url, http_stat)
) or (
string.match(url["url"], "^https?://[^/]+/svc/")
and not string.match(html, "</[^<>%s]+>%s*$")
+) or (
+string.match(url["url"], "^https?://old%.reddit%.com/api/morechildren$")
+and not JSON:decode(html)["success"]
) then
retry_url = true
return false

Loading…
Cancel
Save