diff --git a/pipeline.py b/pipeline.py index d2056e2..64f7558 100644 --- a/pipeline.py +++ b/pipeline.py @@ -59,7 +59,7 @@ if not WGET_AT: # # Update this each time you make a non-cosmetic change. # It will be added to the WARC files and reported to the tracker. -VERSION = '20230607.03' +VERSION = '20230607.04' TRACKER_ID = 'reddit' TRACKER_HOST = 'legacy-api.arpa.li' MULTI_ITEM_SIZE = 20 diff --git a/reddit.lua b/reddit.lua index b7561a3..0123cb4 100644 --- a/reddit.lua +++ b/reddit.lua @@ -271,6 +271,10 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) downloaded[url] = true + if abortgrab then + return {} + end + local function check(urla) if no_more_svc and string.match(urla, "^https?://[^/]+/svc/") then @@ -556,6 +560,11 @@ wget.callbacks.get_urls = function(file, url, is_css, iri) end selftext = child["data"]["selftext"] checknewurl(child["data"]["permalink"]) + -- temp + if child["data"]["is_video"] then + error() + end + -- if child["data"]["is_video"] and not child["data"]["secure_media"] then io.stdout:write("Video still being processed.\n") io.stdout:flush() @@ -658,6 +667,14 @@ wget.callbacks.write_to_warc = function(url, http_stat) retry_url = true return false end + if string.match(url["url"], "/api/info%.json") then + local html = read_file(http_stat["local_file"]) + if string.match(html, "v%.redd%.it") + or string.match(html, "reddit_video") then + abort_item() + return false + end + end if string.match(url["url"], "^https?://www%.reddit%.com/") then local html = read_file(http_stat["local_file"]) if (