Version 20230509.01. Support for new design.

1 year ago · b2654e9317
parent 7f4db17348
commit b2654e9317
2 changed files with 42 additions and 6 deletions
--- a/pipeline.py
+++ b/pipeline.py
@@ -61,7 +61,7 @@ if not WGET_AT:
 #
 # Update this each time you make a non-cosmetic change.
 # It will be added to the WARC files and reported to the tracker.
-VERSION = '20221021.01'
+VERSION = '20230509.01'
 TRACKER_ID = 'reddit'
 TRACKER_HOST = 'legacy-api.arpa.li'
 MULTI_ITEM_SIZE = 20
--- a/reddit.lua
+++ b/reddit.lua
@@ -115,6 +115,7 @@ allowed = function(url, parenturl)
    or string.match(url, "^https?://old%.reddit%.com/gold%?")
    or string.match(url, "^https?://[^/]+/over18.+dest=https%%3A%%2F%%2Fold%.reddit%.com")
    or string.match(url, "^https?://old%.[^%?]+%?utm_source=reddit")
    or string.match(url, "/%?context=1$")
    or (
      string.match(url, "^https?://gateway%.reddit%.com/")
      and not string.match(url, "/morecomments/")
@@ -148,6 +149,10 @@ allowed = function(url, parenturl)
    return false
  end
  if string.match(url, "^https?://www%.reddit%.com/svc/") then
    return true
  end
  local tested = {}
  for s in string.gmatch(url, "([^/]+)") do
    if tested[s] == nil then
@@ -259,10 +264,15 @@ end
 wget.callbacks.get_urls = function(file, url, is_css, iri)
  local urls = {}
  local html = nil
  local no_more_svc = false
  downloaded[url] = true
  local function check(urla)
    if no_more_svc
      and string.match(urla, "^https?://[^/]+/svc/") then
      return nil
    end
    local origurl = url
    local url = string.match(urla, "^([^#]+)")
    local url_ = string.match(url, "^(.-)%.?$")
@@ -334,7 +344,8 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
  end
  if string.match(url, "^https?://www%.reddit%.com/")
-    and not string.match(url, "/api/") then
+    and not string.match(url, "/api/")
    and not string.match(url, "^https?://[^/]+/svc/") then
    check(string.gsub(url, "^https?://www%.reddit%.com/", "https://old.reddit.com/"))
  end
@@ -357,9 +368,9 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
      and not string.match(url, "%.mpd")
    ) then
    html = read_file(file)
-    if string.match(url, "^https?://www%.reddit%.com/[^/]+/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]*/?$") then
+    --[[if string.match(url, "^https?://www%.reddit%.com/[^/]+/[^/]+/comments/[0-9a-z]+/[^/]+/[0-9a-z]*/?$") then
      check(url .. "?utm_source=reddit&utm_medium=web2x&context=3")
-    end
+    end]]
    if string.match(url, "^https?://old%.reddit%.com/api/morechildren$") then
      html = string.gsub(html, '\\"', '"')
    elseif string.match(url, "^https?://old%.reddit%.com/r/[^/]+/comments/")
@@ -385,7 +396,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
                              post_data=post_data})
        end
      end
-    elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]")
+    --[[elseif string.match(url, "^https?://www%.reddit%.com/r/[^/]+/comments/[^/]")
      or string.match(url, "^https?://www%.reddit%.com/user/[^/]+/comments/[^/]")
      or string.match(url, "^https?://www%.reddit%.com/comments/[^/]")
      or string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/t3_[^%?]") then
@@ -435,7 +446,7 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
            post_data=post_data
          })
        end
-      end
+      end]]
    end
    if string.match(url, "^https?://gateway%.reddit%.com/desktopapi/v1/morecomments/") then
      for s in string.gmatch(html, '"permalink"%s*:%s*"([^"]+)"') do
@@ -508,6 +519,19 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
        checknewshorturl(url)
      end
    end
    if string.match(url, "^https?://www%.reddit%.com/svc/") then
      for src_url, cursor in string.gmatch(html, '<faceplate%-partial[^>]+src="([^"]+)"[^>]*>%s*<input%s+type="hidden"%s+name="cursor"%s+value="([^"]+)"%s*/>') do
        src_url = string.gsub(src_url, "&amp;", "&")
        local requested_s = src_url .. cursor
        if not requested_children[requested_s] then
          print("posting with cursor", cursor)
          table.insert(urls, {url=
            urlparse.absolute(url, src_url),
            post_data="cursor=" .. cursor-- .. "&csrf_token=" .. csrf_token
          })
        end
      end
    end
    if string.match(url, "^https?://www%.reddit.com/api/info%.json%?id=t") then
      json = load_json_file(html)
      if not json or not json["data"] or not json["data"]["children"] then
@@ -537,8 +561,20 @@ wget.callbacks.get_urls = function(file, url, is_css, iri)
        if crosspost_parent and crosspost_parent ~= string.match(url, "(t[0-9]_[a-z0-9]+)") then
          is_crosspost = true
        end
        local id = child["data"]["id"]
        local subreddit = child["data"]["subreddit"]
        if child["kind"] == "t1" then
          check("https://www.reddit.com/svc/shreddit/comments/" .. subreddit .. "/" .. child["data"]["link_id"] .. "/t1_" .. id .. "?render-mode=partial&shredtop=")
        elseif child["kind"] == "t3" then
          check("https://www.reddit.com/svc/shreddit/comments/" .. subreddit .. "/t3_" .. id .. "?render-mode=partial")
        else
          io.stdout:write("Kind is not supported.\n")
          io.stdout:flush()
          abort_item()
        end
      end
    end
    no_more_svc = true
    for newurl in string.gmatch(string.gsub(html, "&quot;", '"'), '([^"%s]+)') do
      checknewurl(newurl)
    end