From 9d1ea0c6888125db219b9007f8434e1bdb472dec Mon Sep 17 00:00:00 2001 From: Arkiver2 Date: Fri, 22 Feb 2019 01:15:18 +0100 Subject: [PATCH] rewrite --- .gitignore | 4 + JSON.lua | 1053 ++++++++++++++++++++++++++++++++++++++++++++ README.md | 48 +- cookies | 1 + get-wget-lua.sh | 0 ignore-list | 0 pipeline.py | 269 ++++++----- reddit.lua | 296 ++++++++----- warrior-install.sh | 17 + wget-lua-warrior | Bin 10 files changed, 1474 insertions(+), 214 deletions(-) create mode 100644 .gitignore create mode 100644 JSON.lua mode change 100644 => 100755 get-wget-lua.sh create mode 100644 ignore-list create mode 100755 warrior-install.sh mode change 100644 => 100755 wget-lua-warrior diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..44ebf6a --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*~ +*.pyc +data/ +wget-lua diff --git a/JSON.lua b/JSON.lua new file mode 100644 index 0000000..5f11425 --- /dev/null +++ b/JSON.lua @@ -0,0 +1,1053 @@ +-- -*- coding: utf-8 -*- +-- +-- Simple JSON encoding and decoding in pure Lua. +-- +-- Copyright 2010-2014 Jeffrey Friedl +-- http://regex.info/blog/ +-- +-- Latest version: http://regex.info/blog/lua/json +-- +-- This code is released under a Creative Commons CC-BY "Attribution" License: +-- http://creativecommons.org/licenses/by/3.0/deed.en_US +-- +-- It can be used for any purpose so long as the copyright notice above, +-- the web-page links above, and the 'AUTHOR_NOTE' string below are +-- maintained. Enjoy. +-- +local VERSION = 20141223.14 -- version history at end of file +local AUTHOR_NOTE = "-[ JSON.lua package by Jeffrey Friedl (http://regex.info/blog/lua/json) version 20141223.14 ]-" + +-- +-- The 'AUTHOR_NOTE' variable exists so that information about the source +-- of the package is maintained even in compiled versions. It's also +-- included in OBJDEF below mostly to quiet warnings about unused variables. +-- +local OBJDEF = { + VERSION = VERSION, + AUTHOR_NOTE = AUTHOR_NOTE, +} + + +-- +-- Simple JSON encoding and decoding in pure Lua. +-- http://www.json.org/ +-- +-- +-- JSON = assert(loadfile "JSON.lua")() -- one-time load of the routines +-- +-- local lua_value = JSON:decode(raw_json_text) +-- +-- local raw_json_text = JSON:encode(lua_table_or_value) +-- local pretty_json_text = JSON:encode_pretty(lua_table_or_value) -- "pretty printed" version for human readability +-- +-- +-- +-- DECODING (from a JSON string to a Lua table) +-- +-- +-- JSON = assert(loadfile "JSON.lua")() -- one-time load of the routines +-- +-- local lua_value = JSON:decode(raw_json_text) +-- +-- If the JSON text is for an object or an array, e.g. +-- { "what": "books", "count": 3 } +-- or +-- [ "Larry", "Curly", "Moe" ] +-- +-- the result is a Lua table, e.g. +-- { what = "books", count = 3 } +-- or +-- { "Larry", "Curly", "Moe" } +-- +-- +-- The encode and decode routines accept an optional second argument, +-- "etc", which is not used during encoding or decoding, but upon error +-- is passed along to error handlers. It can be of any type (including nil). +-- +-- +-- +-- ERROR HANDLING +-- +-- With most errors during decoding, this code calls +-- +-- JSON:onDecodeError(message, text, location, etc) +-- +-- with a message about the error, and if known, the JSON text being +-- parsed and the byte count where the problem was discovered. You can +-- replace the default JSON:onDecodeError() with your own function. 
+--
+-- The default onDecodeError() merely augments the message with data
+-- about the text and the location if known (and if a second 'etc'
+-- argument had been provided to decode(), its value is tacked onto the
+-- message as well), and then calls JSON.assert(), which itself defaults
+-- to Lua's built-in assert(), and can also be overridden.
+--
+-- For example, in an Adobe Lightroom plugin, you might use something like
+--
+--          function JSON:onDecodeError(message, text, location, etc)
+--             LrErrors.throwUserError("Internal Error: invalid JSON data")
+--          end
+--
+-- or even just
+--
+--          function JSON.assert(message)
+--             LrErrors.throwUserError("Internal Error: " .. message)
+--          end
+--
+-- If JSON:decode() is passed a nil, this is called instead:
+--
+--       JSON:onDecodeOfNilError(message, nil, nil, etc)
+--
+-- and if JSON:decode() is passed HTML instead of JSON, this is called:
+--
+--       JSON:onDecodeOfHTMLError(message, text, nil, etc)
+--
+-- The use of the fourth 'etc' argument allows stronger coordination
+-- between decoding and error reporting, especially when you provide your
+-- own error-handling routines. Continuing with the Adobe Lightroom
+-- plugin example:
+--
+--          function JSON:onDecodeError(message, text, location, etc)
+--             local note = "Internal Error: invalid JSON data"
+--             if type(etc) == 'table' and etc.photo then
+--                note = note .. " while processing for " .. etc.photo:getFormattedMetadata('fileName')
+--             end
+--             LrErrors.throwUserError(note)
+--          end
+--
+--          :
+--          :
+--
+--          for i, photo in ipairs(photosToProcess) do
+--               :
+--               :
+--               local data = JSON:decode(someJsonText, { photo = photo })
+--               :
+--               :
+--          end
+--
+--
+--
+-- DECODING AND STRICT TYPES
+--
+-- Because both JSON objects and JSON arrays are converted to Lua tables,
+-- it's not normally possible to tell which original JSON type a
+-- particular Lua table was derived from, or guarantee decode-encode
+-- round-trip equivalency.
+--
+-- However, if you enable strictTypes, e.g.
+--
+--       JSON = assert(loadfile "JSON.lua")() --load the routines
+--       JSON.strictTypes = true
+--
+-- then the Lua table resulting from the decoding of a JSON object or
+-- JSON array is marked via Lua metatable, so that when re-encoded with
+-- JSON:encode() it ends up as the appropriate JSON type.
+--
+-- (This is not the default because other routines may not work well with
+-- tables that have a metatable set, for example, Lightroom API calls.)
+--
+--
+-- ENCODING (from a lua table to a JSON string)
+--
+--    JSON = assert(loadfile "JSON.lua")() -- one-time load of the routines
+--
+--    local raw_json_text    = JSON:encode(lua_table_or_value)
+--    local pretty_json_text = JSON:encode_pretty(lua_table_or_value) -- "pretty printed" version for human readability
+--    local custom_pretty    = JSON:encode(lua_table_or_value, etc, { pretty = true, indent = "|  ", align_keys = false })
+--
+-- On error during encoding, this code calls:
+--
+--    JSON:onEncodeError(message, etc)
+--
+-- which you can override in your local JSON object.
+--
+-- The 'etc' in the error call is the second argument to encode()
+-- and encode_pretty(), or nil if it wasn't provided.
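+--
+-- As a concrete illustration of the strictTypes behavior described above
+-- (an added sketch, not from the original package docs): an empty JSON
+-- object only survives a decode-encode round trip when strictTypes is on.
+--
+--       JSON:encode(JSON:decode('{}'))   -- produces "[]": an empty table is guessed to be an array
+--
+--       JSON.strictTypes = true
+--       JSON:encode(JSON:decode('{}'))   -- produces "{}": the metatable set during decode is honored
+--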
+-- +-- +-- PRETTY-PRINTING +-- +-- An optional third argument, a table of options, allows a bit of +-- configuration about how the encoding takes place: +-- +-- pretty = JSON:encode(val, etc, { +-- pretty = true, -- if false, no other options matter +-- indent = " ", -- this provides for a three-space indent per nesting level +-- align_keys = false, -- see below +-- }) +-- +-- encode() and encode_pretty() are identical except that encode_pretty() +-- provides a default options table if none given in the call: +-- +-- { pretty = true, align_keys = false, indent = " " } +-- +-- For example, if +-- +-- JSON:encode(data) +-- +-- produces: +-- +-- {"city":"Kyoto","climate":{"avg_temp":16,"humidity":"high","snowfall":"minimal"},"country":"Japan","wards":11} +-- +-- then +-- +-- JSON:encode_pretty(data) +-- +-- produces: +-- +-- { +-- "city": "Kyoto", +-- "climate": { +-- "avg_temp": 16, +-- "humidity": "high", +-- "snowfall": "minimal" +-- }, +-- "country": "Japan", +-- "wards": 11 +-- } +-- +-- The following three lines return identical results: +-- JSON:encode_pretty(data) +-- JSON:encode_pretty(data, nil, { pretty = true, align_keys = false, indent = " " }) +-- JSON:encode (data, nil, { pretty = true, align_keys = false, indent = " " }) +-- +-- An example of setting your own indent string: +-- +-- JSON:encode_pretty(data, nil, { pretty = true, indent = "| " }) +-- +-- produces: +-- +-- { +-- | "city": "Kyoto", +-- | "climate": { +-- | | "avg_temp": 16, +-- | | "humidity": "high", +-- | | "snowfall": "minimal" +-- | }, +-- | "country": "Japan", +-- | "wards": 11 +-- } +-- +-- An example of setting align_keys to true: +-- +-- JSON:encode_pretty(data, nil, { pretty = true, indent = " ", align_keys = true }) +-- +-- produces: +-- +-- { +-- "city": "Kyoto", +-- "climate": { +-- "avg_temp": 16, +-- "humidity": "high", +-- "snowfall": "minimal" +-- }, +-- "country": "Japan", +-- "wards": 11 +-- } +-- +-- which I must admit is kinda ugly, sorry. This was the default for +-- encode_pretty() prior to version 20141223.14. +-- +-- +-- AMBIGUOUS SITUATIONS DURING THE ENCODING +-- +-- During the encode, if a Lua table being encoded contains both string +-- and numeric keys, it fits neither JSON's idea of an object, nor its +-- idea of an array. To get around this, when any string key exists (or +-- when non-positive numeric keys exist), numeric keys are converted to +-- strings. +-- +-- For example, +-- JSON:encode({ "one", "two", "three", SOMESTRING = "some string" })) +-- produces the JSON object +-- {"1":"one","2":"two","3":"three","SOMESTRING":"some string"} +-- +-- To prohibit this conversion and instead make it an error condition, set +-- JSON.noKeyConversion = true +-- + + + + +-- +-- SUMMARY OF METHODS YOU CAN OVERRIDE IN YOUR LOCAL LUA JSON OBJECT +-- +-- assert +-- onDecodeError +-- onDecodeOfNilError +-- onDecodeOfHTMLError +-- onEncodeError +-- +-- If you want to create a separate Lua JSON object with its own error handlers, +-- you can reload JSON.lua or use the :new() method. 
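+--
+-- For instance, a separate instance with its own behavior can be created
+-- like this (an illustrative sketch; 'StrictJSON' is just a local name):
+--
+--       local JSON = assert(loadfile "JSON.lua")()
+--       local StrictJSON = JSON:new()
+--       StrictJSON.strictTypes = true    -- affects only this instance
+--       function StrictJSON:onDecodeError(message, text, location, etc)
+--          error("bad JSON: " .. tostring(message))
+--       end
+--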
+-- +--------------------------------------------------------------------------- + +local default_pretty_indent = " " +local default_pretty_options = { pretty = true, align_keys = false, indent = default_pretty_indent } + +local isArray = { __tostring = function() return "JSON array" end } isArray.__index = isArray +local isObject = { __tostring = function() return "JSON object" end } isObject.__index = isObject + + +function OBJDEF:newArray(tbl) + return setmetatable(tbl or {}, isArray) +end + +function OBJDEF:newObject(tbl) + return setmetatable(tbl or {}, isObject) +end + +local function unicode_codepoint_as_utf8(codepoint) + -- + -- codepoint is a number + -- + if codepoint <= 127 then + return string.char(codepoint) + + elseif codepoint <= 2047 then + -- + -- 110yyyxx 10xxxxxx <-- useful notation from http://en.wikipedia.org/wiki/Utf8 + -- + local highpart = math.floor(codepoint / 0x40) + local lowpart = codepoint - (0x40 * highpart) + return string.char(0xC0 + highpart, + 0x80 + lowpart) + + elseif codepoint <= 65535 then + -- + -- 1110yyyy 10yyyyxx 10xxxxxx + -- + local highpart = math.floor(codepoint / 0x1000) + local remainder = codepoint - 0x1000 * highpart + local midpart = math.floor(remainder / 0x40) + local lowpart = remainder - 0x40 * midpart + + highpart = 0xE0 + highpart + midpart = 0x80 + midpart + lowpart = 0x80 + lowpart + + -- + -- Check for an invalid character (thanks Andy R. at Adobe). + -- See table 3.7, page 93, in http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#G28070 + -- + if ( highpart == 0xE0 and midpart < 0xA0 ) or + ( highpart == 0xED and midpart > 0x9F ) or + ( highpart == 0xF0 and midpart < 0x90 ) or + ( highpart == 0xF4 and midpart > 0x8F ) + then + return "?" + else + return string.char(highpart, + midpart, + lowpart) + end + + else + -- + -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx + -- + local highpart = math.floor(codepoint / 0x40000) + local remainder = codepoint - 0x40000 * highpart + local midA = math.floor(remainder / 0x1000) + remainder = remainder - 0x1000 * midA + local midB = math.floor(remainder / 0x40) + local lowpart = remainder - 0x40 * midB + + return string.char(0xF0 + highpart, + 0x80 + midA, + 0x80 + midB, + 0x80 + lowpart) + end +end + +function OBJDEF:onDecodeError(message, text, location, etc) + if text then + if location then + message = string.format("%s at char %d of: %s", message, location, text) + else + message = string.format("%s: %s", message, text) + end + end + + if etc ~= nil then + message = message .. " (" .. OBJDEF:encode(etc) .. ")" + end + + if self.assert then + self.assert(false, message) + else + assert(false, message) + end +end + +OBJDEF.onDecodeOfNilError = OBJDEF.onDecodeError +OBJDEF.onDecodeOfHTMLError = OBJDEF.onDecodeError + +function OBJDEF:onEncodeError(message, etc) + if etc ~= nil then + message = message .. " (" .. OBJDEF:encode(etc) .. 
")" + end + + if self.assert then + self.assert(false, message) + else + assert(false, message) + end +end + +local function grok_number(self, text, start, etc) + -- + -- Grab the integer part + -- + local integer_part = text:match('^-?[1-9]%d*', start) + or text:match("^-?0", start) + + if not integer_part then + self:onDecodeError("expected number", text, start, etc) + end + + local i = start + integer_part:len() + + -- + -- Grab an optional decimal part + -- + local decimal_part = text:match('^%.%d+', i) or "" + + i = i + decimal_part:len() + + -- + -- Grab an optional exponential part + -- + local exponent_part = text:match('^[eE][-+]?%d+', i) or "" + + i = i + exponent_part:len() + + local full_number_text = integer_part .. decimal_part .. exponent_part + local as_number = tonumber(full_number_text) + + if not as_number then + self:onDecodeError("bad number", text, start, etc) + end + + return as_number, i +end + + +local function grok_string(self, text, start, etc) + + if text:sub(start,start) ~= '"' then + self:onDecodeError("expected string's opening quote", text, start, etc) + end + + local i = start + 1 -- +1 to bypass the initial quote + local text_len = text:len() + local VALUE = "" + while i <= text_len do + local c = text:sub(i,i) + if c == '"' then + return VALUE, i + 1 + end + if c ~= '\\' then + VALUE = VALUE .. c + i = i + 1 + elseif text:match('^\\b', i) then + VALUE = VALUE .. "\b" + i = i + 2 + elseif text:match('^\\f', i) then + VALUE = VALUE .. "\f" + i = i + 2 + elseif text:match('^\\n', i) then + VALUE = VALUE .. "\n" + i = i + 2 + elseif text:match('^\\r', i) then + VALUE = VALUE .. "\r" + i = i + 2 + elseif text:match('^\\t', i) then + VALUE = VALUE .. "\t" + i = i + 2 + else + local hex = text:match('^\\u([0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF])', i) + if hex then + i = i + 6 -- bypass what we just read + + -- We have a Unicode codepoint. It could be standalone, or if in the proper range and + -- followed by another in a specific range, it'll be a two-code surrogate pair. + local codepoint = tonumber(hex, 16) + if codepoint >= 0xD800 and codepoint <= 0xDBFF then + -- it's a hi surrogate... see whether we have a following low + local lo_surrogate = text:match('^\\u([dD][cdefCDEF][0123456789aAbBcCdDeEfF][0123456789aAbBcCdDeEfF])', i) + if lo_surrogate then + i = i + 6 -- bypass the low surrogate we just read + codepoint = 0x2400 + (codepoint - 0xD800) * 0x400 + tonumber(lo_surrogate, 16) + else + -- not a proper low, so we'll just leave the first codepoint as is and spit it out. + end + end + VALUE = VALUE .. unicode_codepoint_as_utf8(codepoint) + + else + + -- just pass through what's escaped + VALUE = VALUE .. 
text:match('^\\(.)', i) + i = i + 2 + end + end + end + + self:onDecodeError("unclosed string", text, start, etc) +end + +local function skip_whitespace(text, start) + + local _, match_end = text:find("^[ \n\r\t]+", start) -- [http://www.ietf.org/rfc/rfc4627.txt] Section 2 + if match_end then + return match_end + 1 + else + return start + end +end + +local grok_one -- assigned later + +local function grok_object(self, text, start, etc) + if text:sub(start,start) ~= '{' then + self:onDecodeError("expected '{'", text, start, etc) + end + + local i = skip_whitespace(text, start + 1) -- +1 to skip the '{' + + local VALUE = self.strictTypes and self:newObject { } or { } + + if text:sub(i,i) == '}' then + return VALUE, i + 1 + end + local text_len = text:len() + while i <= text_len do + local key, new_i = grok_string(self, text, i, etc) + + i = skip_whitespace(text, new_i) + + if text:sub(i, i) ~= ':' then + self:onDecodeError("expected colon", text, i, etc) + end + + i = skip_whitespace(text, i + 1) + + local new_val, new_i = grok_one(self, text, i) + + VALUE[key] = new_val + + -- + -- Expect now either '}' to end things, or a ',' to allow us to continue. + -- + i = skip_whitespace(text, new_i) + + local c = text:sub(i,i) + + if c == '}' then + return VALUE, i + 1 + end + + if text:sub(i, i) ~= ',' then + self:onDecodeError("expected comma or '}'", text, i, etc) + end + + i = skip_whitespace(text, i + 1) + end + + self:onDecodeError("unclosed '{'", text, start, etc) +end + +local function grok_array(self, text, start, etc) + if text:sub(start,start) ~= '[' then + self:onDecodeError("expected '['", text, start, etc) + end + + local i = skip_whitespace(text, start + 1) -- +1 to skip the '[' + local VALUE = self.strictTypes and self:newArray { } or { } + if text:sub(i,i) == ']' then + return VALUE, i + 1 + end + + local VALUE_INDEX = 1 + + local text_len = text:len() + while i <= text_len do + local val, new_i = grok_one(self, text, i) + + -- can't table.insert(VALUE, val) here because it's a no-op if val is nil + VALUE[VALUE_INDEX] = val + VALUE_INDEX = VALUE_INDEX + 1 + + i = skip_whitespace(text, new_i) + + -- + -- Expect now either ']' to end things, or a ',' to allow us to continue. 
+      --
+      local c = text:sub(i,i)
+      if c == ']' then
+         return VALUE, i + 1
+      end
+      if text:sub(i, i) ~= ',' then
+         self:onDecodeError("expected comma or ']'", text, i, etc)
+      end
+      i = skip_whitespace(text, i + 1)
+   end
+   self:onDecodeError("unclosed '['", text, start, etc)
+end
+
+
+grok_one = function(self, text, start, etc)
+   -- Skip any whitespace
+   start = skip_whitespace(text, start)
+
+   if start > text:len() then
+      self:onDecodeError("unexpected end of string", text, nil, etc)
+   end
+
+   if text:find('^"', start) then
+      return grok_string(self, text, start, etc)
+
+   elseif text:find('^[-0123456789 ]', start) then
+      return grok_number(self, text, start, etc)
+
+   elseif text:find('^%{', start) then
+      return grok_object(self, text, start, etc)
+
+   elseif text:find('^%[', start) then
+      return grok_array(self, text, start, etc)
+
+   elseif text:find('^true', start) then
+      return true, start + 4
+
+   elseif text:find('^false', start) then
+      return false, start + 5
+
+   elseif text:find('^null', start) then
+      return nil, start + 4
+
+   else
+      self:onDecodeError("can't parse JSON", text, start, etc)
+   end
+end
+
+function OBJDEF:decode(text, etc)
+   if type(self) ~= 'table' or self.__index ~= OBJDEF then
+      OBJDEF:onDecodeError("JSON:decode must be called in method format", nil, nil, etc)
+   end
+
+   if text == nil then
+      self:onDecodeOfNilError(string.format("nil passed to JSON:decode()"), nil, nil, etc)
+   elseif type(text) ~= 'string' then
+      self:onDecodeError(string.format("expected string argument to JSON:decode(), got %s", type(text)), nil, nil, etc)
+   end
+
+   if text:match('^%s*$') then
+      return nil
+   end
+
+   if text:match('^%s*<') then
+      -- Can't be JSON... we'll assume it's HTML
+      self:onDecodeOfHTMLError(string.format("html passed to JSON:decode()"), text, nil, etc)
+   end
+
+   --
+   -- Ensure that it's not UTF-32 or UTF-16.
+   -- Those are perfectly valid encodings for JSON (as per RFC 4627 section 3),
+   -- but this package can't handle them.
+   --
+   if text:sub(1,1):byte() == 0 or (text:len() >= 2 and text:sub(2,2):byte() == 0) then
+      self:onDecodeError("JSON package groks only UTF-8, sorry", text, nil, etc)
+   end
+
+   local success, value = pcall(grok_one, self, text, 1, etc)
+
+   if success then
+      return value
+   else
+      -- if JSON:onDecodeError() didn't abort out of the pcall, we'll have received the error message here as "value", so pass it along as an assert.
+      if self.assert then
+         self.assert(false, value)
+      else
+         assert(false, value)
+      end
+      -- and if we're still here, return a nil and throw the error message on as a second arg
+      return nil, value
+   end
+end
+
+local function backslash_replacement_function(c)
+   if c == "\n" then
+      return "\\n"
+   elseif c == "\r" then
+      return "\\r"
+   elseif c == "\t" then
+      return "\\t"
+   elseif c == "\b" then
+      return "\\b"
+   elseif c == "\f" then
+      return "\\f"
+   elseif c == '"' then
+      return '\\"'
+   elseif c == '\\' then
+      return '\\\\'
+   else
+      return string.format("\\u%04x", c:byte())
+   end
+end
+
+local chars_to_be_escaped_in_JSON_string
+   = '['
+   ..    '"'    -- class sub-pattern to match a double quote
+   ..    '%\\'  -- class sub-pattern to match a backslash
+   ..    '%z'   -- class sub-pattern to match a null
+   ..    '\001' .. '-' .. '\031' -- class sub-pattern to match control characters
+   .. ']'
+
+local function json_string_literal(value)
+   local newval = value:gsub(chars_to_be_escaped_in_JSON_string, backslash_replacement_function)
+   return '"' .. newval .. '"'
+end
+
+local function object_or_array(self, T, etc)
+   --
+   -- We need to inspect all the keys...
if there are any strings, we'll convert to a JSON + -- object. If there are only numbers, it's a JSON array. + -- + -- If we'll be converting to a JSON object, we'll want to sort the keys so that the + -- end result is deterministic. + -- + local string_keys = { } + local number_keys = { } + local number_keys_must_be_strings = false + local maximum_number_key + + for key in pairs(T) do + if type(key) == 'string' then + table.insert(string_keys, key) + elseif type(key) == 'number' then + table.insert(number_keys, key) + if key <= 0 or key >= math.huge then + number_keys_must_be_strings = true + elseif not maximum_number_key or key > maximum_number_key then + maximum_number_key = key + end + else + self:onEncodeError("can't encode table with a key of type " .. type(key), etc) + end + end + + if #string_keys == 0 and not number_keys_must_be_strings then + -- + -- An empty table, or a numeric-only array + -- + if #number_keys > 0 then + return nil, maximum_number_key -- an array + elseif tostring(T) == "JSON array" then + return nil + elseif tostring(T) == "JSON object" then + return { } + else + -- have to guess, so we'll pick array, since empty arrays are likely more common than empty objects + return nil + end + end + + table.sort(string_keys) + + local map + if #number_keys > 0 then + -- + -- If we're here then we have either mixed string/number keys, or numbers inappropriate for a JSON array + -- It's not ideal, but we'll turn the numbers into strings so that we can at least create a JSON object. + -- + + if self.noKeyConversion then + self:onEncodeError("a table with both numeric and string keys could be an object or array; aborting", etc) + end + + -- + -- Have to make a shallow copy of the source table so we can remap the numeric keys to be strings + -- + map = { } + for key, val in pairs(T) do + map[key] = val + end + + table.sort(number_keys) + + -- + -- Throw numeric keys in there as strings + -- + for _, number_key in ipairs(number_keys) do + local string_key = tostring(number_key) + if map[string_key] == nil then + table.insert(string_keys , string_key) + map[string_key] = T[number_key] + else + self:onEncodeError("conflict converting table with mixed-type keys into a JSON object: key " .. number_key .. " exists both as a string and a number.", etc) + end + end + end + + return string_keys, nil, map +end + +-- +-- Encode +-- +-- 'options' is nil, or a table with possible keys: +-- pretty -- if true, return a pretty-printed version +-- indent -- a string (usually of spaces) used to indent each nested level +-- align_keys -- if true, align all the keys when formatting a table +-- +local encode_value -- must predeclare because it calls itself +function encode_value(self, value, parents, etc, options, indent) + + if value == nil then + return 'null' + + elseif type(value) == 'string' then + return json_string_literal(value) + + elseif type(value) == 'number' then + if value ~= value then + -- + -- NaN (Not a Number). + -- JSON has no NaN, so we have to fudge the best we can. This should really be a package option. + -- + return "null" + elseif value >= math.huge then + -- + -- Positive infinity. JSON has no INF, so we have to fudge the best we can. This should + -- really be a package option. Note: at least with some implementations, positive infinity + -- is both ">= math.huge" and "<= -math.huge", which makes no sense but that's how it is. + -- Negative infinity is properly "<= -math.huge". So, we must be sure to check the ">=" + -- case first. 
+ -- + return "1e+9999" + elseif value <= -math.huge then + -- + -- Negative infinity. + -- JSON has no INF, so we have to fudge the best we can. This should really be a package option. + -- + return "-1e+9999" + else + return tostring(value) + end + + elseif type(value) == 'boolean' then + return tostring(value) + + elseif type(value) ~= 'table' then + self:onEncodeError("can't convert " .. type(value) .. " to JSON", etc) + + else + -- + -- A table to be converted to either a JSON object or array. + -- + local T = value + + if type(options) ~= 'table' then + options = {} + end + if type(indent) ~= 'string' then + indent = "" + end + + if parents[T] then + self:onEncodeError("table " .. tostring(T) .. " is a child of itself", etc) + else + parents[T] = true + end + + local result_value + + local object_keys, maximum_number_key, map = object_or_array(self, T, etc) + if maximum_number_key then + -- + -- An array... + -- + local ITEMS = { } + for i = 1, maximum_number_key do + table.insert(ITEMS, encode_value(self, T[i], parents, etc, options, indent)) + end + + if options.pretty then + result_value = "[ " .. table.concat(ITEMS, ", ") .. " ]" + else + result_value = "[" .. table.concat(ITEMS, ",") .. "]" + end + + elseif object_keys then + -- + -- An object + -- + local TT = map or T + + if options.pretty then + + local KEYS = { } + local max_key_length = 0 + for _, key in ipairs(object_keys) do + local encoded = encode_value(self, tostring(key), parents, etc, options, indent) + if options.align_keys then + max_key_length = math.max(max_key_length, #encoded) + end + table.insert(KEYS, encoded) + end + local key_indent = indent .. tostring(options.indent or "") + local subtable_indent = key_indent .. string.rep(" ", max_key_length) .. (options.align_keys and " " or "") + local FORMAT = "%s%" .. string.format("%d", max_key_length) .. "s: %s" + + local COMBINED_PARTS = { } + for i, key in ipairs(object_keys) do + local encoded_val = encode_value(self, TT[key], parents, etc, options, subtable_indent) + table.insert(COMBINED_PARTS, string.format(FORMAT, key_indent, KEYS[i], encoded_val)) + end + result_value = "{\n" .. table.concat(COMBINED_PARTS, ",\n") .. "\n" .. indent .. "}" + + else + + local PARTS = { } + for _, key in ipairs(object_keys) do + local encoded_val = encode_value(self, TT[key], parents, etc, options, indent) + local encoded_key = encode_value(self, tostring(key), parents, etc, options, indent) + table.insert(PARTS, string.format("%s:%s", encoded_key, encoded_val)) + end + result_value = "{" .. table.concat(PARTS, ",") .. "}" + + end + else + -- + -- An empty array/object... 
we'll treat it as an array, though it should really be an option + -- + result_value = "[]" + end + + parents[T] = false + return result_value + end +end + + +function OBJDEF:encode(value, etc, options) + if type(self) ~= 'table' or self.__index ~= OBJDEF then + OBJDEF:onEncodeError("JSON:encode must be called in method format", etc) + end + return encode_value(self, value, {}, etc, options or nil) +end + +function OBJDEF:encode_pretty(value, etc, options) + if type(self) ~= 'table' or self.__index ~= OBJDEF then + OBJDEF:onEncodeError("JSON:encode_pretty must be called in method format", etc) + end + return encode_value(self, value, {}, etc, options or default_pretty_options) +end + +function OBJDEF.__tostring() + return "JSON encode/decode package" +end + +OBJDEF.__index = OBJDEF + +function OBJDEF:new(args) + local new = { } + + if args then + for key, val in pairs(args) do + new[key] = val + end + end + + return setmetatable(new, OBJDEF) +end + +return OBJDEF:new() + +-- +-- Version history: +-- +-- 20141223.14 The encode_pretty() routine produced fine results for small datasets, but isn't really +-- appropriate for anything large, so with help from Alex Aulbach I've made the encode routines +-- more flexible, and changed the default encode_pretty() to be more generally useful. +-- +-- Added a third 'options' argument to the encode() and encode_pretty() routines, to control +-- how the encoding takes place. +-- +-- Updated docs to add assert() call to the loadfile() line, just as good practice so that +-- if there is a problem loading JSON.lua, the appropriate error message will percolate up. +-- +-- 20140920.13 Put back (in a way that doesn't cause warnings about unused variables) the author string, +-- so that the source of the package, and its version number, are visible in compiled copies. +-- +-- 20140911.12 Minor lua cleanup. +-- Fixed internal reference to 'JSON.noKeyConversion' to reference 'self' instead of 'JSON'. +-- (Thanks to SmugMug's David Parry for these.) +-- +-- 20140418.11 JSON nulls embedded within an array were being ignored, such that +-- ["1",null,null,null,null,null,"seven"], +-- would return +-- {1,"seven"} +-- It's now fixed to properly return +-- {1, nil, nil, nil, nil, nil, "seven"} +-- Thanks to "haddock" for catching the error. +-- +-- 20140116.10 The user's JSON.assert() wasn't always being used. Thanks to "blue" for the heads up. +-- +-- 20131118.9 Update for Lua 5.3... it seems that tostring(2/1) produces "2.0" instead of "2", +-- and this caused some problems. +-- +-- 20131031.8 Unified the code for encode() and encode_pretty(); they had been stupidly separate, +-- and had of course diverged (encode_pretty didn't get the fixes that encode got, so +-- sometimes produced incorrect results; thanks to Mattie for the heads up). +-- +-- Handle encoding tables with non-positive numeric keys (unlikely, but possible). +-- +-- If a table has both numeric and string keys, or its numeric keys are inappropriate +-- (such as being non-positive or infinite), the numeric keys are turned into +-- string keys appropriate for a JSON object. 
So, as before, +-- JSON:encode({ "one", "two", "three" }) +-- produces the array +-- ["one","two","three"] +-- but now something with mixed key types like +-- JSON:encode({ "one", "two", "three", SOMESTRING = "some string" })) +-- instead of throwing an error produces an object: +-- {"1":"one","2":"two","3":"three","SOMESTRING":"some string"} +-- +-- To maintain the prior throw-an-error semantics, set +-- JSON.noKeyConversion = true +-- +-- 20131004.7 Release under a Creative Commons CC-BY license, which I should have done from day one, sorry. +-- +-- 20130120.6 Comment update: added a link to the specific page on my blog where this code can +-- be found, so that folks who come across the code outside of my blog can find updates +-- more easily. +-- +-- 20111207.5 Added support for the 'etc' arguments, for better error reporting. +-- +-- 20110731.4 More feedback from David Kolf on how to make the tests for Nan/Infinity system independent. +-- +-- 20110730.3 Incorporated feedback from David Kolf at http://lua-users.org/wiki/JsonModules: +-- +-- * When encoding lua for JSON, Sparse numeric arrays are now handled by +-- spitting out full arrays, such that +-- JSON:encode({"one", "two", [10] = "ten"}) +-- returns +-- ["one","two",null,null,null,null,null,null,null,"ten"] +-- +-- In 20100810.2 and earlier, only up to the first non-null value would have been retained. +-- +-- * When encoding lua for JSON, numeric value NaN gets spit out as null, and infinity as "1+e9999". +-- Version 20100810.2 and earlier created invalid JSON in both cases. +-- +-- * Unicode surrogate pairs are now detected when decoding JSON. +-- +-- 20100810.2 added some checking to ensure that an invalid Unicode character couldn't leak in to the UTF-8 encoding +-- +-- 20100731.1 initial public release +-- diff --git a/README.md b/README.md index 992c0a7..d3f2bf1 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Running without a warrior ------------------------- To run this outside the warrior, clone this repository, cd into its directory and run: - pip install seesaw + pip install --upgrade seesaw ./get-wget-lua.sh then start downloading with: @@ -32,9 +32,9 @@ For more options, run: run-pipeline --help -If you don't have root access and/or your version of pip is very old, you can replace "pip install seesaw" with: +If you don't have root access and/or your version of pip is very old, you can replace "pip install --upgrade seesaw" with: - wget https://raw.github.com/pypa/pip/master/contrib/get-pip.py ; python get-pip.py --user ; ~/.local/bin/pip install --user seesaw + wget https://raw.github.com/pypa/pip/master/contrib/get-pip.py ; python get-pip.py --user ; ~/.local/bin/pip install --upgrade --user seesaw so that pip and seesaw are installed in your home, then run @@ -56,26 +56,32 @@ Distribution-specific setup ### For Debian/Ubuntu: adduser --system --group --shell /bin/bash archiveteam - apt-get install -y git-core libgnutls-dev lua5.1 liblua5.1-0 liblua5.1-0-dev screen python-dev python-pip bzip2 zlib1g-dev - pip install seesaw + apt-get update && apt-get install -y git-core libgnutls-dev lua5.1 liblua5.1-0 liblua5.1-0-dev screen python-dev python-pip bzip2 zlib1g-dev flex autoconf + pip install --upgrade seesaw su -c "cd /home/archiveteam; git clone https://github.com/ArchiveTeam/reddit-grab.git; cd reddit-grab; ./get-wget-lua.sh" archiveteam screen su -c "cd /home/archiveteam/reddit-grab/; run-pipeline pipeline.py --concurrent 2 --address '127.0.0.1' YOURNICKHERE" archiveteam [... ctrl+A D to detach ...] 
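+
+You can re-attach the detached screen session later by running `screen -r` as the archiveteam user.
+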
+In __Debian Jessie__, the `libgnutls-dev` package was renamed to `libgnutls28-dev`. So, you need to do the following instead:
+
+    adduser --system --group --shell /bin/bash archiveteam
+    apt-get update && apt-get install -y git-core libgnutls28-dev lua5.1 liblua5.1-0 liblua5.1-0-dev screen python-dev python-pip bzip2 zlib1g-dev flex autoconf
+    [... pretty much the same as above ...]
+
 Wget-lua is also available on [ArchiveTeam's PPA](https://launchpad.net/~archiveteam/+archive/wget-lua) for Ubuntu.
 
 ### For CentOS:
 
-Ensure that you have the CentOS equivalent of bzip2 installed as well. You might need the EPEL repository to be enabled.
+Ensure that you have the CentOS equivalent of bzip2 installed as well. You will need the EPEL repository to be enabled.
 
-    yum -y install gnutls-devel lua-devel python-pip zlib-devel
-    pip install seesaw
+    yum -y install autoconf automake flex gnutls-devel lua-devel python-pip zlib-devel
+    pip install --upgrade seesaw
     [... pretty much the same as above ...]
 
 ### For openSUSE:
 
     zypper install liblua5_1 lua51 lua51-devel screen python-pip libgnutls-devel bzip2 python-devel gcc make
-    pip install seesaw
+    pip install --upgrade seesaw
     [... pretty much the same as above ...]
 
 ### For OS X:
@@ -83,7 +89,7 @@ Ensure that you have the CentOS equivalent of bzip2 installed as well. You might
 
 You need Homebrew. Ensure that you have the OS X equivalent of bzip2 installed as well.
 
     brew install python lua gnutls
-    pip install seesaw
+    pip install --upgrade seesaw
     [... pretty much the same as above ...]
 
 **There is a known issue with some packaged versions of rsync. If you get errors during the upload stage, reddit-grab will not work with your rsync version.**
 
 This supposedly fixes it:
 
@@ -97,12 +103,21 @@ This supposedly fixes it:
 Ensure that you have the Arch equivalent of bzip2 installed as well.
 
 1. Make sure you have `python2-pip` installed.
-2. Install [https://aur.archlinux.org/packages/wget-lua/](the wget-lua package from the AUR).
-3. Run `pip2 install seesaw`.
+2. Install [the wget-lua package from the AUR](https://aur.archlinux.org/packages/wget-lua/).
+3. Run `pip2 install --upgrade seesaw`.
 4. Modify the run-pipeline script in seesaw to point at `#!/usr/bin/python2` instead of `#!/usr/bin/python`.
 5. `useradd --system --group users --shell /bin/bash --create-home archiveteam`
 6. `screen su -c "cd /home/archiveteam/reddit-grab/; run-pipeline pipeline.py --concurrent 2 --address '127.0.0.1' YOURNICKHERE" archiveteam`
 
+### For Alpine Linux:
+
+    apk add lua5.1 git python bzip2 bash rsync gcc libc-dev lua5.1-dev zlib-dev gnutls-dev autoconf flex make
+    python -m ensurepip
+    pip install -U seesaw
+    git clone https://github.com/ArchiveTeam/reddit-grab
+    cd reddit-grab; ./get-wget-lua.sh
+    run-pipeline pipeline.py --concurrent 2 --address '127.0.0.1' YOURNICKHERE
+
 ### For FreeBSD:
 
 Honestly, I have no idea. `./get-wget-lua.sh` supposedly doesn't work due to differences in the `tar` that ships with FreeBSD. Another problem is the apparent absence of Lua 5.1 development headers. If you figure this out, please do let us know on IRC (irc.efnet.org #archiveteam).
 
@@ -134,6 +149,12 @@ If you're sure that you followed the steps to install `seesaw`, permissions on y
 
     chmod o+rX -R /usr/local/lib/python2.7/dist-packages
 
+### run-pipeline: command not found
+
+Install `seesaw` using `pip2` instead of `pip`.
+
+    pip2 install seesaw
+
 ### Issues in the code
 
 If you notice a bug and want to file a bug report, please use the GitHub issues tracker.
 
@@ -142,4 +163,5 @@ Are you a developer? Help write code for us!
Look at our [developer documentatio
 
 ### Other problems
 
-Have an issue not listed here? Join us on IRC and ask! We can be found at irc.efnet.org #deaddit.
+Have an issue not listed here? Join us on IRC and ask! We can be found at irc.efnet.org #shreddit.
+
diff --git a/cookies b/cookies
index 8c506c1..3cabb29 100644
--- a/cookies
+++ b/cookies
@@ -1 +1,2 @@
 .reddit.com	TRUE	/	FALSE	0	over18	1
+
diff --git a/get-wget-lua.sh b/get-wget-lua.sh
old mode 100644
new mode 100755
diff --git a/ignore-list b/ignore-list
new file mode 100644
index 0000000..e69de29
diff --git a/pipeline.py b/pipeline.py
index d799ecb..55c8f3c 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -5,6 +5,7 @@ import hashlib
 import os.path
 import random
 from seesaw.config import realize, NumberConfigValue
+from seesaw.externalprocess import ExternalProcess
 from seesaw.item import ItemInterpolation, ItemValue
 from seesaw.task import SimpleTask, LimitConcurrent
 from seesaw.tracker import GetItemFromTracker, PrepareStatsForTracker, \
@@ -15,8 +16,15 @@ import subprocess
 import sys
 import time
 import string
-import requests
 import re
+
+try:
+    import warcio
+    from warcio.archiveiterator import ArchiveIterator
+    from warcio.warcwriter import WARCWriter
+except ImportError:
+    raise Exception("Please install warcio with 'sudo pip install warcio --upgrade'.")
 
 import seesaw
 from seesaw.externalprocess import WgetDownload
@@ -24,10 +32,12 @@ from seesaw.pipeline import Pipeline
 from seesaw.project import Project
 from seesaw.util import find_executable
 
+from tornado import httpclient
+
 # check the seesaw version
-if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
-    raise Exception("This pipeline needs seesaw version 0.8.5 or higher.")
+if StrictVersion(seesaw.__version__) < StrictVersion('0.8.5'):
+    raise Exception('This pipeline needs seesaw version 0.8.5 or higher.')
 
 
 ###########################################################################
@@ -37,21 +47,21 @@ if StrictVersion(seesaw.__version__) < StrictVersion("0.8.5"):
 # 1. does not crash with --version, and
 # 2. prints the required version string
 WGET_LUA = find_executable(
-    "Wget+Lua",
-    ["GNU Wget 1.14.lua.20130523-9a5c"],
+    'Wget+Lua',
+    ['GNU Wget 1.14.lua.20130523-9a5c', 'GNU Wget 1.14.lua.20160530-955376b'],
     [
-        "./wget-lua",
-        "./wget-lua-warrior",
-        "./wget-lua-local",
-        "../wget-lua",
-        "../../wget-lua",
-        "/home/warrior/wget-lua",
-        "/usr/bin/wget-lua"
+        './wget-lua',
+        './wget-lua-warrior',
+        './wget-lua-local',
+        '../wget-lua',
+        '../../wget-lua',
+        '/home/warrior/wget-lua',
+        '/usr/bin/wget-lua'
     ]
 )
 
 if not WGET_LUA:
-    raise Exception("No usable Wget+Lua found.")
+    raise Exception('No usable Wget+Lua found.')
 
 
 ###########################################################################
@@ -59,7 +69,7 @@ if not WGET_LUA:
 #
 # Update this each time you make a non-cosmetic change.
 # It will be added to the WARC files and reported to the tracker.
-VERSION = "20150620.02"
+VERSION = '20190222.01'
 USER_AGENT = 'ArchiveTeam'
 TRACKER_ID = 'reddit'
 TRACKER_HOST = 'tracker.archiveteam.org'
@@ -73,7 +83,7 @@ TRACKER_HOST = 'tracker.archiveteam.org'
 # each item.
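+# The tasks below follow seesaw's SimpleTask pattern: subclass SimpleTask,
+# pass a task name to its constructor, and override process(item), which
+# the pipeline calls once per item. A minimal sketch of the pattern
+# (illustrative only, not part of this pipeline):
+#
+#     class PrintItemName(SimpleTask):
+#         def __init__(self):
+#             SimpleTask.__init__(self, 'PrintItemName')
+#
+#         def process(self, item):
+#             print(item['item_name'])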
class CheckIP(SimpleTask):
     def __init__(self):
-        SimpleTask.__init__(self, "CheckIP")
+        SimpleTask.__init__(self, 'CheckIP')
         self._counter = 0
 
     def process(self, item):
@@ -106,39 +116,98 @@ class CheckIP(SimpleTask):
 
 class PrepareDirectories(SimpleTask):
     def __init__(self, warc_prefix):
-        SimpleTask.__init__(self, "PrepareDirectories")
+        SimpleTask.__init__(self, 'PrepareDirectories')
         self.warc_prefix = warc_prefix
 
     def process(self, item):
-        item_name = item["item_name"]
+        item_name = item['item_name']
         escaped_item_name = item_name.replace(':', '_').replace('/', '_').replace('~', '_')
-        dirname = "/".join((item["data_dir"], escaped_item_name))
+        item_hash = hashlib.sha1(item_name.encode('utf-8')).hexdigest()
+        dirname = '/'.join((item['data_dir'], item_hash))
 
         if os.path.isdir(dirname):
             shutil.rmtree(dirname)
 
         os.makedirs(dirname)
 
-        item["item_dir"] = dirname
-        item["warc_file_base"] = "%s-%s-%s" % (self.warc_prefix, escaped_item_name,
-            time.strftime("%Y%m%d-%H%M%S"))
+        item['item_dir'] = dirname
+        item['warc_file_base'] = '%s-%s-%s' % (self.warc_prefix, item_hash,
+            time.strftime('%Y%m%d-%H%M%S'))
 
-        open("%(item_dir)s/%(warc_file_base)s.warc.gz" % item, "w").close()
+        open('%(item_dir)s/%(warc_file_base)s.warc.gz' % item, 'w').close()
+        open('%(item_dir)s/%(warc_file_base)s_data.txt' % item, 'w').close()
+
+
+class Deduplicate(SimpleTask):
+    # Rewrites the WARC produced by wget: any 'response' record whose
+    # payload digest was already seen in this file is replaced by a much
+    # smaller 'revisit' record pointing back at the first copy.
+    def __init__(self):
+        SimpleTask.__init__(self, 'Deduplicate')
+
+    def process(self, item):
+        digests = {}
+        input_filename = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
+        output_filename = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item
+        with open(input_filename, 'rb') as f_in, \
+                open(output_filename, 'wb') as f_out:
+            writer = WARCWriter(filebuf=f_out, gzip=True)
+            for record in ArchiveIterator(f_in):
+                url = record.rec_headers.get_header('WARC-Target-URI')
+                if url is not None and url.startswith('<'):
+                    # Strip the <...> that wget wraps around some URIs.
+                    url = re.search('^<(.+)>$', url).group(1)
+                    record.rec_headers.replace_header('WARC-Target-URI', url)
+                if record.rec_headers.get_header('WARC-Type') == 'response':
+                    digest = record.rec_headers.get_header('WARC-Payload-Digest')
+                    if digest in digests:
+                        writer.write_record(
+                            self._record_response_to_revisit(writer, record,
+                                digests[digest])
+                        )
+                    else:
+                        # First occurrence of this payload; remember where.
+                        digests[digest] = (
+                            record.rec_headers.get_header('WARC-Record-ID'),
+                            record.rec_headers.get_header('WARC-Date'),
+                            record.rec_headers.get_header('WARC-Target-URI')
+                        )
+                        writer.write_record(record)
+                elif record.rec_headers.get_header('WARC-Type') == 'warcinfo':
+                    record.rec_headers.replace_header('WARC-Filename', output_filename)
+                    writer.write_record(record)
+                else:
+                    writer.write_record(record)
+
+    def _record_response_to_revisit(self, writer, record, duplicate):
+        # Turn a duplicate 'response' record into a 'revisit' record that
+        # refers back to the (record id, date, URI) of the first occurrence.
+        warc_headers = record.rec_headers
+        warc_headers.replace_header('WARC-Refers-To', duplicate[0])
+        warc_headers.replace_header('WARC-Refers-To-Date', duplicate[1])
+        warc_headers.replace_header('WARC-Refers-To-Target-URI', duplicate[2])
+        warc_headers.replace_header('WARC-Type', 'revisit')
+        warc_headers.replace_header('WARC-Truncated', 'length')
+        warc_headers.replace_header('WARC-Profile',
+                                    'http://netpreserve.org/warc/1.0/' \
+                                    'revisit/identical-payload-digest')
+        warc_headers.remove_header('WARC-Block-Digest')
+        warc_headers.remove_header('Content-Length')
+        return writer.create_warc_record(
+            record.rec_headers.get_header('WARC-Target-URI'),
+            'revisit',
+            warc_headers=warc_headers,
+            http_headers=record.http_headers
+        )
 
 
 class MoveFiles(SimpleTask):
     def __init__(self):
-        SimpleTask.__init__(self,
"MoveFiles") + SimpleTask.__init__(self, 'MoveFiles') def process(self, item): - # NEW for 2014! Check if wget was compiled with zlib support - if os.path.exists("%(item_dir)s/%(warc_file_base)s.warc" % item): + if os.path.exists('%(item_dir)s/%(warc_file_base)s.warc' % item): raise Exception('Please compile wget with zlib support!') - os.rename("%(item_dir)s/%(warc_file_base)s.warc.gz" % item, - "%(data_dir)s/%(warc_file_base)s.warc.gz" % item) + os.rename('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item, + '%(data_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item) + os.rename('%(item_dir)s/%(warc_file_base)s_data.txt' % item, + '%(data_dir)s/%(warc_file_base)s_data.txt' % item) - shutil.rmtree("%(item_dir)s" % item) + shutil.rmtree('%(item_dir)s' % item) def get_hash(filename): @@ -163,62 +232,54 @@ def stats_id_function(item): class WgetArgs(object): + post_chars = string.digits + string.ascii_lowercase + + def int_to_str(self, i): + d, m = divmod(i, 36) + if d > 0: + return self.int_to_str(d) + self.post_chars[m] + return self.post_chars[m] + def realize(self, item): wget_args = [ WGET_LUA, - "-U", USER_AGENT, - "-nv", - "--lua-script", "reddit.lua", - "--load-cookies", "cookies", - "-o", ItemInterpolation("%(item_dir)s/wget.log"), - "--no-check-certificate", - "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"), - "--truncate-output", - "-e", "robots=off", - "--rotate-dns", - "--recursive", "--level=inf", - "--no-parent", - "--page-requisites", - "--timeout", "30", - "--tries", "inf", - "--domains", "reddit.com,redditmedia.com", - "--span-hosts", - "--waitretry", "30", - "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"), - "--warc-header", "operator: Archive Team", - "--warc-header", "reddit-dld-script-version: " + VERSION, - "--warc-header", ItemInterpolation("reddit-user: %(item_name)s"), + '-U', USER_AGENT, + '-nv', + '--lua-script', 'reddit.lua', + '--load-cookies', 'cookies', + '-o', ItemInterpolation('%(item_dir)s/wget.log'), + '--no-check-certificate', + '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), + '--truncate-output', + '-e', 'robots=off', + '--rotate-dns', + '--recursive', '--level=inf', + '--no-parent', + '--page-requisites', + '--timeout', '30', + '--tries', 'inf', + '--domains', 'reddit.com', + '--span-hosts', + '--waitretry', '30', + '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), + '--warc-header', 'operator: Archive Team', + '--warc-header', 'reddit-dld-script-version: ' + VERSION, + '--warc-header', ItemInterpolation('reddit-item: %(item_name)s') ] item_name = item['item_name'] - assert ':' in item_name item_type, item_value = item_name.split(':', 1) item['item_type'] = item_type item['item_value'] = item_value - - assert item_type in ('36comments') - - if item_type == '36comments': - suffixes = string.digits + string.ascii_lowercase - for url in ['http://redd.it/{0}{1}'.format(item_value, a) for a in suffixes]: - wget_args.append(url) -# for suffix in suffixes: -# commenturl = 'https://www.reddit.com/comments/{0}{1}/'.format(item_value, suffix) -# html = requests.get(commenturl, headers={'User-Agent': 'ArchiveTeam'}) -# print('Downloaded', html.status_code, getattr(html, 'reason')) -# sys.stdout.flush() -# if html.status_code == 200: -# if not html.text: -# raise Exception('Something went wrong during the download. 
({0})'.format(html.status_code))
-#            else:
-#                for origurl in re.findall(r'href="(https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/{0}{1}\/[^"]+)"'.format(item_value, suffix), html.text):
-#                    if (re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/', origurl) or re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/', origurl)) and not re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/.', origurl):
-#                        wget_args.append(origurl)
-#            elif html.status_code == 404:
-#                print('This url is 404.')
-#            else:
-#                raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
+
+        if item_type == 'posts':
+            start, end = item_value.split('-')
+            for i in range(int(start), int(end)+1):
+                post_id = self.int_to_str(i)
+                wget_args.extend(['--warc-header', 'reddit-post: {}'.format(post_id)])
+                wget_args.append('https://www.reddit.com/comments/{}'.format(post_id))
+                wget_args.append('https://old.reddit.com/comments/{}'.format(post_id))
         else:
             raise Exception('Unknown item')
 
@@ -237,59 +298,67 @@ class WgetArgs(object):
 # This will be shown in the warrior management panel. The logo should not
 # be too big. The deadline is optional.
 project = Project(
-    title="reddit",
-    project_html="""
-        [... project logo; links: www.reddit.com Website · Leaderboard ...]
-        Grabbing reddit.
-    """
+    title='reddit',
+    project_html='''
+        [... project logo; links: reddit.com Website · Leaderboard ...]
+        Archiving everything from reddit.
+ ''' ) pipeline = Pipeline( CheckIP(), - GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, + GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), - PrepareDirectories(warc_prefix="reddit"), + PrepareDirectories(warc_prefix='reddit'), WgetDownload( WgetArgs(), max_tries=2, - accept_on_exit_code=[0, 8], + accept_on_exit_code=[0, 4, 8], env={ - "item_dir": ItemValue("item_dir"), - "item_value": ItemValue("item_value"), - "item_type": ItemValue("item_type"), + 'item_dir': ItemValue('item_dir'), + 'item_value': ItemValue('item_value'), + 'item_type': ItemValue('item_type'), + 'warc_file_base': ItemValue('warc_file_base') } ), + Deduplicate(), PrepareStatsForTracker( - defaults={"downloader": downloader, "version": VERSION}, + defaults={'downloader': downloader, 'version': VERSION}, file_groups={ - "data": [ - ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz") + 'data': [ + ItemInterpolation('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz') ] }, id_function=stats_id_function, ), MoveFiles(), - LimitConcurrent(NumberConfigValue(min=1, max=4, default="1", - name="shared:rsync_threads", title="Rsync threads", - description="The maximum number of concurrent uploads."), + LimitConcurrent(NumberConfigValue(min=1, max=20, default='20', + name='shared:rsync_threads', title='Rsync threads', + description='The maximum number of concurrent uploads.'), UploadWithTracker( - "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), + 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ - ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz") + ItemInterpolation('%(data_dir)s/%(warc_file_base)s-deduplicated.warc.gz'), + ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt') ], - rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), + rsync_target_source_path=ItemInterpolation('%(data_dir)s/'), rsync_extra_args=[ - "--recursive", - "--partial", - "--partial-dir", ".rsync-tmp", + '--sockopts=SO_SNDBUF=8388608,SO_RCVBUF=8388608', + '--recursive', + '--partial', + '--partial-dir', '.rsync-tmp', + '--min-size', '1', + '--no-compress', + '--compress-level', '0' ] ), ), SendDoneToTracker( - tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID), - stats=ItemValue("stats") + tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID), + stats=ItemValue('stats') ) ) + diff --git a/reddit.lua b/reddit.lua index e170969..5f5adf3 100644 --- a/reddit.lua +++ b/reddit.lua @@ -1,22 +1,32 @@ -dofile("urlcode.lua") dofile("table_show.lua") +dofile("urlcode.lua") +JSON = (loadfile "JSON.lua")() -local url_count = 0 -local tries = 0 local item_type = os.getenv('item_type') local item_value = os.getenv('item_value') +local item_dir = os.getenv('item_dir') +local warc_file_base = os.getenv('warc_file_base') +local url_count = 0 +local tries = 0 local downloaded = {} local addedtolist = {} +local abortgrab = false --- Do not download these urls: -downloaded["http://pixel.redditmedia.com/pixel/of_destiny.png?v=q1Ga4BM4n71zceWwjRg4266wx1BqgGjx8isnnrLeBUv%2FXq%2Bk60QeBpQruPDKFQFv%2FDWVNxp63YPBIKv8pMk%2BhrkV3HA5b7GO"] = true -downloaded["http://pixel.redditmedia.com/pixel/of_doom.png"] = true -downloaded["http://pixel.redditmedia.com/pixel/of_delight.png"] = true -downloaded["http://pixel.redditmedia.com/pixel/of_discovery.png"] = true -downloaded["http://pixel.redditmedia.com/pixel/of_diversity.png"] = true -downloaded["http://pixel.redditmedia.com/click"] = true -downloaded["https://stats.redditmedia.com/"] = true 
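+-- 'posts' records the base-36 ids of the posts this item covers (for
+-- example, pipeline.py's int_to_str() turns 1234567 into 'qglj' and queues
+-- https://www.reddit.com/comments/qglj); it is filled in as those pages
+-- are fetched, and allowed() accepts any URL mentioning one of the ids.
+-- 'requested_children' tracks which "morechildren" API payloads were
+-- already queued, so each batch of collapsed comments is requested once.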
+local posts = {} +local requested_children = {} + +for ignore in io.open("ignore-list", "r"):lines() do + downloaded[ignore] = true +end + +load_json_file = function(file) + if file then + return JSON:decode(file) + else + return nil + end +end read_file = function(file) if file then @@ -29,141 +39,218 @@ read_file = function(file) end end -wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason) - local url = urlpos["url"]["url"] - local html = urlpos["link_expect_html"] - - if downloaded[url] == true or addedtolist[url] == true then +allowed = function(url, parenturl) + if string.match(url, "'+") + or string.match(url, "[<>\\%*%$;%^%[%],%(%){}]") + or string.match(url, "^https?://[^/]*reddit%.com/login") + or string.match(url, "^https?://[^/]*reddit%.com/register") + or string.match(url, "%?sort=") + or string.match(url, "^https?://www%.reddit%.com/") --TEMP + or string.match(url, "/%.rss$") then return false end - - if (downloaded[url] ~= true or addedtolist[url] ~= true) then - if string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z]") and not (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") or string.match(url, "%?sort=") or string.match(url, "%?ref=") or string.match(url, "%?count=") or string.match(url, "%.rss") or string.match(url, "%?originalUrl=") or string.match(url, "m%.reddit%.com") or string.match(url, "thumbs%.redditmedia%.com")) then - addedtolist[url] = true - return true - else + + local tested = {} + for s in string.gmatch(url, "([^/]+)") do + if tested[s] == nil then + tested[s] = 0 + end + if tested[s] == 6 then return false end - else + tested[s] = tested[s] + 1 + end + + if url .. "/" == parenturl then return false end + + if string.match(url, "^https?://i%.redd%.it/") + or string.match(url, "^https?://[^/]*redditmedia%.com/") + or string.match(url, "^https://old.reddit.com/api/morechildren$") then + return true + end + + for s in string.gmatch(url, "([a-z0-9]+)") do + if posts[s] then + return true + end + end + + return false end +wget.callbacks.download_child_p = function(urlpos, parent, depth, start_url_parsed, iri, verdict, reason) + local url = urlpos["url"]["url"] + local html = urlpos["link_expect_html"] + + if string.match(url, "[<>\\%*%$;%^%[%],%(%){}]") then + return false + end + + if (downloaded[url] ~= true and addedtolist[url] ~= true) + and (allowed(url, parent["url"]) or html == 0) then + addedtolist[url] = true + return true + end + + return false +end wget.callbacks.get_urls = function(file, url, is_css, iri) local urls = {} local html = nil + + downloaded[url] = true - if downloaded[url] ~= true then - downloaded[url] = true - end - - local function check(url) - if (downloaded[url] ~= true and addedtolist[url] ~= true) and (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z]") or (string.match(url, "redditmedia%.com")) and not (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") or string.match(url, "thumbs%.redditmedia%.com") or string.match(url, "%?sort=") or string.match(url, "%?ref=") or string.match(url, "%?count=") or string.match(url, "%.rss") or string.match(url, "%?originalUrl=") or string.match(url, "m%.reddit%.com")) then - if string.match(url, "&") then - table.insert(urls, { url=string.gsub(url, "&", "&") }) - addedtolist[url] = true - addedtolist[string.gsub(url, "&", "&")] = true - elseif string.match(url, "#") then - table.insert(urls, { url=string.match(url, "(https?//:[^#]+)#") }) - addedtolist[url] = true - addedtolist[string.match(url, 
"(https?//:[^#]+)#")] = true - else - table.insert(urls, { url=url }) - addedtolist[url] = true - end + local function check(urla) + local origurl = url + local url = string.match(urla, "^([^#]+)") + local url_ = string.gsub(url, "&", "&") + if (downloaded[url_] ~= true and addedtolist[url_] ~= true) + and allowed(url_, origurl) then + table.insert(urls, { url=url_ }) + addedtolist[url_] = true + addedtolist[url] = true end end - - if string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z]") and not (string.match(url, "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") or string.match(url, "/related/"..item_value)) then + + local function checknewurl(newurl) + if string.match(newurl, "^https?:////") then + check(string.gsub(newurl, ":////", "://")) + elseif string.match(newurl, "^https?://") then + check(newurl) + elseif string.match(newurl, "^https?:\\/\\?/") then + check(string.gsub(newurl, "\\", "")) + elseif string.match(newurl, "^\\/\\/") then + check(string.match(url, "^(https?:)")..string.gsub(newurl, "\\", "")) + elseif string.match(newurl, "^//") then + check(string.match(url, "^(https?:)")..newurl) + elseif string.match(newurl, "^\\/") then + check(string.match(url, "^(https?://[^/]+)")..string.gsub(newurl, "\\", "")) + elseif string.match(newurl, "^/") then + check(string.match(url, "^(https?://[^/]+)")..newurl) + elseif string.match(newurl, "^%./") then + checknewurl(string.match(newurl, "^%.(.+)")) + end + end + + local function checknewshorturl(newurl) + if string.match(newurl, "^%?") then + check(string.match(url, "^(https?://[^%?]+)")..newurl) + elseif not (string.match(newurl, "^https?:\\?/\\?//?/?") + or string.match(newurl, "^[/\\]") + or string.match(newurl, "^%./") + or string.match(newurl, "^[jJ]ava[sS]cript:") + or string.match(newurl, "^[mM]ail[tT]o:") + or string.match(newurl, "^vine:") + or string.match(newurl, "^android%-app:") + or string.match(newurl, "^ios%-app:") + or string.match(newurl, "^%${")) then + check(string.match(url, "^(https?://.+/)")..newurl) + end + end + + if string.match(url, "^https?://www%.reddit%.com/comments/[a-z0-9]+$") + or string.match(url, "^https?://old%.reddit%.com/comments/[a-z0-9]+$") then + posts[string.match(url, "[a-z0-9]+$")] = true + end + + if allowed(url, nil) + and not string.match(url, "^https?://[^/]*redditmedia%.com/") + and not string.match(url, "^https?://[^/]*redditstatic%.com/") then html = read_file(file) - for newurl in string.gmatch(html, '"thumbnail[^"]+"[^"]+"[^"]+"[^"]+"(//[^"]+)"') do - if downloaded[string.gsub(newurl, "//", "http://")] ~= true and addedtolist[string.gsub(newurl, "//", "http://")] ~= true then - table.insert(urls, { url=string.gsub(newurl, "//", "http://") }) - addedtolist[string.gsub(newurl, "//", "http://")] = true + if string.match(url, "^https://old.reddit.com/api/morechildren$") then + html = string.gsub(html, '\\"', '"') + end + if string.match(url, "^https?://old%.reddit%.com/") then + for s in string.gmatch(html, "(return%s+morechildren%(this,%s*'[^']+',%s*'[^']+',%s*'[^']+',%s*[0-9]+,%s*'[^']+'%))") do + local link_id, sort, children, depth, limit_children = string.match(s, "%(this,%s*'([^']+)',%s*'([^']+)',%s*'([^']+)',%s*([0-9]+),%s*'([^']+)'%)$") + local id = string.match(children, "^([^,]+)") + local subreddit = string.match(html, 'data%-subreddit="([^"]+)"') + local post_data = "link_id=" .. link_id .. "&sort=" .. sort .. "&children=" .. string.gsub(children, ",", "%%2C") .. "&depth=" .. depth .. "&id=t1_" .. id .. "&limit_children=" .. limit_children .. "&r=" .. subreddit .. 
"&renderstyle=html" + if requested_children[post_data] == nil then + requested_children[post_data] = true + table.insert(urls, {url="https://old.reddit.com/api/morechildren", + post_data=post_data}) + end end end - for newurl in string.gmatch(html, '"(https?://[^"]+)"') do - check(newurl) + for newurl in string.gmatch(string.gsub(html, """, '"'), '([^"]+)') do + checknewurl(newurl) end - for newurl in string.gmatch(html, "'(https?://[^']+)'") do - check(newurl) + for newurl in string.gmatch(string.gsub(html, "'", "'"), "([^']+)") do + checknewurl(newurl) end - for newurl in string.gmatch(html, '("/[^"]+)"') do - if string.match(newurl, '"//') then - check(string.gsub(newurl, '"//', 'http://')) - elseif not string.match(newurl, '"//') then - check(string.match(url, "(https?://[^/]+)/")..string.match(newurl, '"(/.+)')) - end + for newurl in string.gmatch(html, ">%s*([^<%s]+)") do + checknewurl(newurl) end - for newurl in string.gmatch(html, "('/[^']+)'") do - if string.match(newurl, "'//") then - check(string.gsub(newurl, "'//", "http://")) - elseif not string.match(newurl, "'//") then - check(string.match(url, '(https?://[^/]+)/')..string.match(newurl, "'(/.+)")) - end + for newurl in string.gmatch(html, "[^%-]href='([^']+)'") do + checknewshorturl(newurl) + end + for newurl in string.gmatch(html, '[^%-]href="([^"]+)"') do + checknewshorturl(newurl) + end + for newurl in string.gmatch(html, ":%s*url%(([^%)]+)%)") do + checknewurl(newurl) end end - + return urls end - wget.callbacks.httploop_result = function(url, err, http_stat) - -- NEW for 2014: Slightly more verbose messages because people keep - -- complaining that it's not moving or not working status_code = http_stat["statcode"] url_count = url_count + 1 - io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. ". \n") + io.stdout:write(url_count .. "=" .. status_code .. " " .. url["url"] .. " \n") io.stdout:flush() - if (status_code >= 200 and status_code <= 399) then - if string.match(url.url, "https://") then - local newurl = string.gsub(url.url, "https://", "http://") - downloaded[newurl] = true - else - downloaded[url.url] = true + if (status_code >= 300 and status_code <= 399) then + local newloc = string.match(http_stat["newloc"], "^([^#]+)") + if string.match(newloc, "^//") then + newloc = string.match(url["url"], "^(https?:)") .. string.match(newloc, "^//(.+)") + elseif string.match(newloc, "^/") then + newloc = string.match(url["url"], "^(https?://[^/]+)") .. newloc + elseif not string.match(newloc, "^https?://") then + newloc = string.match(url["url"], "^(https?://.+/)") .. newloc + end + if downloaded[newloc] == true or addedtolist[newloc] == true then + return wget.actions.EXIT end end - if status_code >= 500 or - (status_code >= 400 and status_code ~= 404 and status_code ~= 403) then + if (status_code >= 200 and status_code <= 399) then + downloaded[url["url"]] = true + downloaded[string.gsub(url["url"], "https?://", "http://")] = true + end - io.stdout:write("\nServer returned "..http_stat.statcode..". Sleeping.\n") + if abortgrab == true then + io.stdout:write("ABORTING...\n") + return wget.actions.ABORT + end + + if status_code >= 500 + or (status_code >= 400 and status_code ~= 403 and status_code ~= 404) + or status_code == 0 then + io.stdout:write("Server returned "..http_stat.statcode.." ("..err.."). 
+
+    if status_code >= 500
+    or (status_code >= 400 and status_code ~= 403 and status_code ~= 404)
+    or status_code == 0 then
+        io.stdout:write("Server returned "..http_stat.statcode.." ("..err.."). Sleeping.\n")
         io.stdout:flush()
-
-        os.execute("sleep 10")
-
-        tries = tries + 1
-
-        if tries >= 6 then
+        local maxtries = 8
+        if not allowed(url["url"], nil) then
+            maxtries = 2
+        end
+        if tries > maxtries then
             io.stdout:write("\nI give up...\n")
             io.stdout:flush()
             tries = 0
-            if string.match(url["url"], "[^a-z0-9]"..item_value.."[0-9a-z]") and not string.match(url["url"], "[^a-z0-9]"..item_value.."[0-9a-z][0-9a-z]") then
+            if allowed(url["url"], nil) then
                 return wget.actions.ABORT
             else
                 return wget.actions.EXIT
             end
         else
-            return wget.actions.CONTINUE
-        end
-    elseif status_code == 0 then
-
-        io.stdout:write("\nServer returned "..http_stat.statcode..". Sleeping.\n")
-        io.stdout:flush()
-
-        os.execute("sleep 10")
-
-        tries = tries + 1
-
-        if tries >= 6 then
-            io.stdout:write("\nI give up...\n")
-            io.stdout:flush()
-            tries = 0
-            return wget.actions.ABORT
-        else
+            os.execute("sleep " .. math.floor(math.pow(2, tries)))
+            tries = tries + 1
             return wget.actions.CONTINUE
         end
     end
@@ -178,3 +265,10 @@ wget.callbacks.httploop_result = function(url, err, http_stat)
 
     return wget.actions.NOTHING
 end
+
+wget.callbacks.before_exit = function(exit_status, exit_status_string)
+    if abortgrab == true then
+        return wget.exits.IO_FAIL
+    end
+    return exit_status
+end
diff --git a/warrior-install.sh b/warrior-install.sh
new file mode 100755
index 0000000..135477f
--- /dev/null
+++ b/warrior-install.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+PIP=pip
+
+if type pip3 > /dev/null 2>&1
+then
+    PIP=pip3
+fi
+
+echo "Installing warcio"
+if ! sudo $PIP install warcio --upgrade
+then
+    exit 1
+fi
+
+exit 0
+
diff --git a/wget-lua-warrior b/wget-lua-warrior
old mode 100644
new mode 100755
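The new `before_exit` callback in reddit.lua above is what surfaces an aborted grab: when the `abortgrab` flag has been raised, the exit status wget-lua would normally report is replaced with `wget.exits.IO_FAIL`, so the pipeline can treat the item as failed instead of finished. A standalone sketch of the pattern (the `wget` stub and the value 6 are placeholders, not wget-lua's real internals):

    -- Stub just enough of the wget-lua environment to show the flow.
    local wget = { exits = { IO_FAIL = 6 } }  -- placeholder value
    local abortgrab = false

    local function before_exit(exit_status, exit_status_string)
        if abortgrab == true then
            return wget.exits.IO_FAIL  -- force a failing exit status
        end
        return exit_status             -- otherwise keep wget's own status
    end

    abortgrab = true
    print(before_exit(0, ""))  --> 6: a clean exit is rewritten as a failure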