mirror of https://github.com/koreader/koreader
Revert NewsDownloader (#8953)
My volumize PR introduced too many issues. Revert to the previous version of NewsDownloader until they can be resolved. Fixes #8867. Reintroduces #8799, #6234. (reviewable/pr8958/r1)
parent
c9bfb74d2a
commit
d53ee056cc
@ -1,409 +0,0 @@
|
||||
local BD = require("ui/bidi")
|
||||
local DownloadBackend = require("epubdownloadbackend")
|
||||
local NewsHelpers = require("http_utilities")
|
||||
local dateparser = require("lib.dateparser")
|
||||
local logger = require("logger")
|
||||
local md5 = require("ffi/sha2").md5
|
||||
local util = require("util")
|
||||
local _ = require("gettext")
|
||||
local N_ = _.ngettext
|
||||
local FFIUtil = require("ffi/util")
|
||||
local T = FFIUtil.template
|
||||
|
||||
-- Prototype table for feed sources. Objects created through
-- FeedSource:new() fall back to the fields defined here.
local FeedSource = {
    -- Extension appended to the file names built by getEpubOutputDir.
    file_extension = ".epub",
}
|
||||
|
||||
--- Creates a new object that inherits from the receiver.
-- @param o optional table to turn into the object; a fresh table is used when omitted
-- @return the object, with the receiver installed as its metatable
function FeedSource:new(o)
    local instance = o or {}
    -- Missing fields on the instance are looked up on the receiver.
    self.__index = self
    return setmetatable(instance, self)
end
|
||||
|
||||
--- Downloads and initializes every feed of a feed list.
-- Each entry of `feed_list` is a feed configuration table whose first
-- element (`feed[1]`) is the feed URL.
-- @param feed_list array of feed configuration tables
-- @param progress_callback function called with a status string before each feed is set up
-- @param error_callback function called once, with a summary of all failed
--        feeds, when at least one feed could not be initialized
-- @return array of `{ config = <feed>, document = <initialized document> }`
--         tables, one per feed that could be initialized
function FeedSource:getInitializedFeeds(feed_list, progress_callback, error_callback)
    local initialized_feeds = {}
    -- One "url: reason" string per feed that failed.
    local failed_feeds = {}
    local feed_count = #feed_list

    for idx, feed in ipairs(feed_list) do
        local url = feed[1]
        -- Show a UI update. The template reads "feed %1 of %2", so %2 is the
        -- total number of feeds (the URL used to be passed here by mistake).
        progress_callback(T(
            _("Setting up feed %1 of %2."),
            idx,
            feed_count
        ))
        -- Download and initialize the feed; both steps raise on failure.
        local ok, response = pcall(function()
            return self:initializeDocument(
                self:fetchDocumentByUrl(url)
            )
        end)
        if ok and response then
            table.insert(initialized_feeds, {
                config = feed,
                document = response,
            })
        else
            -- tostring() keeps a non-string error value (or nil) from
            -- raising a second error while the report is being built.
            table.insert(
                failed_feeds,
                BD.url(tostring(url)) .. ": " .. tostring(response)
            )
        end
    end

    if #failed_feeds > 0 then
        -- Report every failed feed, not just the last one: the entries are
        -- accumulated above and joined here, separated by a blank line.
        error_callback(
            T(N_("Could not initialize a feed:\n\n%2\n\nPlease review your feed configuration.", "Could not initialize %1 feeds:\n\n%2\n\nPlease review your feed configurations.", #failed_feeds),
                #failed_feeds, table.concat(failed_feeds, "\n\n"))
        )
    end

    return initialized_feeds
end
|
||||
|
||||
--- Contacts the feed website and fetches the RSS/Atom document that lists
-- the latest items, then deserializes it into a Lua table.
-- @param url URL of the feed
-- @return the deserialized document (never nil)
-- Raises "(Reason: Failed to download feed document)" when the download
-- fails or raises, and "(Reason: Error during feed document deserialization)"
-- when the downloaded text cannot be deserialized. Both are raised with
-- level 0, i.e. without position information.
function FeedSource:fetchDocumentByUrl(url)
    -- getUrlContent reports failures both by returning `false, reason` and
    -- by raising, so it is called protected and both signals are checked.
    local ok, success, content = pcall(NewsHelpers.getUrlContent, NewsHelpers, url)
    if not (ok and success) then
        -- When the call raised, `success` holds the error value instead.
        logger.dbg("NewsDownloader: failed to download feed", url, ok and content or success)
        error("(Reason: Failed to download feed document)", 0)
    end

    -- Deserialize the XML document into something Lua can use.
    local document = NewsHelpers:deserializeXMLString(content)
    if not document then
        -- Previously a failed deserialization silently returned nil because
        -- the check was `ok or document`; report it explicitly instead.
        error("(Reason: Error during feed document deserialization)", 0)
    end

    return document
end
|
||||
|
||||
--- Initializes a deserialized feed document by extracting the feed title,
-- the feed items and the item count, and storing them on the document.
-- @param document table produced by deserializing the feed XML
-- @return the same `document`, with `title`, `items` and `total_items` set
-- Raises the translated message "Could not initialize feed document"
-- (level 0) when the feed type cannot be determined or extraction fails.
function FeedSource:initializeDocument(document)
    local title, items, item_count

    -- Extractor used when getFeedType recognizes an RSS document.
    local function from_rss()
        local channel = document.rss.channel
        title = util.htmlEntitiesToUtf8(channel.title)
        items = channel.item
        item_count = #channel.item
    end

    -- Extractor used when getFeedType recognizes an Atom document.
    local function from_atom()
        local atom = document.feed
        title = FeedSource:getFeedTitle(atom.title)
        items = atom.entry
        item_count = #atom.entry
    end

    -- Extractor used when getFeedType recognizes an RDF document.
    local function from_rdf()
        local rdf = document["rdf:RDF"]
        title = util.htmlEntitiesToUtf8(rdf.channel.title)
        items = rdf.item
        item_count = #rdf.item
    end

    -- Any error raised while detecting the type or extracting the values is
    -- collapsed into the single generic error below.
    if not pcall(self.getFeedType, self, document, from_rss, from_atom, from_rdf) then
        error(_("Could not initialize feed document"), 0)
    end

    document.title = title
    document.items = items
    document.total_items = item_count
    return document
end
|
||||
|
||||
--- Downloads and prepares the content of the items of an initialized feed.
-- @param feed table with `config` (feed configuration) and `document`
--        (initialized feed document with `items`, `title`, `total_items`)
-- @param progress_callback function called with a status string before each item
-- @param error_callback function called with a message for each item whose
--        content could not be retrieved
-- @return array of tables with `html`, `images`, `item_slug`, `item_title`,
--         `md5` and `feed_title` fields, or nil when no item could be prepared
function FeedSource:getItemsContent(feed, progress_callback, error_callback)
    -- A missing, non-numeric or non-positive limit means "no limit".
    local limit = tonumber(feed.config.limit) or 0
    local unlimited = limit <= 0

    -- Number shown as the total in the progress message: never more than
    -- the number of items the feed actually contains.
    local available = feed.document.total_items
    local total_items = limit
    if unlimited or (available and limit > available) then
        total_items = available
    end

    local initialized_feed_items = {}
    -- Download each item in the feed. ipairs guarantees document order, which
    -- the limit check relies on, and skips any non-array keys.
    for index, item in ipairs(feed.document.items) do
        -- If the limit has been met, stop downloading the feed.
        if not unlimited and index > limit then
            break
        end
        -- Display feedback to the user.
        progress_callback(T(
            _("%3\n Downloading item %1 of %2"),
            index,
            total_items,
            feed.document.title
        ))
        -- Download the article's HTML and post-process it.
        local ok, response = pcall(function()
            return self:initializeItemHtml(
                feed,
                self:getItemHtml(
                    item,
                    feed.config.download_full_article
                )
            )
        end)

        -- Add the result to our table, or report the failure.
        if ok then
            table.insert(initialized_feed_items, {
                html = response.html,
                images = response.images,
                item_slug = FeedSource:getItemTitleWithDate(item),
                item_title = item.title,
                md5 = md5(item.title),
                feed_title = feed.document.title,
            })
        else
            error_callback(
                T(_("Could not get content for: %1"), feed.document.title)
            )
        end
    end

    if #initialized_feed_items > 0 then
        return initialized_feed_items
    end
    return nil
end
|
||||
|
||||
--- Runs an item's HTML through the download backend to obtain the final
-- HTML and the list of images it references.
-- @param feed table whose `config` holds the feed URL at index 1 and the
--        optional `include_images`, `enable_filter` and `filter_element` settings
-- @param html HTML of the item
-- @return table `{ html = ..., images = ... }` built from the two values
--         returned by DownloadBackend:getImagesAndHtml
function FeedSource:initializeItemHtml(feed, html)
    local config = feed.config

    -- An unset filter_element becomes `true`; an explicit false or any
    -- other value is passed through unchanged.
    local filter_element = config.filter_element
    if filter_element == nil then
        filter_element = true
    end

    -- Both flags are enabled unless explicitly set to false.
    local item_images, item_html = DownloadBackend:getImagesAndHtml(
        html,
        config[1],
        config.include_images ~= false,
        config.enable_filter ~= false,
        filter_element
    )

    return { html = item_html, images = item_images }
end
|
||||
|
||||
--- Detects whether a deserialized document is an Atom, RSS or RDF feed and
-- invokes the matching callback.
-- Atom is checked first, then RSS, then RDF.
-- @param document deserialized feed document
-- @param rss_cb function called (without arguments) for an RSS feed
-- @param atom_cb function called (without arguments) for an Atom feed
-- @param rdf_cb function called (without arguments) for an RDF feed
-- @return whatever the invoked callback returns
-- Raises a translated "(Reason: ...)" error when no feed type matches.
function FeedSource:getFeedType(document, rss_cb, atom_cb, rdf_cb)
    -- RSS: needs a channel title and a first item with title and link.
    local channel = document.rss and document.rss.channel
    local is_rss = channel and
        channel.title and
        channel.item and
        channel.item[1] and
        channel.item[1].title and
        channel.item[1].link

    -- Atom: needs a feed title and a first entry with title and link.
    -- `entry` is checked before being indexed so that a feed without
    -- entries is reported as unsupported instead of raising an index error.
    local atom = document.feed
    local is_atom = atom and
        atom.title and
        atom.entry and
        atom.entry[1] and
        atom.entry[1].title and
        atom.entry[1].link

    -- RDF: only a channel title is required.
    local rdf = document["rdf:RDF"]
    local is_rdf = rdf and
        rdf.channel and
        rdf.channel.title

    if is_atom then
        return atom_cb()
    elseif is_rss then
        return rss_cb()
    elseif is_rdf then
        return rdf_cb()
    end

    -- No supported layout matched. Documents with a <feed> root are reported
    -- as Atom failures, everything else as RSS failures.
    if document.feed then
        error(_("(Reason: Couldn't process Atom)"))
    end
    error(_("(Reason: Couldn't process RSS)"))
end
|
||||
|
||||
--- Returns the HTML for a feed item.
-- @param item feed item table (`link`, `title`, `description`/`summary`)
-- @param download_full_article when truthy, the page behind the item's link
--        is downloaded; otherwise a small HTML page is built from the
--        item's description (or summary)
-- @return HTML string
function FeedSource:getItemHtml(item, download_full_article)
    if download_full_article then
        return NewsHelpers:loadPage(
            FeedSource:getFeedLink(item.link)
        )
    end

    -- Items that carry neither a description nor a summary get an empty
    -- article body instead of failing in (or printing "nil" from) format.
    local feed_description = item.description or item.summary or ""
    local footer = _("This is just a description of the feed. To download the full article instead, go to the News Downloader settings and change 'download_full_article' to 'true'.")
    return string.format([[<!DOCTYPE html>
<html>
<head><meta charset='UTF-8'><title>%s</title></head>
<body><header><h2>%s</h2></header><article>%s</article>
<br><footer><small>%s</small></footer>
</body>
</html>]], item.title, item.title, feed_description, footer)
end
|
||||
|
||||
-- @todo: move this elsewhere
--- Builds the output path of an EPUB file and makes sure its directory exists.
-- NOTE(review): `lfs` is not required in the visible part of this file;
-- presumably it is available as a global - confirm.
-- @param download_dir base directory; the sub directory name is appended
--        to it directly, without adding a separator
-- @param sub_dir name of the sub directory (HTML entities are converted
--        and the name is made filename-safe)
-- @param epub_title title used as the file name (passed through getFeedTitle)
-- @return output directory, file name and `self.file_extension` concatenated
function FeedSource:getEpubOutputDir(download_dir, sub_dir, epub_title)
    local feed_output_dir = string.format(
        "%s%s/",
        download_dir,
        util.getSafeFilename(util.htmlEntitiesToUtf8(sub_dir))
    )

    -- Create the output directory if it doesn't exist.
    local dir_mode = lfs.attributes(feed_output_dir, "mode")
    if not dir_mode then
        lfs.mkdir(feed_output_dir)
    end

    local file_name = FeedSource:getFeedTitle(epub_title)

    return string.format("%s%s%s", feed_output_dir, file_name, self.file_extension)
end
|
||||
|
||||
--- Builds an EPUB file from a list of chapters.
-- NOTE(review): `lfs` is not required in the visible part of this file;
-- presumably it is available as a global - confirm.
-- @param title title of the EPUB
-- @param chapters array of chapter tables; each may carry an `images` array
-- @param abs_output_path path of the file to write
-- @param progress_callback function called with a status string at each step
-- @param error_callback currently unused by this function
-- @return true when the file already existed or exists after the build,
--         false otherwise
-- Raises "Error: chapters contains 0 items" (translated, level 0) when
-- `chapters` is empty and the file does not exist yet.
function FeedSource:createEpub(title, chapters, abs_output_path, progress_callback, error_callback)
    if lfs.attributes(abs_output_path, "mode") then
        logger.dbg("NewsDownloader: Skipping. EPUB file already exists", abs_output_path)
        return true
    end

    if #chapters == 0 then
        error(_("Error: chapters contains 0 items"), 0)
    end

    -- Flatten the images of all chapters into a single list. A chapter
    -- without an `images` field simply contributes nothing.
    local images = {}
    for i = 1, #chapters do
        local chapter_images = chapters[i].images or {}
        for j = 1, #chapter_images do
            images[#images + 1] = chapter_images[j]
        end
    end

    local epub = DownloadBackend:new{}

    progress_callback(T(_("Building EPUB %1"), title))
    epub:setTitle(title)
    epub:addToc(chapters)
    epub:addManifest(chapters, images)

    progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding contents")))
    epub:addContents(chapters)

    progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding images")))
    epub:addImages(images)

    progress_callback(T(_("Building EPUB %1: %2"), title, _("Writing EPUB to disk")))
    local ok, err = pcall(epub.build, epub, abs_output_path)
    if not ok then
        -- Keep the best-effort contract (return false), but leave a trace
        -- of why the build failed instead of dropping the error.
        logger.warn("NewsDownloader: failed to build EPUB", abs_output_path, err)
        return false
    end

    -- The build only counts as successful when the file is really there.
    return lfs.attributes(abs_output_path, "mode") and true or false
end
|
||||
|
||||
--- Converts a feed date string into a "yy-mm-dd_HH-MM_" prefix.
-- Uses lua-feedparser https://github.com/slact/lua-feedparser
-- feedparser is available under the (new) BSD license.
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENCE_lua-feedparser
-- @param dateTime date string taken from a feed item
-- @return formatted date string
local function parseDate(dateTime)
    -- The extra parentheses keep only the first value returned by the parser.
    return os.date("%y-%m-%d_%H-%M_", (dateparser.parse(dateTime)))
end
|
||||
|
||||
--- Creates a filename-safe feed title prefixed with the current date and time.
-- @param feed table whose `document.title` holds the feed title
-- @return string of the form "yy-mm-dd_HH-MM_<title>"
function FeedSource:getFeedTitleWithDate(feed)
    local safe_title = util.getSafeFilename(FeedSource:getFeedTitle(feed.document.title))
    local timestamp = os.date("%y-%m-%d_%H-%M_")
    return timestamp .. safe_title
end
|
||||
|
||||
--- Creates a filename-safe title for a feed item, prefixed with the item's
-- date when it has one.
-- The date is taken from the first field present among `updated`,
-- `pubDate` and `published`.
-- @param item feed item table
-- @return the title, with a "yy-mm-dd_HH-MM_" prefix when a date was found
function FeedSource:getItemTitleWithDate(item)
    local title = util.getSafeFilename(FeedSource:getFeedTitle(item.title))
    local item_date = item.updated or item.pubDate or item.published
    if item_date then
        return parseDate(item_date) .. title
    end
    return title
end
|
||||
|
||||
--- Extracts the title text from a deserialized <title> element.
-- If a title looks like <title>blabla</title>, the value is just a string.
-- If a title looks like <title attr="alb">blabla</title>, the value is a
-- table where [1] is the title string and the attributes are also available.
-- @param possible_title string, or table whose first element is the title
-- @return the title with HTML entities converted to UTF-8; nothing when a
--         table is given whose first element is not a string
function FeedSource:getFeedTitle(possible_title)
    local text = possible_title
    if type(text) ~= "string" then
        -- Not a plain string: look at the first element of the table.
        text = possible_title[1]
        if type(text) ~= "string" then
            return
        end
    end
    return util.htmlEntitiesToUtf8(text)
end
|
||||
--- Extracts a URL from a deserialized <link> element.
-- There can be multiple links.
-- For now we just assume the first link is probably the right one.
--- @todo Write unit tests.
-- Some feeds that can be used for unit tests:
-- http://fransdejonge.com/feed/ for multiple links.
-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes.
-- @param possible_link string, a table with `_attr.href`, or an array whose
--        first element has `_attr.href`
-- @return the URL, or nothing when none of the supported shapes matches
function FeedSource:getFeedLink(possible_link)
    -- <link>url</link>
    if type(possible_link) == "string" then
        return possible_link
    end

    -- Single <link href="url"/>
    local attr = possible_link._attr
    if attr and attr.href then
        return attr.href
    end

    -- Several <link href="..."/> elements: use the first one.
    local first_link = possible_link[1]
    attr = first_link and first_link._attr
    if attr and attr.href then
        return attr.href
    end
end
|
||||
|
||||
|
||||
return FeedSource
|
@ -1,126 +0,0 @@
|
||||
local logger = require("logger")
|
||||
local http = require("socket.http")
|
||||
local socketutil = require("socketutil")
|
||||
local socket_url = require("socket.url")
|
||||
local socket = require("socket")
|
||||
local ltn12 = require("ltn12")
|
||||
|
||||
-- Table that collects the HTTP and XML helper functions of this module.
local NewsHelpers = {}

-- Maximum number of redirects getUrlContent follows (prevents infinite redirects).
local max_redirects = 5
|
||||
|
||||
--- Gets the content of a URL with an HTTP GET request, following redirects.
-- @param url URL to fetch
-- @param timeout timeout passed to socketutil:set_timeout (defaults to 10)
-- @param maxtime total time limit passed to socketutil:set_timeout
--        (30 is used when omitted); when given, socketutil's table sink is
--        used instead of the plain ltn12 one
-- @param redirectCount number of redirects followed so far (internal;
--        callers normally omit it)
-- @return `true, content` on success, or `false, reason` on failure
-- Raises an error when the maximum number of redirects has been reached.
function NewsHelpers:getUrlContent(url, timeout, maxtime, redirectCount)
    logger.dbg("getUrlContent(", url, ",", timeout, ",", maxtime, ",", redirectCount, ")")
    if not redirectCount then
        redirectCount = 0
    elseif redirectCount >= max_redirects then
        -- error()'s second argument is the error level, not a message part,
        -- so the count has to be concatenated into the message.
        error("EpubDownloadBackend: reached max redirects: " .. redirectCount)
    end

    if not timeout then timeout = 10 end
    logger.dbg("timeout:", timeout)

    local sink = {}
    local parsed = socket_url.parse(url)
    socketutil:set_timeout(timeout, maxtime or 30)
    local request = {
        url = url,
        method = "GET",
        sink = maxtime and socketutil.table_sink(sink) or ltn12.sink.table(sink),
    }
    logger.dbg("request:", request)
    -- Drop the first value returned by http.request; keep code, headers, status.
    local code, headers, status = socket.skip(1, http.request(request))
    socketutil:reset_timeout()
    logger.dbg("After http.request")
    local content = table.concat(sink) -- empty or content accumulated till now
    logger.dbg("type(code):", type(code))
    logger.dbg("code:", code)
    logger.dbg("headers:", headers)
    logger.dbg("status:", status)
    logger.dbg("#content:", #content)

    if code == socketutil.TIMEOUT_CODE or
       code == socketutil.SSL_HANDSHAKE_CODE or
       code == socketutil.SINK_TIMEOUT_CODE
    then
        logger.warn("request interrupted:", code)
        return false, code
    end
    if headers == nil then
        logger.warn("No HTTP headers:", code, status)
        return false, "Network or remote server unavailable"
    end
    if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK
        if code and code > 299 and code < 400 and headers and headers.location then -- handle 301, 302...
            local redirected_url = headers.location
            local parsed_redirect_location = socket_url.parse(redirected_url)
            -- A Location without a host inherits host and scheme from the
            -- URL that was just requested.
            if not parsed_redirect_location.host then
                parsed_redirect_location.host = parsed.host
                parsed_redirect_location.scheme = parsed.scheme
                redirected_url = socket_url.build(parsed_redirect_location)
            end
            logger.dbg("getUrlContent: Redirecting to url: ", redirected_url)
            return self:getUrlContent(redirected_url, timeout, maxtime, redirectCount + 1)
        else
            logger.warn("HTTP status not okay:", code, status)
            return false, status
        end
    end
    if headers and headers["content-length"] then
        -- Check we really got the announced content size
        local content_length = tonumber(headers["content-length"])
        if #content ~= content_length then
            return false, "Incomplete content received"
        end
    end
    logger.dbg("Returning content ok")
    return true, content
end
|
||||
|
||||
--- Downloads a page and returns its content.
-- @param url URL of the page
-- @return the page content
-- Raises the failure reason reported by getUrlContent (or a generic
-- message when no reason is available) when the download fails.
function NewsHelpers:loadPage(url)
    logger.dbg("Load page: ", url)
    local success, content
--[[    if self.trap_widget then -- if previously set with EpubDownloadBackend:setTrapWidget()
        local Trapper = require("ui/trapper")
        local timeout, maxtime = 30, 60
        -- We use dismissableRunInSubprocess with complex return values:
        completed, success, content = Trapper:dismissableRunInSubprocess(function()
            return NewsHelpers:getUrlContent(url, timeout, maxtime)
        end, self.trap_widget)
        if not completed then
            error(self.dismissed_error_code) -- "Interrupted by user"
        end
    else]]--
        local timeout, maxtime = 10, 60
        success, content = NewsHelpers:getUrlContent(url, timeout, maxtime)
--    end
    -- On failure `content` is the failure reason, which is not guaranteed to
    -- be a string (it can be nil), so only strings are truncated for the log.
    local preview = content
    if type(content) == "string" then
        preview = content:sub(1, 500)
    end
    logger.dbg("success:", success, "type(content):", type(content), "content:", preview, "...")
    if not success then
        -- Never raise a nil error value.
        error(content or ("Failed to load page: " .. tostring(url)))
    end
    return content
end
|
||||
|
||||
--- Deserializes an XML string into a Lua table.
-- uses LuaXML https://github.com/manoelcampos/LuaXML
-- The MIT License (MIT)
-- Copyright (c) 2016 Manoel Campos da Silva Filho
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENSE_LuaXML
-- @param xml_str XML text to parse
-- @return the root table built by the tree handler, or nothing when
--         parsing raised an error
function NewsHelpers:deserializeXMLString(xml_str)
    local treehdl = require("lib/handler")
    local libxml = require("lib/xml")
    -- Handler that collects the parsed XML as a Lua table.
    local xmlhandler = treehdl.simpleTreeHandler()

    -- Create the parser and feed it the XML; both happen protected so that
    -- a parse error does not propagate.
    local function parse()
        libxml.xmlParser(xmlhandler):parse(xml_str)
    end

    if pcall(parse) then
        return xmlhandler.root
    end
end
|
||||
|
||||
return NewsHelpers
|
Loading…
Reference in New Issue