Revert NewsDownloader (#8953)

My volumize PR introduced too many issues. Revert to the previous version of NewsDownloader until they can be resolved.

Fixes #8867 

Reintroduces #8799, #6234
Scarlett 2 years ago committed by GitHub
parent c9bfb74d2a
commit d53ee056cc

@@ -1,10 +1,15 @@
local NewsHelpers = require("http_utilities")
local Version = require("version")
local ffiutil = require("ffi/util")
local http = require("socket.http")
local logger = require("logger")
local ltn12 = require("ltn12")
local socket = require("socket")
local socket_url = require("socket.url")
local socketutil = require("socketutil")
local _ = require("gettext")
local T = ffiutil.template
local EpubBuilder = {
local EpubDownloadBackend = {
-- Can be set so HTTP requests will be done under Trapper and
-- be interruptible
trap_widget = nil,
@@ -12,89 +17,8 @@ local EpubBuilder = {
-- and error() with this code. We make the value of this error
-- accessible here so that caller can know it's a user dismiss.
dismissed_error_code = "Interrupted by user",
title = nil,
ncx_toc = nil,
ncx_manifest = nil,
ncx_contents = nil,
ncx_images = nil,
}
function EpubBuilder:new(o)
o = o or {}
self.__index = self
setmetatable(o, self)
return o
end
function EpubBuilder:build(abs_output_path)
-- Open the zip file (with .tmp for now, as crengine may still
-- have a handle to the final epub_path, and we don't want to
-- delete a good one if we fail/cancel later)
local tmp_path = abs_output_path .. ".tmp"
local ZipWriter = require("ffi/zipwriter")
local epub = ZipWriter:new{}
if not epub:open(tmp_path) then
logger.dbg("Failed to open tmp_path")
return false
end
epub:add("mimetype", "application/epub+zip")
epub:add("META-INF/container.xml", [[
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>]])
-- Add the manifest.
if not self.ncx_manifest or #self.ncx_manifest == 0 then
error("EPUB does not contain a valid manifest.")
end
--logger.dbg("Adding Manifest:", self.ncx_manifest)
epub:add("OEBPS/content.opf", table.concat(self.ncx_manifest))
-- Add the table of contents.
if not self.ncx_toc or #self.ncx_toc == 0 then
error("EPUB does not contain a valid table of contents.")
end
--logger.dbg("Adding TOC:", self.ncx_toc)
epub:add("OEBPS/toc.ncx", table.concat(self.ncx_toc))
-- Add the contents.
if not self.ncx_contents or #self.ncx_contents == 0 then
error("EPUB does not contain any content.")
end
--logger.dbg("Adding Content:", self.ncx_contents)
for index, content in ipairs(self.ncx_contents) do
epub:add("OEBPS/" .. content.filename, content.html)
end
-- Add the images.
--logger.dbg("Adding Images:", self.ncx_images)
if self.ncx_images then
for index, image in ipairs(self.ncx_images) do
epub:add(
"OEBPS/" .. image.path,
image.content,
image.no_compression
)
end
end
epub:close()
os.rename(tmp_path, abs_output_path)
collectgarbage()
end
function EpubBuilder:release()
-- Stub for cleanup methods
end
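-- For orientation, the typical build sequence for this class (as driven by
-- FeedSource:createEpub further down in this diff) is roughly:
--[[
local epub = EpubBuilder:new{}
epub:setTitle(title)
epub:addToc(chapters)
epub:addManifest(chapters, images)
epub:addContents(chapters)
epub:addImages(images)
epub:build(abs_output_path)
epub:release()
]]--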
local max_redirects = 5 -- prevent infinite redirects
-- filter HTML using CSS selector
local function filter(text, element)
@@ -144,9 +68,79 @@ local function filter(text, element)
return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
end
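-- Usage sketch for the helper above (the selector value is a placeholder;
-- its exact syntax depends on the filtering logic elided from this hunk):
--[[
local filtered_html = filter(raw_html, "article")
]]--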
function EpubBuilder:getResponseAsString(url)
logger.dbg("EpubBuilder:getResponseAsString(", url, ")")
local success, content = NewsHelpers:getUrlContent(url)
-- Get URL content
local function getUrlContent(url, timeout, maxtime, redirectCount)
logger.dbg("getUrlContent(", url, ",", timeout, ",", maxtime, ",", redirectCount, ")")
if not redirectCount then
redirectCount = 0
elseif redirectCount == max_redirects then
error("EpubDownloadBackend: reached max redirects: ", redirectCount)
end
if not timeout then timeout = 10 end
logger.dbg("timeout:", timeout)
local sink = {}
local parsed = socket_url.parse(url)
socketutil:set_timeout(timeout, maxtime or 30)
local request = {
url = url,
method = "GET",
sink = maxtime and socketutil.table_sink(sink) or ltn12.sink.table(sink),
}
logger.dbg("request:", request)
local code, headers, status = socket.skip(1, http.request(request))
socketutil:reset_timeout()
logger.dbg("After http.request")
local content = table.concat(sink) -- empty or content accumulated till now
logger.dbg("type(code):", type(code))
logger.dbg("code:", code)
logger.dbg("headers:", headers)
logger.dbg("status:", status)
logger.dbg("#content:", #content)
if code == socketutil.TIMEOUT_CODE or
code == socketutil.SSL_HANDSHAKE_CODE or
code == socketutil.SINK_TIMEOUT_CODE
then
logger.warn("request interrupted:", code)
return false, code
end
if headers == nil then
logger.warn("No HTTP headers:", code, status)
return false, "Network or remote server unavailable"
end
if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK
if code and code > 299 and code < 400 and headers and headers.location then -- handle 301, 302...
local redirected_url = headers.location
local parsed_redirect_location = socket_url.parse(redirected_url)
if not parsed_redirect_location.host then
parsed_redirect_location.host = parsed.host
parsed_redirect_location.scheme = parsed.scheme
redirected_url = socket_url.build(parsed_redirect_location)
end
logger.dbg("getUrlContent: Redirecting to url: ", redirected_url)
return getUrlContent(redirected_url, timeout, maxtime, redirectCount + 1)
else
error("EpubDownloadBackend: Don't know how to handle HTTP response status: ", status)
end
logger.warn("HTTP status not okay:", code, status)
return false, "Remote server error or unavailable"
end
if headers and headers["content-length"] then
-- Check we really got the announced content size
local content_length = tonumber(headers["content-length"])
if #content ~= content_length then
return false, "Incomplete content received"
end
end
logger.dbg("Returning content ok")
return true, content
end
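-- Illustrative call site (URL and timeouts are placeholders):
--[[
local ok, result = getUrlContent("https://example.com/feed.xml", 10, 30)
if ok then
logger.dbg("fetched", #result, "bytes")
else
logger.warn("fetch failed:", result) -- error string or socket code
end
]]--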
function EpubDownloadBackend:getResponseAsString(url)
logger.dbg("EpubDownloadBackend:getResponseAsString(", url, ")")
local success, content = getUrlContent(url)
if success then
return content
else
@@ -154,14 +148,38 @@ function EpubBuilder:getResponseAsString(url)
end
end
function EpubBuilder:setTrapWidget(trap_widget)
function EpubDownloadBackend:setTrapWidget(trap_widget)
self.trap_widget = trap_widget
end
function EpubBuilder:resetTrapWidget()
function EpubDownloadBackend:resetTrapWidget()
self.trap_widget = nil
end
function EpubDownloadBackend:loadPage(url)
local completed, success, content
if self.trap_widget then -- if previously set with EpubDownloadBackend:setTrapWidget()
local Trapper = require("ui/trapper")
local timeout, maxtime = 30, 60
-- We use dismissableRunInSubprocess with complex return values:
completed, success, content = Trapper:dismissableRunInSubprocess(function()
return getUrlContent(url, timeout, maxtime)
end, self.trap_widget)
if not completed then
error(self.dismissed_error_code) -- "Interrupted by user"
end
else
local timeout, maxtime = 10, 60
success, content = getUrlContent(url, timeout, maxtime)
end
logger.dbg("success:", success, "type(content):", type(content), "content:", content:sub(1, 500), "...")
if not success then
error(content)
else
return content
end
end
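-- Interruptible-fetch sketch; setTrapWidget makes loadPage dismissable by the
-- user (the info_widget here is a hypothetical InfoMessage instance):
--[[
local Trapper = require("ui/trapper")
Trapper:wrap(function()
EpubDownloadBackend:setTrapWidget(info_widget)
local html = EpubDownloadBackend:loadPage("https://example.com/article")
EpubDownloadBackend:resetTrapWidget()
end)
]]--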
local ext_to_mimetype = {
png = "image/png",
jpg = "image/jpeg",
@@ -177,15 +195,29 @@ local ext_to_mimetype = {
ttf = "application/truetype",
woff = "application/font-woff",
}
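-- Sketch of the lookup this table serves (the pattern is illustrative):
--[[
local ext = src:lower():match("%.(%a+)$")
local mimetype = ext and ext_to_mimetype[ext] -- may be nil for unknown extensions
]]--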
-- GetPublishableHtml
function EpubBuilder:getImagesAndHtml(html, url, include_images, filter_enable, filter_element)
-- Create an epub file (with possibly images)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
-- Use Trapper to display progress and ask questions through the UI.
-- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
-- Trapper:info() and Trapper:confirm() will just use logger.
local UI = require("ui/trapper")
-- We may need to build absolute urls for non-absolute links and images urls
local base_url = socket_url.parse(url)
local cancelled = false
local page_htmltitle = html:match([[<title>(.*)</title>]])
logger.dbg("page_htmltitle is ", page_htmltitle)
-- local sections = html.sections -- Wikipedia provided TOC
local bookid = "bookid_placeholder" --string.format("wikipedia_%s_%s_%s", lang, phtml.pageid, phtml.revid)
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
-- should it change if content is updated (as now, including the wikipedia revisionId),
-- or should it stay the same even if revid changes (content of the same book updated)?
if filter_enable then html = filter(html, filter_element) end
local images = {}
local seen_images = {}
local imagenum = 1
local cover_imgid = nil -- best candidate for cover among our images
html = filter_enable and filter(html, filter_element) or html
local processImg = function(img_tag)
local src = img_tag:match([[src="([^"]*)"]])
if src == nil or src == "" then
@@ -240,20 +272,13 @@ function EpubBuilder:getImagesAndHtml(html, url, include_images, filter_enable,
width = width,
height = height,
}
table.insert(images, cur_image)
seen_images[src] = cur_image
-- Use first image of reasonable size (not an icon) and portrait-like as cover-image
if not cover_imgid and width and width > 50 and height and height > 50 and height > width then
logger.dbg("Found a suitable cover image")
cover_imgid = imgid
cur_image["cover_image"] = true
end
table.insert(
images,
cur_image
)
imagenum = imagenum + 1
end
-- crengine will NOT use width and height attributes, but it will use
@@ -271,53 +296,130 @@ function EpubBuilder:getImagesAndHtml(html, url, include_images, filter_enable,
local style = table.concat(style_props, "; ")
return string.format([[<img src="%s" style="%s" alt=""/>]], cur_image.imgpath, style)
end
html = html:gsub("(<%s*img [^>]*>)", processImg)
logger.dbg("Images found in html:", images)
if include_images then
html = html:gsub("(<%s*img [^>]*>)", processImg)
else
-- See what to do with images
local use_img_2x = false
if not include_images then
-- Remove img tags to avoid little blank squares of missing images
html = html:gsub("<%s*img [^>]*>", "")
-- We could remove the whole image container <div class="thumb"...> ,
-- but it's a lot of nested <div> and not easy to do.
-- So the user will see the image legends and know a bit about
-- the images they chose to not get.
-- the images he chose to not get.
end
-- Force a GC to free the memory we used (the second call may help
-- reclaim more memory).
collectgarbage()
collectgarbage()
return images, html
end
UI:info(T(_("%1\n\nBuilding EPUB…"), message))
-- Open the zip file (with .tmp for now, as crengine may still
-- have a handle to the final epub_path, and we don't want to
-- delete a good one if we fail/cancel later)
local epub_path_tmp = epub_path .. ".tmp"
local ZipWriter = require("ffi/zipwriter")
local epub = ZipWriter:new{}
if not epub:open(epub_path_tmp) then
logger.dbg("Failed to open epub_path_tmp")
return false
end
function EpubBuilder:setTitle(title)
self.title = title
end
-- We now create and add all the required epub files
-- ----------------------------------------------------------------
-- /mimetype : always "application/epub+zip"
epub:add("mimetype", "application/epub+zip")
function EpubBuilder:addToc(chapters)
-- ----------------------------------------------------------------
-- /META-INF/container.xml : always the same content
epub:add("META-INF/container.xml", [[
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>]])
logger.dbg("Added META-INF/container.xml")
-- ----------------------------------------------------------------
-- OEBPS/content.opf : metadata + list of other files (paths relative to OEBPS/ directory)
-- Other possible items in this file that are of no interest to crengine :
-- In <manifest> :
-- <item id="cover" href="title.html" media-type="application/xhtml+xml"/>
-- <item id="cover-image" href="images/cover.png" media-type="image/png"/>
-- (crengine only uses <meta name="cover" content="cover-image" /> to get the cover image)
-- In <spine toc="ncx"> :
-- <itemref idref="cover" linear="no"/>
-- And a <guide> section :
-- <guide>
-- <reference href="title.html" type="cover" title="Cover"/>
-- <reference href="toc.html" type="toc" title="Table of Contents" href="toc.html" />
-- </guide>
local content_opf_parts = {}
-- head
local meta_cover = "<!-- no cover image -->"
if include_images and cover_imgid then
meta_cover = string.format([[<meta name="cover" content="%s"/>]], cover_imgid)
end
logger.dbg("meta_cover:", meta_cover)
table.insert(content_opf_parts, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
xmlns:dc="http://purl.org/dc/elements/1.1/"
unique-identifier="bookid" version="2.0">
<metadata>
<dc:title>%s</dc:title>
<dc:publisher>KOReader %s</dc:publisher>
%s
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="content" href="content.html" media-type="application/xhtml+xml"/>
<item id="css" href="stylesheet.css" media-type="text/css"/>
]], page_htmltitle, Version:getCurrentRevision(), meta_cover))
-- images files
if include_images then
for inum, img in ipairs(images) do
table.insert(content_opf_parts, string.format([[ <item id="%s" href="%s" media-type="%s"/>%s]], img.imgid, img.imgpath, img.mimetype, "\n"))
end
end
-- tail
table.insert(content_opf_parts, [[
</manifest>
<spine toc="ncx">
<itemref idref="content"/>
</spine>
</package>
]])
epub:add("OEBPS/content.opf", table.concat(content_opf_parts))
logger.dbg("Added OEBPS/content.opf")
-- ----------------------------------------------------------------
-- OEBPS/stylesheet.css
--- @todo We told it we'd include a stylesheet.css, so it's probably best
-- that we do. In theory, we could try to fetch any *.css files linked in
-- the main html.
epub:add("OEBPS/stylesheet.css", [[
/* Empty */
]])
logger.dbg("Added OEBPS/stylesheet.css")
-- ----------------------------------------------------------------
-- OEBPS/toc.ncx : table of content
local toc_ncx_parts = {}
local depth = 0
local num = 0
for index, chapter in ipairs(chapters) do
-- Add nav part for each chapter.
table.insert(
toc_ncx_parts,
string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="%s.html"/></navPoint>]],
num,
num,
chapter.title,
chapter.md5
)
)
num = num + 1
local cur_level = 0
local np_end = [[</navPoint>]]
local num = 1
-- Add our own first section for first page, with page name as title
table.insert(toc_ncx_parts, string.format([[<navPoint id="navpoint-%s" playOrder="%s"><navLabel><text>%s</text></navLabel><content src="content.html"/>]], num, num, page_htmltitle))
table.insert(toc_ncx_parts, np_end)
--- @todo Not essential for most articles, but longer articles might benefit
-- from parsing <h*> tags and constructing a proper TOC
while cur_level > 0 do
table.insert(toc_ncx_parts, np_end)
cur_level = cur_level - 1
end
-- Prepend NCX head.
table.insert(
toc_ncx_parts,
1,
string.format([[
-- Prepend NCX head
table.insert(toc_ncx_parts, 1, string.format([[
<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
@@ -331,172 +433,99 @@ function EpubBuilder:addToc(chapters)
<text>%s</text>
</docTitle>
<navMap>
]],
"placeholder_bookid",
depth,
self.title
)
)
-- Append NCX tail.
table.insert(
toc_ncx_parts,
[[
]], bookid, depth, page_htmltitle))
-- Append NCX tail
table.insert(toc_ncx_parts, [[
</navMap>
</ncx>
]]
)
self.ncx_toc = toc_ncx_parts
end
]])
epub:add("OEBPS/toc.ncx", table.concat(toc_ncx_parts))
logger.dbg("Added OEBPS/toc.ncx")
function EpubBuilder:addManifest(chapters, images)
local content_opf_parts = {}
local spine_parts = {}
local meta_cover = "<!-- no cover image -->"
-- ----------------------------------------------------------------
-- OEBPS/content.html
epub:add("OEBPS/content.html", html)
logger.dbg("Added OEBPS/content.html")
if #images > 0 then
for inum, image in ipairs(images) do
table.insert(
content_opf_parts,
string.format([[<item id="%s" href="%s" media-type="%s"/>%s]],
image.imgid,
image.imgpath,
image.mimetype,
"\n"
)
)
-- See if the image has the tag we previously set indicating
-- it can be used as a cover image.
if image.cover_image then
meta_cover = string.format([[<meta name="cover" content="%s"/>]], image.imgid)
-- Force a GC to free the memory we used till now (the second call may
-- help reclaim more memory).
collectgarbage()
collectgarbage()
-- ----------------------------------------------------------------
-- OEBPS/images/*
if include_images then
local nb_images = #images
for inum, img in ipairs(images) do
-- Process can be interrupted at this point between each image download
-- by tapping while the InfoMessage is displayed
-- We use the fast_refresh option from image #2 for a quicker download
local go_on = UI:info(T(_("%1\n\nRetrieving image %2 / %3 …"), message, inum, nb_images), inum >= 2)
if not go_on then
logger.dbg("cancelled")
cancelled = true
break
end
local src = img.src
if use_img_2x and img.src2x then
src = img.src2x
end
logger.dbg("Getting img ", src)
local success, content = getUrlContent(src)
-- success, content = getUrlContent(src..".unexistant") -- to simulate failure
if success then
logger.dbg("success, size:", #content)
else
logger.dbg("failed fetching:", src)
end
if success then
-- Images do not need to be compressed, so spare some cpu cycles
local no_compression = true
if img.mimetype == "image/svg+xml" then -- except for SVG images (which are XML text)
no_compression = false
end
epub:add("OEBPS/"..img.imgpath, content, no_compression)
logger.dbg("Adding OEBPS/"..img.imgpath)
else
go_on = UI:confirm(T(_("Downloading image %1 failed. Continue anyway?"), inum), _("Stop"), _("Continue"))
if not go_on then
cancelled = true
break
end
end
end
end
if #chapters > 0 then
for index, chapter in ipairs(chapters) do
table.insert(
content_opf_parts,
string.format([[<item id="%s" href="%s.html" media-type="application/xhtml+xml"/>%s]],
chapter.md5,
chapter.md5,
"\n"
)
)
table.insert(
spine_parts,
string.format([[<itemref idref="%s"/>%s]],
chapter.md5,
"\n"
)
)
-- Done with adding files
if cancelled then
if UI:confirm(_("Download did not complete.\nDo you want to create an EPUB with the already downloaded images?"), _("Don't create"), _("Create")) then
cancelled = false
end
end
logger.dbg("meta_cover:", meta_cover)
table.insert(
content_opf_parts,
1,
string.format([[<?xml version='1.0' encoding='utf-8'?>
<package xmlns="http://www.idpf.org/2007/opf"
xmlns:dc="http://purl.org/dc/elements/1.1/"
unique-identifier="bookid" version="2.0">
<metadata>
<dc:title>%s</dc:title>
<dc:publisher>KOReader %s</dc:publisher>
%s
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
]], self.title, Version:getCurrentRevision(), meta_cover)
)
-- tail
table.insert(
content_opf_parts,
string.format([[
</manifest>
<spine toc="ncx">
%s
</spine>
</package>
]], table.concat(spine_parts)
)
)
self.ncx_manifest = content_opf_parts
end
function EpubBuilder:addContents(chapters)
local contents = {}
for index, chapter in ipairs(chapters) do
table.insert(
contents,
{
filename = chapter.md5 .. ".html",
html = chapter.html,
}
)
if cancelled then
UI:info(_("Canceled. Cleaning up…"))
else
UI:info(T(_("%1\n\nPacking EPUB…"), message))
end
epub:close()
self.ncx_contents = contents
end
function EpubBuilder:addImages(images)
local images_table = {}
for index, image in ipairs(images) do
if not image.src then
return
end
local src = image.src
local success, content = NewsHelpers:getUrlContent(src)
-- success, content = NewsHelpers:getUrlContent(src..".unexistant") -- to simulate failure
if success then
logger.dbg("EpubBuilder:addImages = success, size:", #content)
else
logger.dbg("EpubBuilder:addImages = failure fetching:", src)
end
if success then
-- Images do not need to be compressed, so spare some cpu cycles
local no_compression = true
if image.mimetype == "image/svg+xml" then -- except for SVG images (which are XML text)
no_compression = false
end
table.insert(
images_table,
{
path = image.imgpath,
content = content,
no_compression = no_compression
}
)
if cancelled then
-- Build was cancelled, remove half created .epub
if lfs.attributes(epub_path_tmp, "mode") == "file" then
os.remove(epub_path_tmp)
end
return false
end
self.ncx_images = images_table
-- Finally move the .tmp to the final file
os.rename(epub_path_tmp, epub_path)
logger.dbg("successfully created:", epub_path)
-- Force a GC to free the memory we used (the second call may help
-- reclaim more memory).
collectgarbage()
collectgarbage()
return true
end
-- There can be multiple links.
-- For now we just assume the first link is probably the right one.
--- @todo Write unit tests.
-- Some feeds that can be used for unit test.
-- http://fransdejonge.com/feed/ for multiple links.
-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes.
function EpubBuilder:getFeedLink(possible_link)
local E = {}
logger.dbg("Possible link", possible_link)
if type(possible_link) == "string" then
return possible_link
elseif (possible_link._attr or E).href then
return possible_link._attr.href
elseif ((possible_link[1] or E)._attr or E).href then
return possible_link[1]._attr.href
end
end
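-- Illustrative inputs, shaped as LuaXML typically parses them:
--[[
getFeedLink("https://example.com/item") -- returns the string itself
getFeedLink({ _attr = { href = "https://example.com/a" } }) -- returns the href
getFeedLink({ { _attr = { href = "https://example.com/b" } } }) -- first link's href
]]--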
return EpubBuilder
return EpubDownloadBackend

@@ -1,409 +0,0 @@
local BD = require("ui/bidi")
local DownloadBackend = require("epubdownloadbackend")
local NewsHelpers = require("http_utilities")
local dateparser = require("lib.dateparser")
local logger = require("logger")
local md5 = require("ffi/sha2").md5
local util = require("util")
local _ = require("gettext")
local N_ = _.ngettext
local FFIUtil = require("ffi/util")
local T = FFIUtil.template
local FeedSource = {
file_extension = ".epub"
}
function FeedSource:new(o)
o = o or {}
self.__index = self
setmetatable(o, self)
return o
end
function FeedSource:getInitializedFeeds(feed_list, progress_callback, error_callback)
local initialized_feeds = {}
local unsupported_feeds_urls = {}
for idx, feed in ipairs(feed_list) do
local url = feed[1]
-- Show a UI update
progress_callback(T(
_("Setting up feed %1 of %2."),
idx,
#feed_list
))
-- Initialize the feed
local ok, response = pcall(function()
return self:initializeDocument(
self:fetchDocumentByUrl(url)
)
end)
-- If the initialization worked, add the feed
-- to a list of initialized feeds
if ok and response then
table.insert(initialized_feeds, {
config = feed,
document = response,
})
else
table.insert(unsupported_feeds_urls, {
url .. ": " .. response
})
end
end
if #unsupported_feeds_urls > 0 then
-- When some errors are present, we get a sour message that includes
-- information about the source of the error.
local unsupported_urls = ""
for key, value in pairs(unsupported_feeds_urls) do
-- Create the error message.
-- unsupported_urls = unsupported_urls .. " " .. value[1] .. " " .. value[2]
unsupported_urls = unsupported_urls .. value[1] .. "\n\n"
-- Not sure what this does.
if key ~= #unsupported_feeds_urls then
unsupported_urls = BD.url(unsupported_urls) .. ", "
end
end
error_callback(
T(N_("Could not initialize a feed:\n\n%2\n\nPlease review your feed configuration.", "Could not initialize %1 feeds:\n\n%2\n\nPlease review your feed configurations.", #unsupported_feeds_urls),
#unsupported_feeds_urls, unsupported_urls)
)
end
return initialized_feeds
end
-- This function contacts the feed website and attempts to get
-- the RSS/Atom document with a list of the latest items.
function FeedSource:fetchDocumentByUrl(url)
local document
-- Get the XML document representing the feed
local ok, response = pcall(function()
local success, content = NewsHelpers:getUrlContent(url)
if success then
return content
else
error("Failed to download content for url: " .. url, 0)
end
end)
-- Check to see if a response is available to deserialize.
if ok then
-- Deserialize the XML document into something Lua can use
document = NewsHelpers:deserializeXMLString(response)
end
-- Return the document or any errors that may have occurred
if document then
return document
else
if not ok then
error("(Reason: Failed to download feed document)", 0)
else
error("(Reason: Error during feed document deserialization)", 0)
end
end
end
-- Supply this method with the XML document returned by the feed,
-- and it will initialize the document by extracting the feed title,
-- feed items, and items count.
function FeedSource:initializeDocument(document)
local feed_title
local feed_items
local total_items
local ok = pcall(function()
return self:getFeedType(
document,
function()
-- RSS callback
feed_title = util.htmlEntitiesToUtf8(document.rss.channel.title)
feed_items = document.rss.channel.item
total_items = #document.rss.channel.item
end,
function()
-- Atom callback
feed_title = FeedSource:getFeedTitle(document.feed.title)
feed_items = document.feed.entry
total_items = #document.feed.entry
end,
function()
-- RDF callback
feed_title = util.htmlEntitiesToUtf8(document["rdf:RDF"].channel.title)
feed_items = document["rdf:RDF"].item
total_items = #document["rdf:RDF"].item
end
)
end)
if ok then
document.title = feed_title
document.items = feed_items
document.total_items = total_items
return document
else
error(_("Could not initialize feed document"), 0)
end
end
function FeedSource:getItemsContent(feed, progress_callback, error_callback)
local limit = tonumber(feed.config.limit)
local total_items = (limit == 0) and
feed.document.total_items or
limit
local initialized_feed_items = {}
-- Download each item in the feed
for index, item in pairs(feed.document.items) do
-- If limit has been met, stop downloading feed.
if limit ~= 0 and index - 1 == limit then
break
end
-- Display feedback to user.
progress_callback(T(
_("%3\n Downloading item %1 of %2"),
index,
total_items,
feed.document.title
))
-- Download the article's HTML.
local ok, response = pcall(function()
return self:initializeItemHtml(
feed,
self:getItemHtml(
item,
feed.config.download_full_article
)
)
end)
-- Add the result to our table, or send a
-- result to the error callback.
if ok then
table.insert(initialized_feed_items, {
html = response.html,
images = response.images,
item_slug = FeedSource:getItemTitleWithDate(item),
item_title = item.title,
md5 = md5(item.title),
feed_title = feed.document.title,
})
else
error_callback(
T(_("Could not get content for: %1"), feed.document.title)
)
end
end
if #initialized_feed_items > 0 then
return initialized_feed_items
else
return nil
end
end
function FeedSource:initializeItemHtml(feed, html)
local url = feed.config[1]
-- local download_full_article = feed.config.download_full_article ~= false
local include_images = feed.config.include_images ~= false
local filter_element = feed.config.filter_element or
feed.config.filter_element == nil
local enable_filter = feed.config.enable_filter ~= false
local item_images, item_html = DownloadBackend:getImagesAndHtml(
html,
url,
include_images,
enable_filter,
filter_element
)
return {
html = item_html,
images = item_images
}
end
function FeedSource:getFeedType(document, rss_cb, atom_cb, rdf_cb)
-- Check to see if the feed uses RSS.
local is_rss = document.rss and
document.rss.channel and
document.rss.channel.title and
document.rss.channel.item and
document.rss.channel.item[1] and
document.rss.channel.item[1].title and
document.rss.channel.item[1].link
-- Check to see if the feed uses Atom.
local is_atom = document.feed and
document.feed.title and
document.feed.entry[1] and
document.feed.entry[1].title and
document.feed.entry[1].link
-- Setup the feed values based on feed type
local is_rdf = document["rdf:RDF"] and
document["rdf:RDF"].channel and
document["rdf:RDF"].channel.title
if is_atom then
return atom_cb()
elseif is_rss then
return rss_cb()
elseif is_rdf then
return rdf_cb()
end
-- If we get here, the document matched none of the supported
-- formats, so raise an error naming the closest candidate.
local error_message
if document.feed then
error_message = _("(Reason: Couldn't process Atom)")
else
error_message = _("(Reason: Couldn't process RSS)")
end
error(error_message)
end
function FeedSource:getItemHtml(item, download_full_article)
if download_full_article then
return NewsHelpers:loadPage(
FeedSource:getFeedLink(item.link)
)
else
local feed_description = item.description or item.summary
local footer = _("This is just a description of the feed. To download the full article instead, go to the News Downloader settings and change 'download_full_article' to 'true'.")
return string.format([[<!DOCTYPE html>
<html>
<head><meta charset='UTF-8'><title>%s</title></head>
<body><header><h2>%s</h2></header><article>%s</article>
<br><footer><small>%s</small></footer>
</body>
</html>]], item.title, item.title, feed_description, footer)
end
end
-- @todo: move this elsewhere
function FeedSource:getEpubOutputDir(download_dir, sub_dir, epub_title)
local feed_output_dir = ("%s%s/"):format(
download_dir,
util.getSafeFilename(util.htmlEntitiesToUtf8(sub_dir)))
-- Create the output directory if it doesn't exist.
if not lfs.attributes(feed_output_dir, "mode") then
lfs.mkdir(feed_output_dir)
end
local file_name = FeedSource:getFeedTitle(epub_title)
return ("%s%s%s"):format(
feed_output_dir,
file_name,
self.file_extension
)
end
function FeedSource:createEpub(title, chapters, abs_output_path, progress_callback, error_callback)
local file_exists = lfs.attributes(abs_output_path, "mode")
if file_exists then
logger.dbg("NewsDownloader: Skipping. EPUB file already exists", abs_output_path)
return true
end
if #chapters == 0 then
error(_("Error: chapters contains 0 items"), 0)
end
local images = {}
for index, chapter in ipairs(chapters) do
for jndex, image in ipairs(chapter.images) do
table.insert(
images,
image
)
end
end
local epub = DownloadBackend:new{}
progress_callback(T(_("Building EPUB %1"), title))
epub:setTitle(title)
epub:addToc(chapters)
epub:addManifest(chapters, images)
progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding contents")))
epub:addContents(chapters)
progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding images")))
epub:addImages(images)
progress_callback(T(_("Building EPUB %1: %2"), title, _("Writing EPUB to disk")))
local ok = pcall(function()
return epub:build(abs_output_path)
end)
if ok then
if lfs.attributes(abs_output_path, "mode") then
return true
end
end
return false
end
local function parseDate(dateTime)
-- Uses lua-feedparser https://github.com/slact/lua-feedparser
-- feedparser is available under the (new) BSD license.
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENCE_lua-feedparser
local date = dateparser.parse(dateTime)
return os.date("%y-%m-%d_%H-%M_", date)
end
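-- Example (input date is illustrative; output is rendered in local time):
--[[
parseDate("Tue, 20 Sep 2022 12:00:00 GMT") -- e.g. "22-09-20_12-00_"
]]--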
function FeedSource:getFeedTitleWithDate(feed)
local title = util.getSafeFilename(FeedSource:getFeedTitle(feed.document.title))
return os.date("%y-%m-%d_%H-%M_") .. title
end
-- Creates a title with date from a feed item.
function FeedSource:getItemTitleWithDate(item)
local title = util.getSafeFilename(FeedSource:getFeedTitle(item.title))
if item.updated then
title = parseDate(item.updated) .. title
elseif item.pubDate then
title = parseDate(item.pubDate) .. title
elseif item.published then
title = parseDate(item.published) .. title
end
return title
end
-- If a title looks like <title>blabla</title> it'll just be feed.title.
-- If a title looks like <title attr="alb">blabla</title> then we get a table
-- where [1] is the title string and the attributes are also available.
function FeedSource:getFeedTitle(possible_title)
if type(possible_title) == "string" then
return util.htmlEntitiesToUtf8(possible_title)
elseif possible_title[1] and type(possible_title[1]) == "string" then
return util.htmlEntitiesToUtf8(possible_title[1])
end
end
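-- Both title shapes this handles (values are illustrative):
--[[
FeedSource:getFeedTitle("News &amp; Views") -- "News & Views"
FeedSource:getFeedTitle({ "News &amp; Views", _attr = { type = "text" } }) -- "News & Views"
]]--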
-- There can be multiple links.
-- For now we just assume the first link is probably the right one.
--- @todo Write unit tests.
-- Some feeds that can be used for unit test.
-- http://fransdejonge.com/feed/ for multiple links.
-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes.
function FeedSource:getFeedLink(possible_link)
local E = {}
if type(possible_link) == "string" then
return possible_link
elseif (possible_link._attr or E).href then
return possible_link._attr.href
elseif ((possible_link[1] or E)._attr or E).href then
return possible_link[1]._attr.href
end
end
return FeedSource

@@ -7,10 +7,7 @@ local FeedView = {
DOWNLOAD_FULL_ARTICLE = "download_full_article",
INCLUDE_IMAGES = "include_images",
ENABLE_FILTER = "enable_filter",
FILTER_ELEMENT = "filter_element",
VOLUMIZE = "volumize",
ACTION_RESET_HISTORY = "reset_history",
ACTION_DELETE_FEED = "delete_feed",
FILTER_ELEMENT = "filter_element"
}
function FeedView:getList(feed_config, callback, edit_feed_attribute_callback, delete_feed_callback)
@@ -52,7 +49,7 @@ function FeedView:getList(feed_config, callback, edit_feed_attribute_callback, d
return view_content
end
function FeedView:getItem(id, feed, edit_feed_callback, feed_action_callback)
function FeedView:getItem(id, feed, edit_feed_callback, delete_feed_callback)
logger.dbg("NewsDownloader:", feed)
@@ -70,7 +67,6 @@ function FeedView:getItem(id, feed, edit_feed_callback, feed_action_callback)
local include_images = feed.include_images ~= false
local enable_filter = feed.enable_filter ~= false
local filter_element = feed.filter_element
local volumize = feed.volumize ~= false
local vc = {
{
@@ -140,22 +136,11 @@ function FeedView:getItem(id, feed, edit_feed_callback, feed_action_callback)
)
end
},
{
_("Volumize feed"),
volumize,
callback = function()
edit_feed_callback(
id,
FeedView.VOLUMIZE,
volumize
)
end
},
}
-- These actions only pertain to initiated feeds, so we don't always
-- display them.
if feed_action_callback then
-- We don't always display this. For instance: if a feed
-- is being created, this button is not necessary.
if delete_feed_callback then
table.insert(
vc,
"---"
@@ -166,22 +151,8 @@ function FeedView:getItem(id, feed, edit_feed_callback, feed_action_callback)
_("Delete feed"),
"",
callback = function()
feed_action_callback(
id,
FeedView.ACTION_DELETE_FEED
)
end
}
)
table.insert(
vc,
{
_("Reset feed history"),
"",
callback = function()
feed_action_callback(
url,
FeedView.ACTION_RESET_HISTORY
delete_feed_callback(
id
)
end
}

@@ -1,126 +0,0 @@
local logger = require("logger")
local http = require("socket.http")
local socketutil = require("socketutil")
local socket_url = require("socket.url")
local socket = require("socket")
local ltn12 = require("ltn12")
local NewsHelpers = {
}
local max_redirects = 5 -- prevent infinite redirects
-- Get URL content
function NewsHelpers:getUrlContent(url, timeout, maxtime, redirectCount)
logger.dbg("getUrlContent(", url, ",", timeout, ",", maxtime, ",", redirectCount, ")")
if not redirectCount then
redirectCount = 0
elseif redirectCount == max_redirects then
error("EpubDownloadBackend: reached max redirects: ", redirectCount)
end
if not timeout then timeout = 10 end
logger.dbg("timeout:", timeout)
local sink = {}
local parsed = socket_url.parse(url)
socketutil:set_timeout(timeout, maxtime or 30)
local request = {
url = url,
method = "GET",
sink = maxtime and socketutil.table_sink(sink) or ltn12.sink.table(sink),
}
logger.dbg("request:", request)
local code, headers, status = socket.skip(1, http.request(request))
socketutil:reset_timeout()
logger.dbg("After http.request")
local content = table.concat(sink) -- empty or content accumulated till now
logger.dbg("type(code):", type(code))
logger.dbg("code:", code)
logger.dbg("headers:", headers)
logger.dbg("status:", status)
logger.dbg("#content:", #content)
if code == socketutil.TIMEOUT_CODE or
code == socketutil.SSL_HANDSHAKE_CODE or
code == socketutil.SINK_TIMEOUT_CODE
then
logger.warn("request interrupted:", code)
return false, code
end
if headers == nil then
logger.warn("No HTTP headers:", code, status)
return false, "Network or remote server unavailable"
end
if not code or string.sub(code, 1, 1) ~= "2" then -- all 200..299 HTTP codes are OK
if code and code > 299 and code < 400 and headers and headers.location then -- handle 301, 302...
local redirected_url = headers.location
local parsed_redirect_location = socket_url.parse(redirected_url)
if not parsed_redirect_location.host then
parsed_redirect_location.host = parsed.host
parsed_redirect_location.scheme = parsed.scheme
redirected_url = socket_url.build(parsed_redirect_location)
end
logger.dbg("getUrlContent: Redirecting to url: ", redirected_url)
return self:getUrlContent(redirected_url, timeout, maxtime, redirectCount + 1)
else
-- error("EpubDownloadBackend: Don't know how to handle HTTP response status: " .. status)
-- error("EpubDownloadBackend: Don't know how to handle HTTP response status.")
logger.warn("HTTP status not okay:", code, status)
return false, status
end
end
if headers and headers["content-length"] then
-- Check we really got the announced content size
local content_length = tonumber(headers["content-length"])
if #content ~= content_length then
return false, "Incomplete content received"
end
end
logger.dbg("Returning content ok")
return true, content
end
function NewsHelpers:loadPage(url)
logger.dbg("Load page: ", url)
local success, content
--[[ if self.trap_widget then -- if previously set with EpubDownloadBackend:setTrapWidget()
local Trapper = require("ui/trapper")
local timeout, maxtime = 30, 60
-- We use dismissableRunInSubprocess with complex return values:
completed, success, content = Trapper:dismissableRunInSubprocess(function()
return NewsHelpers:getUrlContent(url, timeout, maxtime)
end, self.trap_widget)
if not completed then
error(self.dismissed_error_code) -- "Interrupted by user"
end
else]]--
local timeout, maxtime = 10, 60
success, content = NewsHelpers:getUrlContent(url, timeout, maxtime)
-- end
logger.dbg("success:", success, "type(content):", type(content), "content:", content:sub(1, 500), "...")
if not success then
error(content)
else
return content
end
end
function NewsHelpers:deserializeXMLString(xml_str)
-- uses LuaXML https://github.com/manoelcampos/LuaXML
-- The MIT License (MIT)
-- Copyright (c) 2016 Manoel Campos da Silva Filho
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENSE_LuaXML
local treehdl = require("lib/handler")
local libxml = require("lib/xml")
-- Instantiate the object that parses the XML file as a Lua table.
local xmlhandler = treehdl.simpleTreeHandler()
-- Instantiate the object that parses the XML to a Lua table.
local ok = pcall(function()
libxml.xmlParser(xmlhandler):parse(xml_str)
end)
if not ok then return end
return xmlhandler.root
end
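-- Usage sketch (the XML snippet is illustrative):
--[[
local doc = NewsHelpers:deserializeXMLString("<rss><channel><title>T</title></channel></rss>")
if doc then
logger.dbg(doc.rss.channel.title) -- "T"
end
]]--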
return NewsHelpers

@@ -1,9 +1,11 @@
local BD = require("ui/bidi")
local DataStorage = require("datastorage")
--local DownloadBackend = require("internaldownloadbackend")
--local DownloadBackend = require("luahttpdownloadbackend")
local DownloadBackend = require("epubdownloadbackend")
local ReadHistory = require("readhistory")
local FFIUtil = require("ffi/util")
local FeedView = require("feed_view")
local FeedSource = require("feed_source")
local InfoMessage = require("ui/widget/infomessage")
local LuaSettings = require("frontend/luasettings")
local UIManager = require("ui/uimanager")
@@ -13,6 +15,7 @@ local MultiConfirmBox = require("ui/widget/multiconfirmbox")
local NetworkMgr = require("ui/network/manager")
local Persist = require("persist")
local WidgetContainer = require("ui/widget/container/widgetcontainer")
local dateparser = require("lib.dateparser")
local logger = require("logger")
local util = require("util")
local _ = require("gettext")
@@ -24,11 +27,10 @@ local NewsDownloader = WidgetContainer:new{
feed_config_file = "feed_config.lua",
feed_config_path = nil,
news_config_file = "news_settings.lua",
news_history_file = "news_history.lua",
settings = nil,
history = nil,
download_dir_name = "news",
download_dir = nil,
file_extension = ".epub",
config_key_custom_dl_dir = "custom_dl_dir",
empty_feed = {
[1] = "https://",
@@ -36,12 +38,46 @@ local NewsDownloader = WidgetContainer:new{
download_full_article = true,
include_images = true,
enable_filter = false,
filter_element = "",
volumize = false
filter_element = ""
},
kv = {}
}
local FEED_TYPE_RSS = "rss"
local FEED_TYPE_ATOM = "atom"
--local initialized = false
--local feed_config_file_name = "feed_config.lua"
--local news_downloader_config_file = "news_downloader_settings.lua"
-- If a title looks like <title>blabla</title> it'll just be feed.title.
-- If a title looks like <title attr="alb">blabla</title> then we get a table
-- where [1] is the title string and the attributes are also available.
local function getFeedTitle(possible_title)
if type(possible_title) == "string" then
return util.htmlEntitiesToUtf8(possible_title)
elseif possible_title[1] and type(possible_title[1]) == "string" then
return util.htmlEntitiesToUtf8(possible_title[1])
end
end
-- There can be multiple links.
-- For now we just assume the first link is probably the right one.
--- @todo Write unit tests.
-- Some feeds that can be used for unit test.
-- http://fransdejonge.com/feed/ for multiple links.
-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes.
local function getFeedLink(possible_link)
local E = {}
if type(possible_link) == "string" then
return possible_link
elseif (possible_link._attr or E).href then
return possible_link._attr.href
elseif ((possible_link[1] or E)._attr or E).href then
return possible_link[1]._attr.href
end
end
function NewsDownloader:init()
self.ui.menu:registerToMainMenu(self)
end
@@ -69,39 +105,7 @@ function NewsDownloader:getSubMenuItems()
text = _("Sync news feeds"),
keep_menu_open = true,
callback = function(touchmenu_instance)
NetworkMgr:runWhenOnline(
function() self:syncAllFeedsWithUI(
touchmenu_instance,
function(feed_message)
-- Callback to fire after sync is finished
local UI = require("ui/trapper")
-- This callback is called after the
-- processing is complete.
--
-- Clear the info widgets before displaying the next ui widget.
-- UI:clear()
-- Ask the user if they want to go to their downloads folder
-- or if they'd rather remain at the menu.
feed_message = feed_message .. " " .. _("Go to downloads folder?")
local should_go_to_downloads = UI:confirm(
feed_message,
_("Close"),
_("Go to downloads")
)
if should_go_to_downloads then
-- Go to downloads folder.
UI:clear()
self:openDownloadsFolder()
touchmenu_instance:closeMenu()
NetworkMgr:afterWifiAction()
return
else
-- Return to the menu.
NetworkMgr:afterWifiAction()
return
end
end
) end)
NetworkMgr:runWhenOnline(function() self:loadConfigAndProcessFeedsWithUI(touchmenu_instance) end)
end,
},
{
@@ -123,6 +127,17 @@ function NewsDownloader:getSubMenuItems()
keep_menu_open = true,
callback = function() self:setCustomDownloadDirectory() end,
},
{
text = _("Never download images"),
keep_menu_open = true,
checked_func = function()
return self.settings:isTrue("never_download_images")
end,
callback = function()
self.settings:toggle("never_download_images")
self.settings:flush()
end,
},
{
text = _("Delete all downloaded items"),
keep_menu_open = true,
@@ -136,9 +151,6 @@ function NewsDownloader:getSubMenuItems()
)
if should_delete then
self:removeNewsButKeepFeedConfig()
-- Move user to the downloads folder to avoid an error where they
-- are within a feed folder which we have just deleted.
self:openDownloadsFolder()
Trapper:reset()
else
Trapper:reset()
@@ -161,7 +173,7 @@ function NewsDownloader:getSubMenuItems()
}
return sub_item_table
end
-- lazyInitialization sets up our variables to point to the
-- lazyInitialization sets up variables that point to the
-- Downloads folder and the feeds configuration file.
function NewsDownloader:lazyInitialization()
if not self.initialized then
@@ -176,8 +188,6 @@ function NewsDownloader:lazyInitialization()
DataStorage:getFullDataDir(),
self.download_dir_name)
end
logger.dbg("NewsDownloader: initializing download history")
self.history = LuaSettings:open(("%s/%s"):format(DataStorage:getSettingsDir(), self.news_history_file))
logger.dbg("NewsDownloader: Custom directory set to:", self.download_dir)
-- If the directory doesn't exist we will create it.
if not lfs.attributes(self.download_dir, "mode") then
@@ -195,16 +205,15 @@ function NewsDownloader:lazyInitialization()
self.initialized = true
end
end
-- This function loads the config file. If the config is not
-- available, it prompts the user about how to handle that.
function NewsDownloader:loadConfig()
function NewsDownloader:loadConfigAndProcessFeeds(touchmenu_instance)
local UI = require("ui/trapper")
logger.dbg("force repaint due to upcoming blocking calls")
-- Check if the feed config file exists
local ok, feed_config = pcall(dofile, self.feed_config_path)
if not ok or not feed_config then
UI:info(T(_("Invalid configuration file. Detailed error message:\n%1"), feed_config))
return false
return
end
-- If the file contains no table elements, then the user hasn't set any feeds.
if #feed_config <= 0 then
@@ -228,198 +237,344 @@ function NewsDownloader:loadConfig()
feed_item_vc
)
end
return false
return
end
local never_download_images = self.settings:isTrue("never_download_images")
local unsupported_feeds_urls = {}
local total_feed_entries = #feed_config
local feed_message
for idx, feed in ipairs(feed_config) do
local url = feed[1]
local limit = feed.limit
local download_full_article = feed.download_full_article == nil or feed.download_full_article
local include_images = not never_download_images and feed.include_images
local enable_filter = feed.enable_filter or feed.enable_filter == nil
local filter_element = feed.filter_element or feed.filter_element == nil
-- Check if the two required attributes are set.
if url and limit then
feed_message = T(_("Processing %1/%2:\n%3"), idx, total_feed_entries, BD.url(url))
UI:info(feed_message)
-- Process the feed source.
self:processFeedSource(
url,
tonumber(limit),
unsupported_feeds_urls,
download_full_article,
include_images,
feed_message,
enable_filter,
filter_element)
else
logger.warn("NewsDownloader: invalid feed config entry.", feed)
end
end
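-- For reference, a feed_config.lua entry supplying the fields read above
-- might look like this (URL and selector are placeholders):
--[[
return {
{
"https://example.com/feed.xml",
limit = 5,
download_full_article = true,
include_images = true,
enable_filter = false,
filter_element = "div#content",
},
}
]]--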
if #unsupported_feeds_urls <= 0 then
-- When no errors are present, we get a happy message.
feed_message = _("Downloading news finished.")
else
-- When some errors are present, we get a sour message that includes
-- information about the source of the error.
local unsupported_urls = ""
for key, value in pairs(unsupported_feeds_urls) do
-- Create the error message.
unsupported_urls = unsupported_urls .. " " .. value[1] .. " " .. value[2]
-- Not sure what this does.
if key ~= #unsupported_feeds_urls then
unsupported_urls = BD.url(unsupported_urls) .. ", "
end
end
-- Tell the user there were problems.
feed_message = _("Downloading news finished with errors.")
-- Display a dialogue that requires the user to acknowledge
-- that errors occurred.
UI:confirm(
T(_([[
Could not process some feeds.
Unsupported format in: %1. Please
review your feed configuration file.]])
, unsupported_urls),
_("Continue"),
""
)
end
-- Clear the info widgets before displaying the next ui widget.
UI:clear()
-- Check to see if this method was called from the menu. If it was,
-- we will have gotten a touchmenu_instance. That context gives the user
-- two options about what to do next, which are handled by this block.
if touchmenu_instance then
-- Ask the user if they want to go to their downloads folder
-- or if they'd rather remain at the menu.
feed_message = feed_message .. " " .. _("Go to download folder?")
local should_go_to_downloads = UI:confirm(
feed_message,
_("Close"),
_("Go to downloads")
)
if should_go_to_downloads then
-- Go to downloads folder.
UI:clear()
self:openDownloadsFolder()
touchmenu_instance:closeMenu()
NetworkMgr:afterWifiAction()
return
else
-- Return to the menu.
NetworkMgr:afterWifiAction()
return
end
end
-- If we made it this far, then the feed config is valid
-- and the next step is to process its contents
return feed_config
return
end
function NewsDownloader:syncAllFeedsWithUI(touchmenu_instance, callback)
function NewsDownloader:loadConfigAndProcessFeedsWithUI(touchmenu_instance)
local Trapper = require("ui/trapper")
Trapper:wrap(function()
local UI = require("ui/trapper")
-- Get the config
local config = self:loadConfig()
local sync_errors = {}
-- Get the HTML for the feeds
local feedSource = FeedSource:new{}
-- Get the initialized feeds list
local initialized_feeds = feedSource:getInitializedFeeds(
config,
function(progress_message)
-- This callback relays updates to the UI
UI:info(progress_message)
end,
function(error_message)
table.insert(
sync_errors,
error_message
)
end
)
-- In this block, each feed item will be its own
-- epub complete with title and chapters
local epubs_to_make = {}
local epubs_successfully_created = {}
local feed_history = {}
for feed_index, feed in pairs(initialized_feeds) do
-- Go through each feed and make new entry
local items_content = feedSource:getItemsContent(
feed,
function(progress_message)
UI:info(progress_message)
end,
function(error_message)
table.insert(
sync_errors,
error_message
)
end
)
local volumize = feed.config.volumize ~= false
local chapters = {}
local feed_title = feedSource:getFeedTitleWithDate(feed)
local feed_id = feed.config[1] -- The url.
local sub_dir = feedSource:getFeedTitle(feed.document.title)
local item_history = {}
for content_index, content in pairs(items_content) do
-- Check to see if we've already downloaded this item.
local history_for_feed = self.history:child(feed_id)
self:loadConfigAndProcessFeeds(touchmenu_instance)
end)
end
if history_for_feed:has(content.md5) then
logger.dbg("NewsDownloader: ", "Item already downloaded")
UI:info(_("Skipping downloaded item"))
else
local abs_path = feedSource:getEpubOutputDir(
self.download_dir,
sub_dir,
content.item_title
)
-- Not sure the slug returned is what we want.
-- Should be something like 2022_09_20-ArticleTitle
table.insert(
chapters,
{
title = content.item_title,
slug = content.item_slug,
md5 = content.md5,
html = content.html,
images = content.images
}
)
if not volumize then
-- We're not volumizing, so each chapter
-- will be its own epub.
table.insert(
epubs_to_make,
{
title = content.item_title,
chapters = chapters,
abs_path = abs_path,
id = feed_id,
}
)
-- Reset the chapters list.
chapters = {}
end
table.insert(
item_history,
content.md5
)
end
end
-- We're volumizing, so all of the chapters we collected
-- get added to a single epub.
if volumize and #chapters > 0 then
local abs_path = feedSource:getEpubOutputDir(
self.download_dir,
sub_dir,
feed_title
)
function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element)
local ok, response = pcall(function()
return DownloadBackend:getResponseAsString(url)
end)
local feeds
-- Check to see if a response is available to deserialize.
if ok then
feeds = self:deserializeXMLString(response)
end
-- If the response is not available (for a reason that we don't know),
-- add the URL to the unsupported feeds list.
if not ok or not feeds then
local error_message
if not ok then
error_message = _("(Reason: Failed to download content)")
else
error_message = _("(Reason: Error during feed deserialization)")
end
table.insert(
unsupported_feeds_urls,
{
url,
error_message
}
)
return
end
-- Check to see if the feed uses RSS.
local is_rss = feeds.rss
and feeds.rss.channel
and feeds.rss.channel.title
and feeds.rss.channel.item
and feeds.rss.channel.item[1]
and feeds.rss.channel.item[1].title
and feeds.rss.channel.item[1].link
-- Check to see if the feed uses Atom.
local is_atom = feeds.feed
and feeds.feed.title
and feeds.feed.entry[1]
and feeds.feed.entry[1].title
and feeds.feed.entry[1].link
-- Process the feeds accordingly.
if is_atom then
ok = pcall(function()
return self:processFeed(
FEED_TYPE_ATOM,
feeds,
limit,
download_full_article,
include_images,
message,
enable_filter,
filter_element
)
end)
elseif is_rss then
ok = pcall(function()
return self:processFeed(
FEED_TYPE_RSS,
feeds,
limit,
download_full_article,
include_images,
message,
enable_filter,
filter_element
)
end)
end
-- If the feed can't be processed, or it is neither
-- Atom or RSS, then add it to the unsupported feeds list
-- and return an error message.
if not ok or (not is_rss and not is_atom) then
local error_message
if not ok then
error_message = _("(Reason: Failed to download content)")
elseif not is_rss then
error_message = _("(Reason: Couldn't process RSS)")
elseif not is_atom then
error_message = _("(Reason: Couldn't process Atom)")
end
table.insert(
unsupported_feeds_urls,
{
url,
error_message
}
)
end
end
table.insert(
epubs_to_make,
{
title = feed_title,
chapters = chapters,
abs_path = abs_path,
id = feed_id,
}
)
end
function NewsDownloader:deserializeXMLString(xml_str)
-- uses LuaXML https://github.com/manoelcampos/LuaXML
-- The MIT License (MIT)
-- Copyright (c) 2016 Manoel Campos da Silva Filho
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENSE_LuaXML
local treehdl = require("lib/handler")
local libxml = require("lib/xml")
-- Instantiate the object that parses the XML file as a Lua table.
local xmlhandler = treehdl.simpleTreeHandler()
-- Instantiate the object that parses the XML to a Lua table.
local ok = pcall(function()
libxml.xmlParser(xmlhandler):parse(xml_str)
end)
if not ok then return end
return xmlhandler.root
end
feed_history[feed_id] = item_history
end
function NewsDownloader:processFeed(feed_type, feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
local feed_title
local feed_item
local total_items
-- Setup the above vars based on feed type.
if feed_type == FEED_TYPE_RSS then
feed_title = util.htmlEntitiesToUtf8(feeds.rss.channel.title)
feed_item = feeds.rss.channel.item
total_items = (limit == 0)
and #feeds.rss.channel.item
or limit
else
feed_title = getFeedTitle(feeds.feed.title)
feed_item = feeds.feed.entry
total_items = (limit == 0)
and #feeds.feed.entry
or limit
end
-- Get the path to the output directory.
local feed_output_dir = ("%s%s/"):format(
self.download_dir,
util.getSafeFilename(util.htmlEntitiesToUtf8(feed_title)))
-- Create the output directory if it doesn't exist.
if not lfs.attributes(feed_output_dir, "mode") then
lfs.mkdir(feed_output_dir)
end
-- Download the feed
for index, feed in pairs(feed_item) do
-- If limit has been met, stop downloading feed.
if limit ~= 0 and index - 1 == limit then
break
end
-- Create a message to display during processing.
local article_message = T(
_("%1\n\nFetching article %2/%3:"),
message,
index,
total_items
)
-- Get the feed description.
local feed_description
if feed_type == FEED_TYPE_RSS then
feed_description = feed.description
else
feed_description = feed.summary
end
-- Download the article.
if download_full_article then
self:downloadFeed(
feed,
feed_output_dir,
include_images,
article_message,
enable_filter,
filter_element
)
else
self:createFromDescription(
feed,
feed_description,
feed_output_dir,
include_images,
article_message
)
end
end
end
-- Make each EPUB.
for epub_index, epub in pairs(epubs_to_make) do
local ok = feedSource:createEpub(
epub.title,
epub.chapters,
epub.abs_path,
function(progress_message)
UI:info(progress_message)
end,
function(error_message)
table.insert(
sync_errors,
error_message
)
end
)
if ok then
-- Save the hashes to the setting for this feed.
local hashes_to_save = feed_history[epub.id]
local history_for_feed = self.history:child(epub.id)
for index, hash in ipairs(hashes_to_save) do
if history_for_feed:hasNot(hash) then
history_for_feed:saveSetting(hash, true)
end
end
-- Add the epub title to the successfully created table.
table.insert(
epubs_successfully_created,
epub.title
)
else
table.insert(
sync_errors,
T(
_('Error building EPUB %1'),
epub.title
)
)
end
end
local function parseDate(dateTime)
-- Uses lua-feedparser https://github.com/slact/lua-feedparser
-- feedparser is available under the (new) BSD license.
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENCE_lua-feedparser
local date = dateparser.parse(dateTime)
return os.date("%y-%m-%d_%H-%M_", date)
end
logger.dbg(epubs_to_make)
-- This appears to be used by Atom feeds in processFeed.
local function getTitleWithDate(feed)
local title = util.getSafeFilename(getFeedTitle(feed.title))
if feed.updated then
title = parseDate(feed.updated) .. title
elseif feed.pubDate then
title = parseDate(feed.pubDate) .. title
elseif feed.published then
title = parseDate(feed.published) .. title
end
return title
end
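-- Sketch of the precedence above (hedged sample item): Atom entries
-- usually carry "updated" or "published", RSS items carry "pubDate",
-- and the first field found wins.
--[==[
local item = { title = "Hello World", pubDate = "Wed, 01 Feb 2023 13:37:00 GMT" }
print(getTitleWithDate(item)) --> e.g. "23-02-01_13-37_Hello World"
]==]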
function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message, enable_filter, filter_element)
local title_with_date = getTitleWithDate(feed)
local news_file_path = ("%s%s%s"):format(feed_output_dir,
title_with_date,
self.file_extension)
local file_mode = lfs.attributes(news_file_path, "mode")
if file_mode == "file" then
logger.dbg("NewsDownloader:", news_file_path, "already exists. Skipping")
else
logger.dbg("NewsDownloader: News file will be stored to :", news_file_path)
local article_message = T(_("%1\n%2"), message, title_with_date)
local link = getFeedLink(feed.link)
local html = DownloadBackend:loadPage(link)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element)
end
end
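-- The resulting path, with hedged example values:
--[==[
local feed_output_dir = "/mnt/onboard/news/Example Feed/"
local title_with_date = "23-02-01_13-37_Hello World"
local file_extension  = ".epub"
print(("%s%s%s"):format(feed_output_dir, title_with_date, file_extension))
--> /mnt/onboard/news/Example Feed/23-02-01_13-37_Hello World.epub
-- Because the filename embeds the publication date, an unchanged article
-- resolves to an existing file and the lfs.attributes check skips it.
]==]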
local message = (#epubs_successfully_created == 0) and
_("Sync complete. No new EPUBs created.") or
T(_("Sync complete. EPUBs created: %1"),
table.concat(epubs_successfully_created, ", "))
function NewsDownloader:createFromDescription(feed, content, feed_output_dir, include_images, message)
local title_with_date = getTitleWithDate(feed)
local news_file_path = ("%s%s%s"):format(feed_output_dir,
title_with_date,
self.file_extension)
local file_mode = lfs.attributes(news_file_path, "mode")
if file_mode == "file" then
logger.dbg("NewsDownloader:", news_file_path, "already exists. Skipping")
else
logger.dbg("NewsDownloader: News file will be stored to :", news_file_path)
local article_message = T(_("%1\n%2"), message, title_with_date)
local footer = _("This is just a description of the feed. To download the full article instead, go to the News Downloader settings and change 'download_full_article' to 'true'.")
local html = string.format([[<!DOCTYPE html>
<html>
<head><meta charset='UTF-8'><title>%s</title></head>
<body><header><h2>%s</h2></header><article>%s</article>
<br><footer><small>%s</small></footer>
</body>
</html>]], feed.title, feed.title, content, footer)
local link = getFeedLink(feed.link)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message)
end
end
function NewsDownloader:removeNewsButKeepFeedConfig()
@ -436,7 +591,7 @@ function NewsDownloader:removeNewsButKeepFeedConfig()
end
end
UIManager:show(InfoMessage:new{
text = _("All downloaded news feed items deleted. To download these again in the future, reset the feed history.")
text = _("All downloaded news feed items deleted.")
})
end
@ -457,10 +612,11 @@ function NewsDownloader:setCustomDownloadDirectory()
end
function NewsDownloader:viewFeedList()
local UI = require("ui/trapper")
UI:info(_("Loading news feed list…"))
-- Protected call: check that the feed config file loads as valid Lua.
local ok, feed_config = pcall(dofile, self.feed_config_path)
if not ok or not feed_config then
local UI = require("ui/trapper")
local change_feed_config = UI:confirm(
_("Could not open feed list. Feeds configuration file is invalid."),
_("Close"),
@ -471,6 +627,15 @@ function NewsDownloader:viewFeedList()
end
return
end
UI:clear()
-- See if the config file contains any feed items.
if #feed_config <= 0 then
logger.err("NewsDownloader: empty feed list.", self.feed_config_path)
-- TODO: rather than only logging, prompt the user to add a feed,
-- or open the list UI with an "add new feed" entry.
end
local view_content = FeedView:getList(
feed_config,
@ -482,25 +647,8 @@ function NewsDownloader:viewFeedList()
function(id, edit_key, value)
self:editFeedAttribute(id, edit_key, value)
end,
function(id)
self:deleteFeed(id)
end
)
-- Add a "Add new feed" button with callback
@ -556,15 +704,10 @@ end
function NewsDownloader:editFeedAttribute(id, key, value)
local kv = self.kv
-- Attribute values are either strings (including numbers) or booleans.
-- This block picks the dialog that matches the value type for the
-- given key.
if key == FeedView.URL
or key == FeedView.LIMIT
or key == FeedView.FILTER_ELEMENT then
@ -625,8 +768,6 @@ function NewsDownloader:editFeedAttribute(id, key, value)
text = _("Include images?")
elseif key == FeedView.ENABLE_FILTER then
text = _("Enable CSS filter?")
end
local multi_box
@ -670,7 +811,6 @@ function NewsDownloader:updateFeedConfig(id, key, value)
end
local ok, feed_config = pcall(dofile, self.feed_config_path)
if not ok or not feed_config then
UI:info(T(_("Invalid configuration file. Detailed error message:\n%1"), feed_config))
return
@ -679,6 +819,7 @@ function NewsDownloader:updateFeedConfig(id, key, value)
if #feed_config <= 0 then
logger.dbg("NewsDownloader: empty feed list.", self.feed_config_path)
end
-- If the id is larger than the number of feeds, this is a new
-- addition: insert a base entry for it.
if id > #feed_config then
@ -712,17 +853,65 @@ function NewsDownloader:updateFeedConfig(id, key, value)
)
end
elseif key == FeedView.LIMIT then
if feed.limit then
feed.limit = value
else
table.insert(
feed,
{
"limit",
value
}
)
end
elseif key == FeedView.DOWNLOAD_FULL_ARTICLE then
if feed.download_full_article ~= nil then
feed.download_full_article = value
else
table.insert(
feed,
{
"download_full_article",
value
}
)
end
elseif key == FeedView.INCLUDE_IMAGES then
if feed.include_images ~= nil then
feed.include_images = value
else
table.insert(
feed,
{
"include_images",
value
}
)
end
elseif key == FeedView.ENABLE_FILTER then
if feed.enable_filter ~= nil then
feed.enable_filter = value
else
table.insert(
feed,
{
"enable_filter",
value
}
)
end
elseif key == FeedView.FILTER_ELEMENT then
if feed.filter_element then
feed.filter_element = value
else
table.insert(
feed,
{
"filter_element",
value
}
)
end
end
end
-- Now we insert the updated (or newly created) feed into the
@ -741,31 +930,12 @@ function NewsDownloader:updateFeedConfig(id, key, value)
new_config[id],
function(cb_id, cb_edit_key, cb_value)
self:editFeedAttribute(cb_id, cb_edit_key, cb_value)
end
)
self:viewFeedItem(
feed_item_vc
)
end
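-- For orientation, the shape of a feed entry this function edits.
-- Hedged: a representative feed_config.lua, not the shipped default.
--[==[
return {
    {
        "https://example.com/feed.xml", -- [1]: the feed URL
        limit = 5,                      -- 0 means no limit
        download_full_article = true,   -- scrape the linked page, not the summary
        include_images = false,
        enable_filter = false,
        filter_element = "div#main",
    },
}
]==]
-- Keys already present are overwritten in place; absent ones are added by
-- the table.insert() fallback in each branch above.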
function NewsDownloader:deleteFeed(id)
@ -773,7 +943,6 @@ function NewsDownloader:deleteFeed(id)
logger.dbg("Newsdownloader: attempting to delete feed")
-- Check to see if we can get the config file.
local ok, feed_config = pcall(dofile, self.feed_config_path)
if not ok or not feed_config then
UI:info(T(_("Invalid configuration file. Detailed error message:\n%1"), feed_config))
return
@ -783,7 +952,6 @@ function NewsDownloader:deleteFeed(id)
-- and key (i.e. the key that triggered this function).
-- If we are at the right spot, we overwrite (or create) the value.
local new_config = {}
for idx, feed in ipairs(feed_config) do
-- Check to see if this is the correct feed to update.
if idx ~= id then
@ -795,7 +963,6 @@ function NewsDownloader:deleteFeed(id)
end
-- Save the config
local Trapper = require("ui/trapper")
Trapper:wrap(function()
logger.dbg("NewsDownloader: config to save", new_config)
self:saveConfig(new_config)
@ -804,14 +971,6 @@ function NewsDownloader:deleteFeed(id)
self:viewFeedList()
end
function NewsDownloader:resetFeedHistory(url)
logger.dbg("Newsdownloader: attempting to reset feed history")
self.history:saveSetting(url, {})
self.history:flush()
-- Refresh the view
self:viewFeedList()
end
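-- What a reset does to the history store. Hedged sketch: assumes
-- self.history persists one table of content hashes per feed URL.
--[==[
local history = { ["https://example.com/feed.xml"] = { a1b2 = true } }
-- saveSetting(url, {}) swaps the per-feed hash table for an empty one:
history["https://example.com/feed.xml"] = {}
-- After flush(), every item of that feed counts as unseen and will be
-- downloaded again on the next sync.
]==]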
function NewsDownloader:saveConfig(config)
local UI = require("ui/trapper")
UI:info(_("Saving news feed list…"))
@ -827,9 +986,6 @@ function NewsDownloader:saveConfig(config)
UI:reset()
end
-- Opens an input dialog so the user can edit the feed config by hand.
-- Called when parsing the config fails.
function NewsDownloader:changeFeedConfig()
local feed_config_file = io.open(self.feed_config_path, "rb")
local config = feed_config_file:read("*all")
@ -872,7 +1028,6 @@ function NewsDownloader:changeFeedConfig()
UIManager:show(config_editor)
config_editor:onShowKeyboard()
end
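-- A hedged sketch of the validation such an editor needs before saving.
-- Assumes a Lua 5.1/LuaJIT runtime, where loadstring() is available.
--[==[
local text = 'return { { "https://example.com/feed.xml", limit = 2 } }'
local chunk, parse_err = loadstring(text)
if chunk then
    local ok, config = pcall(chunk)
    print(ok, #config) --> true    1
else
    print("Invalid config: " .. parse_err)
end
]==]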
function NewsDownloader:openDownloadsFolder()
local FileManager = require("apps/filemanager/filemanager")
if self.ui.document then
@ -897,4 +1052,38 @@ function NewsDownloader:onCloseDocument()
end
end
--
-- KeyValuePage can't render nested tables directly.
-- This function flattens the source array into rows KeyValuePage
-- understands, preserving per-row callbacks and separator lines.
--
function NewsDownloader:flattenArray(base_array, source_array)
for key, value in pairs(source_array) do
if value[2] == nil then
-- If there is no second value, treat the entry as a separator line.
table.insert(
base_array,
"---"
)
else
if value["callback"] then
table.insert(
base_array,
{
value[1], value[2], callback = value["callback"]
}
)
else
table.insert(
base_array,
{
value[1], value[2]
}
)
end
end
end
return base_array
end
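-- flattenArray in action (hedged sample mirroring FeedView's output):
--[==[
local flat = NewsDownloader:flattenArray({}, {
    { "URL", "https://example.com/feed.xml", callback = function() end },
    { "Limit", 5 },
    { "---" }, -- no second value, so it becomes a separator line
})
-- `flat` now holds two key/value rows (the first keeping its callback)
-- plus the literal string "---", which KeyValuePage draws as a divider.
]==]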
return NewsDownloader
