-- Mirror of https://github.com/koreader/koreader (synced 2024-11-13 19:11:25 +00:00)
-- koreader/plugins/newsdownloader.koplugin/feed_source.lua
local BD = require("ui/bidi")
local DownloadBackend = require("epubdownloadbackend")
local NewsHelpers = require("http_utilities")
local dateparser = require("lib.dateparser")
local logger = require("logger")
local md5 = require("ffi/sha2").md5
local util = require("util")
local _ = require("gettext")
local N_ = _.ngettext
local FFIUtil = require("ffi/util")
local T = FFIUtil.template
-- FeedSource: fetches RSS/Atom feed documents and assembles their items
-- into EPUB files. All generated files share this extension.
local FeedSource = {
    file_extension = ".epub"
}
--- Constructor: returns `o` (or a fresh table) set up to delegate
-- attribute lookups to FeedSource (standard Lua prototype pattern).
function FeedSource:new(o)
    local instance = o or {}
    self.__index = self
    return setmetatable(instance, self)
end
--- Downloads and initializes every feed in `feed_list`.
-- @param feed_list array of feed configs; each config's [1] is the feed URL
-- @param progress_callback function(message) for UI progress updates
-- @param error_callback function(message) invoked once if any feed failed
-- @return array of { config = feed, document = initialized_document }
function FeedSource:getInitializedFeeds(feed_list, progress_callback, error_callback)
    local initialized_feeds = {}
    local unsupported_feeds_urls = {}
    for idx, feed in ipairs(feed_list) do
        local url = feed[1]
        -- Show a UI update. "%1 of %2" is position out of total:
        -- the old code passed the URL as %2.
        progress_callback(T(
            _("Setting up feed %1 of %2."),
            idx,
            #feed_list
        ))
        -- Initialize the feed: download the document and extract
        -- title/items. pcall because both steps raise on failure.
        local ok, response = pcall(function()
            return self:initializeDocument(
                self:fetchDocumentByUrl(url)
            )
        end)
        if ok and response then
            table.insert(initialized_feeds, {
                config = feed,
                document = response,
            })
        else
            -- On failure `response` holds the pcall error message.
            table.insert(unsupported_feeds_urls,
                BD.url(url) .. ": " .. tostring(response))
        end
    end
    if #unsupported_feeds_urls > 0 then
        -- Join every failure description. The previous accumulator was
        -- overwritten each pass, so only the last error was reported.
        local unsupported_urls = table.concat(unsupported_feeds_urls, "\n\n")
        error_callback(
            T(N_("Could not initialize a feed:\n\n%2\n\nPlease review your feed configuration.", "Could not initialize %1 feeds:\n\n%2\n\nPlease review your feed configurations.", #unsupported_feeds_urls),
                #unsupported_feeds_urls, unsupported_urls)
        )
    end
    return initialized_feeds
end
-- This function contacts the feed website and attempts to get
-- the RSS/Atom document with a list of the latest items.
-- This function contacts the feed website and attempts to get
-- the RSS/Atom document with a list of the latest items.
-- Raises a string error on download or deserialization failure.
function FeedSource:fetchDocumentByUrl(url)
    local document
    -- Get the XML document representing the feed.
    local ok, response = pcall(function()
        local success, content = NewsHelpers:getUrlContent(url)
        if success then
            return content
        else
            error("Failed to download content for url: " .. url, 0)
        end
    end)
    if ok then
        -- Deserialize the XML document into something Lua can use.
        document = NewsHelpers:deserializeXMLString(response)
    end
    -- Both steps must succeed. The old `ok or document` condition let a
    -- nil document through when deserialization failed, and its error
    -- branch could never report the deserialization case.
    if ok and document then
        return document
    elseif not ok then
        error("(Reason: Failed to download feed document)", 0)
    else
        error("(Reason: Error during feed document deserialization)", 0)
    end
end
-- Supply this method with the XML document returned by the feed,
-- and it will initialize the document by extracting the feed title,
-- feed items, and items count, attaching them as document.title,
-- document.items and document.total_items.
-- Raises if the document is not a recognizable RSS or Atom feed.
function FeedSource:initializeDocument(document)
    local feed_title
    local feed_items
    local total_items
    -- getFeedType raises for unknown formats; the callbacks capture the
    -- extracted values via upvalues.
    local ok = pcall(function()
        return self:getFeedType(
            document,
            function()
                -- RSS callback: title/items live under rss.channel.
                feed_title = util.htmlEntitiesToUtf8(document.rss.channel.title)
                feed_items = document.rss.channel.item
                total_items = #document.rss.channel.item
            end,
            function()
                -- Atom callback: title may be a string or an attribute
                -- table, so it goes through getFeedTitle.
                feed_title = FeedSource:getFeedTitle(document.feed.title)
                feed_items = document.feed.entry
                total_items = #document.feed.entry
            end
        )
    end)
    if ok then
        document.title = feed_title
        document.items = feed_items
        document.total_items = total_items
        return document
    else
        error(_("Could not initialize feed document"), 0)
    end
end
--- Downloads HTML content for up to `feed.config.limit` items of an
-- initialized feed (0 means no limit).
-- @param feed initialized feed entry ({ config = ..., document = ... })
-- @param progress_callback function(message) for UI progress updates
-- @param error_callback function(message) invoked per failed item
-- @return array of item content tables, or nil if nothing succeeded
function FeedSource:getItemsContent(feed, progress_callback, error_callback)
    -- A missing/non-numeric limit used to become nil, breaking both the
    -- total count and the break condition; treat it as "no limit".
    local limit = tonumber(feed.config.limit) or 0
    local total_items = (limit == 0) and
        feed.document.total_items or
        limit
    local initialized_feed_items = {}
    -- Items form an array; iterate with ipairs for deterministic order.
    for index, item in ipairs(feed.document.items) do
        -- If limit has been met, stop downloading feed items.
        if limit ~= 0 and index > limit then
            break
        end
        -- Display feedback to user.
        progress_callback(T(
            _("%3\n Downloading item %1 of %2"),
            index,
            total_items,
            feed.document.title
        ))
        -- Download the article's HTML (or build it from the summary).
        local ok, response = pcall(function()
            return self:initializeItemHtml(
                feed,
                self:getItemHtml(
                    item,
                    feed.config.download_full_article
                )
            )
        end)
        if ok then
            table.insert(initialized_feed_items, {
                html = response.html,
                images = response.images,
                item_slug = FeedSource:getItemTitleWithDate(item),
                item_title = item.title,
                -- NOTE(review): item.title may be a table for Atom titles
                -- with attributes — confirm md5 always receives a string.
                md5 = md5(item.title),
                feed_title = feed.document.title,
            })
        else
            error_callback(
                T(_("Could not get content for: %1"), feed.document.title)
            )
        end
    end
    if #initialized_feed_items > 0 then
        return initialized_feed_items
    else
        return nil
    end
end
--- Runs a downloaded item's HTML through the download backend,
-- extracting images and the (optionally filtered) HTML body.
-- @param feed initialized feed entry; feed.config[1] is the feed URL
-- @param html raw HTML string for one item
-- @return table with fields `html` (string) and `images` (array)
function FeedSource:initializeItemHtml(feed, html)
    local url = feed.config[1]
    -- local download_full_article = feed.config.download_full_article ~= false
    -- Images and filtering default to enabled unless explicitly false.
    local include_images = feed.config.include_images ~= false
    -- NOTE(review): when filter_element is unset this evaluates to the
    -- boolean `true` rather than a selector string — presumably the
    -- backend treats `true` as "use default filter"; confirm.
    local filter_element = feed.config.filter_element or
        feed.config.filter_element == nil
    local enable_filter = feed.config.enable_filter ~= false
    local item_images, item_html = DownloadBackend:getImagesAndHtml(
        html,
        url,
        include_images,
        enable_filter,
        filter_element
    )
    return {
        html = item_html,
        images = item_images
    }
end
--- Detects whether `document` is a usable RSS or Atom feed and invokes
-- the matching callback, returning its result. Raises if neither
-- format matches.
function FeedSource:getFeedType(document, rss_cb, atom_cb)
    -- Check to see if the feed uses RSS.
    local is_rss = document.rss and
        document.rss.channel and
        document.rss.channel.title and
        document.rss.channel.item and
        document.rss.channel.item[1] and
        document.rss.channel.item[1].title and
        document.rss.channel.item[1].link
    -- Check to see if the feed uses Atom. Guard `entry` itself before
    -- indexing it: an Atom feed without entries used to crash here.
    local is_atom = document.feed and
        document.feed.title and
        document.feed.entry and
        document.feed.entry[1] and
        document.feed.entry[1].title and
        document.feed.entry[1].link
    -- Dispatch to the matching callback.
    if is_atom then
        return atom_cb()
    elseif is_rss then
        return rss_cb()
    end
    -- Neither matched: report the more likely format. The old chain
    -- could never emit the Atom message because `not is_rss` was
    -- always true at this point.
    if document.feed then
        error(_("(Reason: Couldn't process Atom)"))
    else
        error(_("(Reason: Couldn't process RSS)"))
    end
end
--- Returns the HTML for one feed item: either the full article page
-- (downloaded from the item's link) or a minimal page built from the
-- item's description/summary.
function FeedSource:getItemHtml(item, download_full_article)
    if download_full_article then
        return NewsHelpers:loadPage(
            FeedSource:getFeedLink(item.link)
        )
    end
    -- Items without a description/summary previously made string.format
    -- raise on a nil argument; fall back to an empty body.
    local feed_description = item.description or item.summary or ""
    local footer = _("This is just a description of the feed. To download the full article instead, go to the News Downloader settings and change 'download_full_article' to 'true'.")
    return string.format([[<!DOCTYPE html>
<html>
<head><meta charset='UTF-8'><title>%s</title></head>
<body><header><h2>%s</h2></header><article>%s</article>
<br><footer><small>%s</small></footer>
</body>
</html>]], item.title, item.title, feed_description, footer)
end
-- @todo: move this elsewhere
--- Builds the absolute output path for a feed's EPUB:
-- <download_dir>/<safe sub_dir>/<safe epub_title><.epub>.
-- NOTE(review): `lfs` is not required in this file — presumably a
-- global provided by the KOReader environment; confirm.
function FeedSource:getEpubOutputDir(download_dir, sub_dir, epub_title)
    local feed_output_dir = ("%s%s/"):format(
        download_dir,
        util.getSafeFilename(util.htmlEntitiesToUtf8(sub_dir)))
    -- Create the output directory if it doesn't exist.
    -- (Creates only one level; download_dir must already exist.)
    if not lfs.attributes(feed_output_dir, "mode") then
        lfs.mkdir(feed_output_dir)
    end
    local file_name = FeedSource:getFeedTitle(epub_title)
    return ("%s%s%s"):format(
        feed_output_dir,
        file_name,
        self.file_extension
    )
end
--- Assembles chapters and images into an EPUB on disk.
-- Returns true on success (or when the file already exists), false if
-- the build failed. Raises if `chapters` is empty.
function FeedSource:createEpub(title, chapters, abs_output_path, progress_callback, error_callback)
    -- Skip work if the EPUB was already generated earlier.
    if lfs.attributes(abs_output_path, "mode") then
        logger.dbg("NewsDownloader: Skipping. EPUB file already exists", abs_output_path)
        return true
    end
    if #chapters == 0 then
        error(_("Error: chapters contains 0 items"), 0)
    end
    -- Flatten all per-chapter image lists into one manifest list.
    local images = {}
    for _, chapter in ipairs(chapters) do
        for _, image in ipairs(chapter.images) do
            table.insert(images, image)
        end
    end
    local epub = DownloadBackend:new{}
    progress_callback(T(_("Building EPUB %1"), title))
    epub:setTitle(title)
    epub:addToc(chapters)
    epub:addManifest(chapters, images)
    progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding contents")))
    epub:addContents(chapters)
    progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding images")))
    epub:addImages(images)
    progress_callback(T(_("Building EPUB %1: %2"), title, _("Writing EPUB to disk")))
    -- Log the build error instead of silently discarding it.
    local ok, err = pcall(function()
        return epub:build(abs_output_path)
    end)
    if not ok then
        logger.warn("NewsDownloader: EPUB build failed", err)
    end
    if ok and lfs.attributes(abs_output_path, "mode") then
        return true
    end
    return false
end
--- Parses an RFC-style date string into a sortable "yy-mm-dd_HH-MM_"
-- filename prefix.
local function parseDate(dateTime)
    -- Uses lua-feedparser https://github.com/slact/lua-feedparser
    -- feedparser is available under the (new) BSD license.
    -- see: koreader/plugins/newsdownloader.koplugin/lib/LICENCE_lua-feedparser
    local timestamp = dateparser.parse(dateTime)
    return os.date("%y-%m-%d_%H-%M_", timestamp)
end
--- Prefixes the feed's (filename-sanitized) title with the current
-- date and time.
function FeedSource:getFeedTitleWithDate(feed)
    local safe_title = util.getSafeFilename(
        FeedSource:getFeedTitle(feed.document.title)
    )
    return os.date("%y-%m-%d_%H-%M_") .. safe_title
end
-- Creates a title with date from a feed item.
-- The item's own timestamp is preferred; checked in priority order:
-- `updated`, then `pubDate`, then `published` (Atom vs RSS fields).
function FeedSource:getItemTitleWithDate(item)
    local safe_title = util.getSafeFilename(FeedSource:getFeedTitle(item.title))
    for _, field in ipairs({ "updated", "pubDate", "published" }) do
        local date_value = item[field]
        if date_value then
            return parseDate(date_value) .. safe_title
        end
    end
    -- No timestamp on the item: return the bare title.
    return safe_title
end
-- If a title looks like <title>blabla</title> it'll just be feed.title.
-- If a title looks like <title attr="alb">blabla</title> then we get a table
-- where [1] is the title string and the attributes are also available.
-- Returns nothing when no string title can be found.
function FeedSource:getFeedTitle(possible_title)
    if type(possible_title) == "string" then
        return util.htmlEntitiesToUtf8(possible_title)
    end
    -- Attribute-bearing titles arrive as a table; the text is at [1].
    local text = possible_title[1]
    if type(text) == "string" then
        return util.htmlEntitiesToUtf8(text)
    end
end
-- There can be multiple links.
-- For now we just assume the first link is probably the right one.
--- @todo Write unit tests.
-- Some feeds that can be used for unit test.
-- http://fransdejonge.com/feed/ for multiple links.
-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes.
-- Returns nothing when no usable link is found.
function FeedSource:getFeedLink(possible_link)
    -- Plain string link: use it directly.
    if type(possible_link) == "string" then
        return possible_link
    end
    -- Single link element with attributes: <link href="..."/>.
    local attr = possible_link._attr
    if attr and attr.href then
        return attr.href
    end
    -- List of link elements: take the first one's href.
    local first = possible_link[1]
    if first and first._attr and first._attr.href then
        return first._attr.href
    end
end
return FeedSource