-- koreader/plugins/newsdownloader.koplugin/feed_source.lua

local BD = require("ui/bidi")
local DownloadBackend = require("epubdownloadbackend")
local NewsHelpers = require("http_utilities")
local dateparser = require("lib.dateparser")
local logger = require("logger")
local md5 = require("ffi/sha2").md5
local util = require("util")
local _ = require("gettext")
local N_ = _.ngettext
local FFIUtil = require("ffi/util")
local T = FFIUtil.template
local FeedSource = {
file_extension = ".epub"
}
function FeedSource:new(o)
o = o or {}
self.__index = self
setmetatable(o, self)
return o
end
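-- Fetches and parses every feed in feed_list, reporting progress and errors
-- through the two callbacks, and returns a list of entries shaped like
-- { config = <feed config>, document = <parsed feed> }.
-- A minimal usage sketch; the URL and option values below are hypothetical,
-- only the field names are taken from this module:
--
--   local feeds = FeedSource:getInitializedFeeds(
--       {
--           {
--               "https://example.com/rss.xml",  -- feed[1] is the feed URL
--               limit = 5,
--               download_full_article = true,
--               include_images = true,
--               enable_filter = false,
--           },
--       },
--       function(msg) logger.dbg(msg) end,   -- progress_callback
--       function(err) logger.warn(err) end   -- error_callback
--   )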
function FeedSource:getInitializedFeeds(feed_list, progress_callback, error_callback)
local initialized_feeds = {}
local unsupported_feeds_urls = {}
for idx, feed in ipairs(feed_list) do
local url = feed[1]
-- Show a UI update
progress_callback(T(
_("Setting up feed %1 of %2."),
idx,
#feed_list
))
-- Initialize the feed
local ok, response = pcall(function()
return self:initializeDocument(
self:fetchDocumentByUrl(url)
)
end)
-- If the initialization worked, add the feed
-- to a list of initialized feeds
if ok and response then
table.insert(initialized_feeds, {
config = feed,
document = response,
})
else
table.insert(unsupported_feeds_urls, {
url .. ": " .. response
})
end
end
if #unsupported_feeds_urls > 0 then
-- When some feeds fail, build an error message that includes
-- information about the source of each failure.
local unsupported_urls = ""
for key, value in ipairs(unsupported_feeds_urls) do
-- Append this feed's error message.
unsupported_urls = unsupported_urls .. value[1] .. "\n\n"
-- Add a separator between entries, but not after the last one.
if key ~= #unsupported_feeds_urls then
unsupported_urls = BD.url(unsupported_urls) .. ", "
end
end
error_callback(
T(N_("Could not initialize a feed:\n\n%2\n\nPlease review your feed configuration.", "Could not initialize %1 feeds:\n\n%2\n\nPlease review your feed configurations.", #unsupported_feeds_urls),
#unsupported_feeds_urls, unsupported_urls)
)
end
return initialized_feeds
end
-- This function contacts the feed website and attempts to get
-- the RSS/Atom document with a list of the latest items.
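-- On success it returns the feed deserialized into a Lua table (e.g.
-- document.rss.channel for RSS); on failure it raises an error, so callers
-- are expected to wrap it in pcall as getInitializedFeeds does above.
-- Sketch with a hypothetical URL:
--
--   local ok, doc = pcall(function()
--       return FeedSource:fetchDocumentByUrl("https://example.com/atom.xml")
--   end)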
function FeedSource:fetchDocumentByUrl(url)
local document
-- Get the XML document representing the feed
local ok, response = pcall(function()
local success, content = NewsHelpers:getUrlContent(url)
if (success) then
return content
else
error("Failed to download content for url: " .. url, 0)
end
end)
-- Check to see if a response is available to deserialize.
if ok then
-- Deserialize the XML document into something Lua can use
document = NewsHelpers:deserializeXMLString(response)
end
-- Return the document, or raise an error describing what went wrong
if ok and document then
return document
else
if not ok then
error("(Reason: Failed to download feed document)", 0)
else
error("(Reason: Error during feed document deserialization)", 0)
end
end
end
-- Supply this method with the XML document returned by the feed,
-- and it will initialize the document by extracting the feed title,
-- feed items, and items count.
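-- After initialization the returned document carries three derived fields:
--   document.title        -- feed title, with HTML entities decoded
--   document.items        -- the list of item/entry tables
--   document.total_items  -- how many items were found
-- Sketch of the expected call chain (hypothetical URL):
--
--   local doc = FeedSource:initializeDocument(
--       FeedSource:fetchDocumentByUrl("https://example.com/rss.xml")
--   )
--   logger.dbg("NewsDownloader:", doc.total_items, "items in", doc.title)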
function FeedSource:initializeDocument(document)
local feed_title
local feed_items
local total_items
local ok = pcall(function()
return self:getFeedType(
document,
function()
-- RSS callback
feed_title = util.htmlEntitiesToUtf8(document.rss.channel.title)
feed_items = document.rss.channel.item
total_items = #document.rss.channel.item
end,
function()
-- Atom callback
feed_title = FeedSource:getFeedTitle(document.feed.title)
feed_items = document.feed.entry
total_items = #document.feed.entry
end,
function()
-- RDF callback
feed_title = util.htmlEntitiesToUtf8(document["rdf:RDF"].channel.title)
feed_items = document["rdf:RDF"].item
total_items = #document["rdf:RDF"].item
end
)
end)
if ok then
document.title = feed_title
document.items = feed_items
document.total_items = total_items
return document
else
error(_("Could not initialize feed document"), 0)
end
end
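-- Downloads the content of each item in an initialized feed (one of the
-- entries returned by getInitializedFeeds) and returns a list of tables:
--
--   {
--       html = <item HTML>,
--       images = <images referenced by the HTML>,
--       item_slug = <date-prefixed, filesystem-safe title>,
--       item_title = <original item title>,
--       md5 = <md5 hash of the title>,
--       feed_title = <title of the parent feed>,
--   }
--
-- or nil when no item could be downloaded.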
function FeedSource:getItemsContent(feed, progress_callback, error_callback)
local limit = tonumber(feed.config.limit)
local total_items = (limit == 0) and
feed.document.total_items or
limit
local initialized_feed_items = {}
-- Download each item in the feed
for index, item in pairs(feed.document.items) do
-- If limit has been met, stop downloading feed.
if limit ~= 0 and index - 1 == limit then
break
end
-- Display feedback to user.
progress_callback(T(
_("%3\n Downloading item %1 of %2"),
index,
total_items,
feed.document.title
))
-- Download the article's HTML.
local ok, response = pcall(function()
return self:initializeItemHtml(
feed,
self:getItemHtml(
item,
feed.config.download_full_article
)
)
end)
-- Add the result to our table, or send a
-- result to the error callback.
if ok then
table.insert(initialized_feed_items, {
html = response.html,
images = response.images,
item_slug = FeedSource:getItemTitleWithDate(item),
item_title = item.title,
md5 = md5(item.title),
feed_title = feed.document.title,
})
else
error_callback(
T(_("Could not get content for: %1"), feed.document.title)
)
end
end
if #initialized_feed_items > 0 then
return initialized_feed_items
else
return nil
end
end
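-- Passes an item's raw HTML through the download backend and returns
-- { html = ..., images = ... }. Configuration semantics, as read from the
-- checks below: include_images and enable_filter default to true unless
-- explicitly set to false, and filter_element evaluates to `true` (a
-- boolean, not a selector string) when the config leaves it unset.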
function FeedSource:initializeItemHtml(feed, html)
local url = feed.config[1]
-- local download_full_article = feed.config.download_full_article ~= false
local include_images = feed.config.include_images ~= false
local filter_element = feed.config.filter_element or
feed.config.filter_element == nil
local enable_filter = feed.config.enable_filter ~= false
local item_images, item_html = DownloadBackend:getImagesAndHtml(
html,
url,
include_images,
enable_filter,
filter_element
)
return {
html = item_html,
images = item_images
}
end
function FeedSource:getFeedType(document, rss_cb, atom_cb, rdf_cb)
-- Check to see if the feed uses RSS.
local is_rss = document.rss and
document.rss.channel and
document.rss.channel.title and
document.rss.channel.item and
document.rss.channel.item[1] and
document.rss.channel.item[1].title and
document.rss.channel.item[1].link
-- Check to see if the feed uses Atom.
local is_atom = document.feed and
document.feed.title and
document.feed.entry and
document.feed.entry[1] and
document.feed.entry[1].title and
document.feed.entry[1].link
-- Check to see if the feed uses RDF.
local is_rdf = document["rdf:RDF"] and
document["rdf:RDF"].channel and
document["rdf:RDF"].channel.title
if is_atom then
return atom_cb()
elseif is_rss then
return rss_cb()
elseif is_rdf then
return rdf_cb()
end
-- If we reach this point, none of the supported formats matched,
-- so raise an error naming the format that failed to parse.
local error_message
if document.feed then
error_message = _("(Reason: Couldn't process Atom)")
else
error_message = _("(Reason: Couldn't process RSS)")
end
error(error_message)
end
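-- Returns the HTML for a single feed item. With download_full_article the
-- linked page is fetched; otherwise a small HTML page is built around the
-- item's description/summary. Sketch with a hypothetical item:
--
--   local html = FeedSource:getItemHtml({
--       title = "Example post",
--       link = "https://example.com/post",
--       description = "<p>A short summary.</p>",
--   }, false)  -- false: use the description instead of fetching the page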
function FeedSource:getItemHtml(item, download_full_article)
if download_full_article then
return NewsHelpers:loadPage(
FeedSource:getFeedLink(item.link)
)
else
local feed_description = item.description or item.summary
local footer = _("This is just a description of the feed. To download the full article instead, go to the News Downloader settings and change 'download_full_article' to 'true'.")
return string.format([[<!DOCTYPE html>
<html>
<head><meta charset='UTF-8'><title>%s</title></head>
<body><header><h2>%s</h2></header><article>%s</article>
<br><footer><small>%s</small></footer>
</body>
</html>]], item.title, item.title, feed_description, footer)
end
end
-- @todo: move this elsewhere
function FeedSource:getEpubOutputDir(download_dir, sub_dir, epub_title)
local feed_output_dir = ("%s%s/"):format(
download_dir,
util.getSafeFilename(util.htmlEntitiesToUtf8(sub_dir)))
-- Create the output directory if it doesn't exist.
if not lfs.attributes(feed_output_dir, "mode") then
lfs.mkdir(feed_output_dir)
end
local file_name = FeedSource:getFeedTitle(epub_title)
return ("%s%s%s"):format(
feed_output_dir,
file_name,
self.file_extension
)
end
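-- Builds an EPUB at abs_output_path from a list of chapters and returns
-- true on success (or when the file already exists), false otherwise. The
-- chapter tables are assumed to be the item tables produced by
-- getItemsContent, since the code below relies on chapter.images. Sketch
-- with hypothetical paths and titles:
--
--   local path = FeedSource:getEpubOutputDir(download_dir, "My Feed", epub_title)
--   local ok = FeedSource:createEpub(epub_title, chapters, path,
--       function(msg) logger.dbg(msg) end,
--       function(err) logger.warn(err) end)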
function FeedSource:createEpub(title, chapters, abs_output_path, progress_callback, error_callback)
local file_exists = lfs.attributes(abs_output_path, "mode")
if file_exists then
logger.dbg("NewsDownloader: Skipping. EPUB file already exists", abs_output_path)
return true
end
if #chapters == 0 then
error(_("Error: chapters contains 0 items"), 0)
end
local images = {}
-- Collect every chapter's images into a single list for the manifest.
for _, chapter in ipairs(chapters) do
for _, image in ipairs(chapter.images) do
table.insert(images, image)
end
end
local epub = DownloadBackend:new{}
progress_callback(T(_("Building EPUB %1"), title))
epub:setTitle(title)
epub:addToc(chapters)
epub:addManifest(chapters, images)
progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding contents")))
epub:addContents(chapters)
progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding images")))
epub:addImages(images)
progress_callback(T(_("Building EPUB %1: %2"), title, _("Writing EPUB to disk")))
local ok = pcall(function()
return epub:build(abs_output_path)
end)
if ok then
if lfs.attributes(abs_output_path, "mode") then
return true
end
end
return false
end
local function parseDate(dateTime)
-- Uses lua-feedparser https://github.com/slact/lua-feedparser
-- feedparser is available under the (new) BSD license.
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENCE_lua-feedparser
local date = dateparser.parse(dateTime)
return os.date("%y-%m-%d_%H-%M_", date)
end
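-- Example for parseDate above (hypothetical input; the hour/minute part
-- depends on the local timezone, since os.date formats in local time):
--   parseDate("Mon, 06 Sep 2021 08:30:00 GMT") --> e.g. "21-09-06_08-30_"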
function FeedSource:getFeedTitleWithDate(feed)
local title = util.getSafeFilename(FeedSource:getFeedTitle(feed.document.title))
return os.date("%y-%m-%d_%H-%M_") .. title
end
-- Creates a title with date from a feed item.
function FeedSource:getItemTitleWithDate(item)
local title = util.getSafeFilename(FeedSource:getFeedTitle(item.title))
if item.updated then
title = parseDate(item.updated) .. title
elseif item.pubDate then
title = parseDate(item.pubDate) .. title
elseif item.published then
title = parseDate(item.published) .. title
end
return title
end
-- If a title looks like <title>blabla</title> it'll just be feed.title.
-- If a title looks like <title attr="alb">blabla</title> then we get a table
-- where [1] is the title string and the attributes are also available.
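-- Examples (hypothetical titles; the table shape mirrors the description above):
--   FeedSource:getFeedTitle("Tom &amp; Jerry")                            --> "Tom & Jerry"
--   FeedSource:getFeedTitle({ "Tom & Jerry", _attr = { type = "html" } }) --> "Tom & Jerry"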
function FeedSource:getFeedTitle(possible_title)
if type(possible_title) == "string" then
return util.htmlEntitiesToUtf8(possible_title)
elseif possible_title[1] and type(possible_title[1]) == "string" then
return util.htmlEntitiesToUtf8(possible_title[1])
end
end
-- There can be multiple links.
-- For now we just assume the first link is probably the right one.
--- @todo Write unit tests.
-- Some feeds that can be used for unit test.
-- http://fransdejonge.com/feed/ for multiple links.
-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes.
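-- Examples of the shapes this handles (hypothetical URLs):
--   getFeedLink("https://example.com/post")                          --> the string itself
--   getFeedLink({ _attr = { href = "https://example.com/post" } })   --> "https://example.com/post"
--   getFeedLink({ { _attr = { href = "https://example.com/a" } },
--                 { _attr = { href = "https://example.com/b" } } })  --> "https://example.com/a"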
function FeedSource:getFeedLink(possible_link)
local E = {}
if type(possible_link) == "string" then
return possible_link
elseif (possible_link._attr or E).href then
return possible_link._attr.href
elseif ((possible_link[1] or E)._attr or E).href then
return possible_link[1]._attr.href
end
end
return FeedSource