mirror of
https://github.com/koreader/koreader
synced 2024-11-13 19:11:25 +00:00
399 lines
13 KiB
Lua
399 lines
13 KiB
Lua
|
local BD = require("ui/bidi")
|
||
|
local DownloadBackend = require("epubdownloadbackend")
|
||
|
local NewsHelpers = require("http_utilities")
|
||
|
local dateparser = require("lib.dateparser")
|
||
|
local logger = require("logger")
|
||
|
local md5 = require("ffi/sha2").md5
|
||
|
local util = require("util")
|
||
|
local _ = require("gettext")
|
||
|
local N_ = _.ngettext
|
||
|
local FFIUtil = require("ffi/util")
|
||
|
local T = FFIUtil.template
|
||
|
|
||
|
--- Prototype table for feed sources.
-- Instances created through FeedSource:new() fall back to the fields
-- defined here.
local FeedSource = {
    -- Extension appended to the file names built by getEpubOutputDir().
    file_extension = ".epub",
}
|
||
|
|
||
|
--- Creates a new object that uses the receiver as its prototype.
-- @param o optional table to turn into an instance; a fresh table is
--   used when it is omitted
-- @return the instance (the same table as `o` when one was supplied)
function FeedSource:new(o)
    local instance = o or {}
    -- Missing keys on the instance are looked up on the receiver.
    self.__index = self
    return setmetatable(instance, self)
end
|
||
|
|
||
|
--- Downloads and initializes every feed of a configuration list.
-- Each entry of `feed_list` is a feed configuration table whose first
-- array element is the feed URL. Feeds that cannot be fetched or
-- initialized are skipped and reported once, together, through
-- `error_callback`.
-- @param feed_list sequence of feed configuration tables
-- @param progress_callback function called with a status string before
--   each feed is processed
-- @param error_callback function called with one summary string when at
--   least one feed failed
-- @return sequence of `{ config = <feed config>, document = <document> }`
function FeedSource:getInitializedFeeds(feed_list, progress_callback, error_callback)
    local initialized_feeds = {}
    local failures = {}
    local feed_count = #feed_list

    for idx, feed in ipairs(feed_list) do
        local url = feed[1]
        -- Show a UI update ("feed <index> of <number of feeds>").
        progress_callback(T(
            _("Setting up feed %1 of %2."),
            idx,
            feed_count
        ))
        -- Initialize the feed. Both steps raise on failure, so they are
        -- run protected and the raised message is kept as the reason.
        local ok, response = pcall(function()
            return self:initializeDocument(
                self:fetchDocumentByUrl(url)
            )
        end)
        if ok and response then
            table.insert(initialized_feeds, {
                config = feed,
                document = response,
            })
        else
            -- tostring() keeps the report itself from failing when the
            -- URL is missing or the raised value is not a string.
            -- NOTE(review): BD.url is assumed to only add direction
            -- marks around the URL for display -- confirm.
            table.insert(
                failures,
                BD.url(tostring(url)) .. ": " .. tostring(response)
            )
        end
    end

    if #failures > 0 then
        -- List every failed feed, one per paragraph, so that the message
        -- names the source of each error.
        local unsupported_urls = table.concat(failures, "\n\n")
        error_callback(
            T(N_("Could not initialize a feed:\n\n%2\n\nPlease review your feed configuration.", "Could not initialize %1 feeds:\n\n%2\n\nPlease review your feed configurations.", #failures),
              #failures, unsupported_urls)
        )
    end

    return initialized_feeds
end
|
||
|
|
||
|
--- Contacts the feed website and attempts to get the RSS/Atom document
-- with a list of the latest items.
-- @param url address of the feed
-- @return the value produced by NewsHelpers:deserializeXMLString for the
--   downloaded content
-- Raises a string error (level 0) when the download fails or when
-- deserialization yields no document.
function FeedSource:fetchDocumentByUrl(url)
    -- Get the XML document representing the feed. The download runs
    -- protected so that errors raised by the HTTP helper are reported
    -- the same way as an unsuccessful request.
    local ok, content = pcall(function()
        local success, body = NewsHelpers:getUrlContent(url)
        if not success then
            error("Failed to download content for url: " .. url, 0)
        end
        return body
    end)
    if not ok then
        -- On failure `content` holds the raised message; keep it in the
        -- debug log since callers only see the generic reason below.
        logger.dbg("NewsDownloader: feed download failed", content)
        error("(Reason: Failed to download feed document)", 0)
    end

    -- Deserialize the XML document into something Lua can use.
    local document = NewsHelpers:deserializeXMLString(content)
    if not document then
        error("(Reason: Error during feed document deserialization)", 0)
    end
    return document
end
|
||
|
|
||
|
--- Supply this method with the XML document returned by the feed, and it
-- will initialize the document by extracting the feed title, feed items,
-- and items count.
-- @param document deserialized feed document (RSS or Atom)
-- @return the same `document` table with `title`, `items` and
--   `total_items` fields added
-- Raises a string error (level 0) when the feed type cannot be
-- determined or the fields cannot be extracted.
function FeedSource:initializeDocument(document)
    local feed_title
    local feed_items
    local total_items

    local ok, err = pcall(function()
        return self:getFeedType(
            document,
            function()
                -- RSS callback. The title goes through getFeedTitle, as
                -- in the Atom callback, so that a title element carrying
                -- attributes (a table) is handled as well.
                local channel = document.rss.channel
                feed_title = FeedSource:getFeedTitle(channel.title)
                feed_items = channel.item
                total_items = #channel.item
            end,
            function()
                -- Atom callback
                local atom_feed = document.feed
                feed_title = FeedSource:getFeedTitle(atom_feed.title)
                feed_items = atom_feed.entry
                total_items = #atom_feed.entry
            end
        )
    end)

    if not ok then
        -- The caller only receives the generic message; keep the actual
        -- reason in the debug log.
        logger.dbg("NewsDownloader: could not initialize feed document", err)
        error(_("Could not initialize feed document"), 0)
    end

    document.title = feed_title
    document.items = feed_items
    document.total_items = total_items
    return document
end
|
||
|
|
||
|
--- Downloads the content of the items of an initialized feed.
-- @param feed table with `config` (feed configuration) and `document`
--   (initialized feed document) fields
-- @param progress_callback function called with a status string before
--   each item is downloaded
-- @param error_callback function called with a message string for each
--   item whose content could not be obtained
-- @return sequence of item tables (html, images, item_slug, item_title,
--   md5, feed_title), or nil when no item could be downloaded
function FeedSource:getItemsContent(feed, progress_callback, error_callback)
    -- A missing or non-numeric limit means "no limit", like 0.
    local limit = tonumber(feed.config.limit) or 0
    local available = feed.document.total_items
    -- Number shown as the total in progress messages: the limit, unless
    -- the feed holds fewer items than that.
    local total_items = available
    if limit > 0 and (not available or limit < available) then
        total_items = limit
    end
    local initialized_feed_items = {}

    -- Download each item in the feed, in document order.
    for index, item in ipairs(feed.document.items) do
        -- If limit has been met, stop downloading feed.
        if limit > 0 and index > limit then
            break
        end
        -- Display feedback to user.
        progress_callback(T(
            _("%3\n Downloading item %1 of %2"),
            index,
            total_items,
            feed.document.title
        ))
        -- Download the article's HTML.
        local ok, response = pcall(function()
            return self:initializeItemHtml(
                feed,
                self:getItemHtml(
                    item,
                    feed.config.download_full_article
                )
            )
        end)

        -- Add the result to our table, or send a result to the error
        -- callback.
        if ok then
            -- NOTE(review): item.title is used as-is for item_title and
            -- md5; presumably it is a string here -- confirm for titles
            -- that carry attributes.
            table.insert(initialized_feed_items, {
                html = response.html,
                images = response.images,
                item_slug = FeedSource:getItemTitleWithDate(item),
                item_title = item.title,
                md5 = md5(item.title),
                feed_title = feed.document.title,
            })
        else
            logger.dbg("NewsDownloader: could not get item content", response)
            error_callback(
                T(_("Could not get content for: %1"), feed.document.title)
            )
        end
    end

    if #initialized_feed_items > 0 then
        return initialized_feed_items
    else
        return nil
    end
end
|
||
|
|
||
|
--- Passes an item's HTML to the download backend and returns the
-- resulting HTML together with its images.
-- @param feed table whose `config` field is the feed configuration
-- @param html HTML of the item
-- @return table with `html` and `images` fields
function FeedSource:initializeItemHtml(feed, html)
    local config = feed.config

    -- Both options count as enabled unless explicitly set to false.
    local include_images = config.include_images ~= false
    local enable_filter = config.enable_filter ~= false

    -- An unset filter_element becomes true; any other value is kept.
    local filter_element = config.filter_element
    if filter_element == nil then
        filter_element = true
    end

    local item_images, item_html = DownloadBackend:getImagesAndHtml(
        html, config[1], include_images, enable_filter, filter_element
    )

    return { html = item_html, images = item_images }
end
|
||
|
|
||
|
--- Detects whether a deserialized document is an RSS or an Atom feed and
-- invokes the matching callback.
-- A document counts as a feed only when it has a title and its first
-- item/entry has both a title and a link. Atom is checked first.
-- @param document deserialized feed document
-- @param rss_cb function called (without arguments) for an RSS feed
-- @param atom_cb function called (without arguments) for an Atom feed
-- @return whatever the invoked callback returns
-- Raises a string error (level 0) when the document is neither.
function FeedSource:getFeedType(document, rss_cb, atom_cb)
    local is_table = type(document) == "table"
    local rss = is_table and document.rss
    local atom = is_table and document.feed

    -- Check to see if the feed uses RSS.
    local channel = rss and rss.channel
    local is_rss = channel and
        channel.title and
        channel.item and
        channel.item[1] and
        channel.item[1].title and
        channel.item[1].link
    -- Check to see if the feed uses Atom. `entry` is checked before it
    -- is indexed so that a feed without entries is reported as
    -- unprocessable rather than failing with an indexing error.
    local is_atom = atom and
        atom.title and
        atom.entry and
        atom.entry[1] and
        atom.entry[1].title and
        atom.entry[1].link

    -- Hand over to the callback matching the feed type.
    if is_atom then
        return atom_cb()
    elseif is_rss then
        return rss_cb()
    end

    -- Neither check passed. Pick the message from the root element that
    -- is present: a document with a `feed` root was meant to be Atom,
    -- anything else is reported as RSS.
    if atom then
        error(_("(Reason: Couldn't process Atom)"), 0)
    end
    error(_("(Reason: Couldn't process RSS)"), 0)
end
|
||
|
|
||
|
--- Produces the HTML for a feed item.
-- @param item feed item; `link` is read for the full-article case,
--   `title` and `description`/`summary` otherwise
-- @param download_full_article when truthy, the page behind the item's
--   link is loaded; otherwise a small page is built from the item's
--   description
-- @return the result of NewsHelpers:loadPage, or the generated HTML string
function FeedSource:getItemHtml(item, download_full_article)
    if download_full_article then
        return NewsHelpers:loadPage(FeedSource:getFeedLink(item.link))
    end

    -- RSS items carry `description`; `summary` is the fallback.
    local summary = item.description or item.summary
    local footer = _("This is just a description of the feed. To download the full article instead, go to the News Downloader settings and change 'download_full_article' to 'true'.")
    local page_template = [[<!DOCTYPE html>
<html>
<head><meta charset='UTF-8'><title>%s</title></head>
<body><header><h2>%s</h2></header><article>%s</article>
<br><footer><small>%s</small></footer>
</body>
</html>]]
    return page_template:format(item.title, item.title, summary, footer)
end
|
||
|
|
||
|
--- Builds the output path of an EPUB and makes sure its directory exists.
-- @todo: move this elsewhere
-- @param download_dir base directory; used as a prefix as-is
-- @param sub_dir name of the sub-directory (HTML entities are decoded
--   and the name is made filename-safe)
-- @param epub_title title used as the file name (string or title table)
-- @return `<download_dir><sub_dir>/<title><file_extension>`
-- NOTE(review): `lfs` is not required in the visible part of this file;
-- presumably it is available as a global -- confirm.
function FeedSource:getEpubOutputDir(download_dir, sub_dir, epub_title)
    local safe_sub_dir = util.getSafeFilename(util.htmlEntitiesToUtf8(sub_dir))
    local feed_output_dir = string.format("%s%s/", download_dir, safe_sub_dir)

    -- Create the output directory if it doesn't exist.
    local dir_mode = lfs.attributes(feed_output_dir, "mode")
    if not dir_mode then
        lfs.mkdir(feed_output_dir)
    end

    local file_name = FeedSource:getFeedTitle(epub_title)

    return string.format("%s%s%s", feed_output_dir, file_name, self.file_extension)
end
|
||
|
|
||
|
--- Builds an EPUB from downloaded chapters and writes it to disk.
-- @param title title of the EPUB
-- @param chapters non-empty sequence of chapter tables; each chapter's
--   `images` sequence (if any) is added to the EPUB
-- @param abs_output_path path of the file to write
-- @param progress_callback function called with a status string at each
--   build step
-- @param error_callback accepted for interface compatibility; not used
--   by this function
-- @return true when the file already existed or exists after the build,
--   false otherwise
-- Raises a string error (level 0) when `chapters` is empty.
-- NOTE(review): `lfs` is not required in the visible part of this file;
-- presumably it is available as a global -- confirm.
function FeedSource:createEpub(title, chapters, abs_output_path, progress_callback, error_callback)
    if lfs.attributes(abs_output_path, "mode") then
        logger.dbg("NewsDownloader: Skipping. EPUB file already exists", abs_output_path)
        return true
    end

    if #chapters == 0 then
        error(_("Error: chapters contains 0 items"), 0)
    end

    -- Collect the images of all chapters into one flat list. A chapter
    -- without an `images` table simply contributes nothing.
    -- (`__` is used for unused loop variables because `_` is gettext.)
    local images = {}
    for __, chapter in ipairs(chapters) do
        for ___, image in ipairs(chapter.images or {}) do
            table.insert(images, image)
        end
    end

    local epub = DownloadBackend:new{}

    progress_callback(T(_("Building EPUB %1"), title))
    epub:setTitle(title)
    epub:addToc(chapters)
    epub:addManifest(chapters, images)

    progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding contents")))
    epub:addContents(chapters)

    progress_callback(T(_("Building EPUB %1: %2"), title, _("Adding images")))
    epub:addImages(images)

    progress_callback(T(_("Building EPUB %1: %2"), title, _("Writing EPUB to disk")))
    local ok, err = pcall(function()
        return epub:build(abs_output_path)
    end)

    if not ok then
        -- The failure is reported to the caller through the return
        -- value; keep the reason in the debug log.
        logger.dbg("NewsDownloader: building EPUB failed", abs_output_path, err)
        return false
    end

    -- Success is only reported when the file is actually on disk.
    if lfs.attributes(abs_output_path, "mode") then
        return true
    end
    return false
end
|
||
|
|
||
|
--- Formats a feed date string as a "yy-mm-dd_HH-MM_" prefix.
-- Uses lua-feedparser https://github.com/slact/lua-feedparser
-- feedparser is available under the (new) BSD license.
-- see: koreader/plugins/newsdownloader.koplugin/lib/LICENCE_lua-feedparser
-- @param dateTime date string taken from a feed item
-- @return formatted prefix string
local function parseDate(dateTime)
    -- The extra parentheses keep only the first value returned by the
    -- parser.
    return os.date("%y-%m-%d_%H-%M_", (dateparser.parse(dateTime)))
end
|
||
|
|
||
|
--- Creates a filename-safe feed title prefixed with the current date and
-- time ("yy-mm-dd_HH-MM_").
-- @param feed table whose `document.title` holds the feed title
-- @return the prefixed title string
function FeedSource:getFeedTitleWithDate(feed)
    local safe_title = util.getSafeFilename(FeedSource:getFeedTitle(feed.document.title))
    local timestamp = os.date("%y-%m-%d_%H-%M_")
    return timestamp .. safe_title
end
|
||
|
|
||
|
--- Creates a title with date from a feed item.
-- The date is taken from the first of `updated`, `pubDate` and
-- `published` that is set; without any of them the title is returned
-- without a date prefix.
-- @param item feed item
-- @return filename-safe title, prefixed with the item's date if known
function FeedSource:getItemTitleWithDate(item)
    local title = util.getSafeFilename(FeedSource:getFeedTitle(item.title))
    local date_string = item.updated or item.pubDate or item.published
    if date_string then
        return parseDate(date_string) .. title
    end
    return title
end
|
||
|
|
||
|
--- Extracts the title string from a deserialized title element.
-- If a title looks like <title>blabla</title> it'll just be feed.title.
-- If a title looks like <title attr="alb">blabla</title> then we get a
-- table where [1] is the title string and the attributes are also
-- available.
-- @param possible_title title string, or table holding it at index 1
-- @return the title with HTML entities converted to UTF-8; nothing when
--   no title string is found
function FeedSource:getFeedTitle(possible_title)
    if type(possible_title) == "string" then
        return util.htmlEntitiesToUtf8(possible_title)
    end

    local first = possible_title[1]
    if type(first) == "string" then
        return util.htmlEntitiesToUtf8(first)
    end
end
|
||
|
--- Extracts the URL from a deserialized link element.
-- There can be multiple links.
-- For now we just assume the first link is probably the right one.
--- @todo Write unit tests.
-- Some feeds that can be used for unit test.
-- http://fransdejonge.com/feed/ for multiple links.
-- https://github.com/koreader/koreader/commits/master.atom for single link with attributes.
-- @param possible_link link string, a table with an `_attr.href` field,
--   or a sequence whose first element has an `_attr.href` field
-- @return the URL; nothing when none of the forms matches
function FeedSource:getFeedLink(possible_link)
    if type(possible_link) == "string" then
        return possible_link
    end

    -- Single link element with attributes.
    local attr = possible_link._attr
    if attr and attr.href then
        return attr.href
    end

    -- Several link elements: use the first one.
    local first = possible_link[1]
    local first_attr = first and first._attr
    if first_attr and first_attr.href then
        return first_attr.href
    end
end
|
||
|
|
||
|
|
||
|
return FeedSource
|