2
0
mirror of https://github.com/koreader/koreader synced 2024-11-18 03:25:46 +00:00
koreader/plugins/opds.koplugin/opdsparser.lua

94 lines
3.2 KiB
Lua
Raw Normal View History

--[[
This code is derived from the LAPHLibs which can be found here:
https://github.com/Wiladams/LAPHLibs
--]]
local util = require("util")
local luxl = require("luxl")
local ffi = require("ffi")
local OPDSParser = {}
-- The five entities predefined by XML, keyed by entity name (the part
-- between "&" and ";") and mapped to the literal character they denote.
local unescape_map = {
    lt = "<",
    gt = ">",
    amp = "&",
    quot = '"',
    apos = "'",
}
2016-02-16 02:08:04 +00:00
local gsub = string.gsub

--- Replace XML entity and character references in `str` with the characters
-- they stand for.
--
-- Recognised forms:
--   * the named entities listed in `unescape_map` (e.g. `&amp;`),
--   * decimal character references (e.g. `&#38;`),
--   * hexadecimal character references (e.g. `&#x26;`).
-- Anything else that merely looks like a reference (unknown entity name,
-- malformed number) is left exactly as it appeared in the input.
--
-- @tparam string str text as it appears in the XML source
-- @treturn string the text with recognised references replaced
-- @treturn number second result of string.gsub (number of candidates matched)
local function unescape(str)
    return gsub(str, '(&(#?)([%d%a]+);)', function(orig, n, s)
        if unescape_map[s] then
            return unescape_map[s]
        elseif n == "#" then
            -- Numeric character reference. Parse it explicitly: a plain
            -- tonumber(s) yields nil for the hex form ("x26"), and that nil
            -- must never be handed to the UTF-8 encoder.
            local codepoint
            if s:find("^%d+$") then
                codepoint = tonumber(s, 10)
            else
                local hex = s:match("^[xX](%x+)$")
                codepoint = hex and tonumber(hex, 16)
            end
            if codepoint then
                return util.unicodeCodepointToUtf8(codepoint)
            end
        end
        -- Unknown entity or malformed numeric reference: keep it verbatim.
        return orig
    end)
end
2016-12-27 10:00:13 +00:00
--- Recursively fold the lexemes produced by a luxl lexer into a Lua value.
--
-- Each element becomes a table holding its attributes (name -> value) and
-- its child elements (tag name -> child value). Children named "entry" or
-- "link" are collected into an array under that key; any other child name
-- keeps only the last child seen. When an element contains text, the text
-- (unescaped) replaces whatever had been collected for that element so far.
--
-- @param xlex lexer object; must provide `Lexemes()` (iterator yielding
--   event, offset, size) and `buf` (buffer the offsets index into)
-- @param[opt] curr_element table to fill in; a new table is used when omitted
-- @return the table built for the current element, or a string when the
--   element contained text
function OPDSParser:createFlatXTable(xlex, curr_element)
    curr_element = curr_element or {}

    local curr_attr_name

    -- start reading the thing
    for event, offset, size in xlex:Lexemes() do
        local txt = ffi.string(xlex.buf + offset, size)
        if event == luxl.EVENT_START then
            if txt ~= "xml" then
                -- Always recurse, even when the result cannot be stored, so
                -- that the child's lexemes are consumed from the stream.
                local tab = self:createFlatXTable(xlex)
                -- curr_element may already have been replaced by a string
                -- (see EVENT_TEXT below); a string cannot take children, so
                -- the child is dropped instead of raising an index error.
                if type(curr_element) == "table" then
                    if txt == "entry" or txt == "link" then
                        -- repeated elements: accumulate them in a list
                        if curr_element[txt] == nil then
                            curr_element[txt] = {}
                        end
                        table.insert(curr_element[txt], tab)
                    else
                        curr_element[txt] = tab
                    end
                end
            end
        elseif event == luxl.EVENT_ATTR_NAME then
            curr_attr_name = unescape(txt)
        elseif event == luxl.EVENT_ATTR_VAL then
            -- Only store when there is a pending name and somewhere to put it.
            if curr_attr_name ~= nil and type(curr_element) == "table" then
                curr_element[curr_attr_name] = unescape(txt)
            end
            curr_attr_name = nil
        elseif event == luxl.EVENT_TEXT then
            -- Text content replaces the element's table altogether.
            curr_element = unescape(txt)
        elseif event == luxl.EVENT_END then
            return curr_element
        end
    end
    -- Lexer ran out of input before a matching end event.
    return curr_element
end
--- Parse an OPDS feed into a nested Lua table.
--
-- The raw XML is first massaged with a fixed sequence of textual rewrites to
-- work around input that luxl does not cope with, then lexed with luxl and
-- folded into a table by createFlatXTable.
--
-- @tparam string text the XML document
-- @return the value built by createFlatXTable (asserted to be truthy)
function OPDSParser:parse(text)
    local cdata_escapes = { ["&"] = "&amp;", ["<"] = "&lt;", [">"] = "&gt;" }

    -- Re-escape the payload of a CDATA section so it can stand as plain text.
    local function escape_cdata(payload)
        return payload:gsub("%p", cdata_escapes)
    end

    -- Ordered list of { pattern, replacement } rewrites; order matters.
    local rewrites = {
        -- Drop Calibre's whole xhtml "content" block: luxl copes badly with
        -- XHTML quirks, and orphaned or badly nested tags are likely there.
        { '<content type="xhtml">.-</content>', '' },
        -- luxl doesn't handle XML comments, so strip them.
        { "<!%-%-.-%-%->", "" },
        -- luxl is particular about the syntax of self-closing tags.
        { "<(%l+)/>", "<%1 />" },
        -- Same for the slash-less variants of br & hr.
        { "<([bh]r)>", "<%1 />" },
        -- Some OPDS catalogs wrap text in a CDATA section; unwrap it, as it
        -- causes parsing problems.
        { "<!%[CDATA%[(.-)%]%]>", escape_cdata },
    }

    for _, rule in ipairs(rewrites) do
        text = text:gsub(rule[1], rule[2])
    end

    local lexer = luxl.new(text, #text)
    return assert(self:createFlatXTable(lexer))
end
return OPDSParser