2
0
mirror of https://github.com/koreader/koreader synced 2024-11-16 06:12:56 +00:00
koreader/plugins/newsdownloader.koplugin/lib/xml.lua
mwoz123 d6c81f5097 Simple News (RSS/Atom) downloader plugin (#2592)
* Simple News (RSS/Atom) downloader plugin
* Limit news from sources
* new News Downloader menu entry - goto news dir
* Initial configuration logic
* Download each feed source to separate folder
* 2 feed sources in example config
2017-04-19 17:59:45 +02:00

484 lines
17 KiB
Lua

---
-- Overview:
-- =========
--
-- This module provides a non-validating XML stream parser in Lua.
--
-- Features:
-- =========
--
-- * Tokenises well-formed XML (relatively robustly)
-- * Flexible handler based event API (see below)
-- * Parses all XML Infoset elements - ie.
-- - Tags
-- - Text
-- - Comments
-- - CDATA
-- - XML Decl
-- - Processing Instructions
-- - DOCTYPE declarations
-- * Provides limited well-formedness checking
-- (checks for basic syntax & balanced tags only)
-- * Flexible whitespace handling (selectable)
-- * Entity Handling (selectable)
--
-- Limitations:
-- ============
--
-- * Non-validating
-- * No charset handling
-- * No namespace support
-- * Shallow well-formedness checking only (fails
-- to detect most semantic errors)
--
-- API:
-- ====
--
-- The parser provides a partially object-oriented API with
-- functionality split into tokeniser and handler components.
--
-- The handler instance is passed to the tokeniser and receives
-- callbacks for each XML element processed (if a suitable handler
-- function is defined). The API is conceptually similar to the
-- SAX API but implemented differently.
--
-- The following events are generated by the tokeniser
--
-- handler:starttag - Start Tag
-- handler:endtag - End Tag
-- handler:text - Text
-- handler:decl - XML Declaration
-- handler:pi - Processing Instruction
-- handler:comment - Comment
-- handler:dtd - DOCTYPE definition
-- handler:cdata - CDATA
--
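-- The function prototype for all the callback functions is
-- (callbacks are invoked as methods, so the handler itself is
-- received as the first argument)
--
-- callback(self,val,attrs,start,end)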
--
-- where attrs is a table and val/attrs are overloaded for
-- specific callbacks - ie.
--
-- Callback val attrs (table)
-- -------- --- -------------
-- starttag name { attributes (name=val).. }
-- endtag name nil
-- text <text> nil
-- cdata <text> nil
-- decl "xml" { attributes (name=val).. }
-- pi pi name { attributes (if present)..
-- _text = <PI Text>
-- }
-- comment <text> nil
-- dtd root element { _root = <Root Element>,
-- _type = SYSTEM|PUBLIC,
-- _name = <name>,
-- _uri = <uri>,
-- _internal = <internal dtd>
-- }
--
-- (start & end provide the character positions of the start/end
-- of the element)
--
-- XML data is passed to the parser instance through the 'parse'
-- method (Note: must be passed a single string currently)
--
-- Options
-- =======
--
-- Parser options are controlled through the 'self.options' table.
-- Available options are -
--
-- * stripWS
--
-- Strip non-significant whitespace (leading/trailing)
-- and do not generate events for empty text elements
--
-- * expandEntities
--
-- Expand entities (standard entities + single char
-- numeric entities only currently - could be extended
-- at runtime if suitable DTD parser added elements
-- to table (see obj._ENTITIES). May also be possible
-- to expand multibyte entities for UTF-8 only)
--
-- * errorHandler
--
-- Custom error handler function
--
-- NOTE: Boolean options must be set to 'nil' not '0'
--
-- Usage
-- =====
--
-- Create a handler instance -
--
-- h = { starttag = function(self,t,a,s,e) .... end,
-- endtag = function(self,t,a,s,e) .... end,
-- text = function(self,t,a,s,e) .... end,
-- cdata = text }
--
-- (or use predefined handler - see handler.lua)
--
-- Create parser instance -
--
-- p = xmlParser(h)
--
-- Set options -
--
-- p.options.xxxx = nil
--
-- Parse XML data -
--
-- p:parse("<?xml... ")
-- License:
-- ========
--
-- This code is freely distributable under the terms of the Lua license
-- (http://www.lua.org/copyright.html)
--
-- History
-- =======
-- Added parameter parseAttributes (boolean) in xmlParser.parse method
-- If true (default value), tag attributes are parsed.
-- by Manoel Campos da Silva Filho
-- http://manoelcampos.com
-- http://about.me/manoelcampos
--
-- $Id: xml.lua,v 1.1.1.1 2001/11/28 06:11:33 paulc Exp $
--
-- $Log: xml.lua,v $
-- Revision 1.1.1.1 2001/11/28 06:11:33 paulc
-- Initial Import
--
--@author Paul Chakravarti (paulc@passtheaardvark.com)<p/>
---Creates a non-validating XML stream parser bound to a handler.
-- The returned object exposes the public `options` table and the `parse`
-- method; every member whose name starts with `_` is internal state,
-- a Lua pattern, or a helper.
--@param handler Handler object to be used to convert the XML string
--to another format. Its optional callbacks (starttag, endtag, text, decl,
--pi, comment, dtd, cdata) are invoked as methods. @see handler.lua
--@return The parser object
local xmlParser = function(handler)
    local obj = {}

    -- Public attributes
    obj.options = {
        -- Any value other than nil/false enables the option (0 is truthy in Lua)
        stripWS = 1,
        expandEntities = 1,
        -- Default error handler: raises a Lua error carrying the position
        errorHandler = function(err,pos)
            error(string.format("%s [char=%d]\n",
                                err or "Parse Error",pos))
        end,
    }

    -- Public methods

    ---Tokenises an XML string and fires the handler callbacks.
    -- Problems are reported through `self:_err`. When the configured error
    -- handler returns instead of raising, parsing stops at the first
    -- construct that could not be matched.
    --@param str The whole XML document as a single string
    --@param parseAttributes Flag stored as `handler.parseAttributes`
    --(defaults to true when nil)
    obj.parse = function(self, str, parseAttributes)
        if parseAttributes == nil then
            parseAttributes = true
        end
        self._handler.parseAttributes = parseAttributes

        local pos = 1
        local match,endmatch
        local text,endt1,endt2,tagstr,tagname,attrs,starttext,endtext
        local extstart,extend

        while true do
            -- Get next tag (first pass - fix exceptions below)
            match,endmatch,text,endt1,tagstr,endt2 = string.find(str,self._XML,pos)
            if not match then
                if not string.find(str,self._WS,pos) then
                    -- Unparsable text
                    self:_err(self._errstr.xmlErr,pos)
                elseif #self._stack ~= 0 then
                    -- No more text, but some tags were never closed
                    self:_err(self._errstr.incompleteXmlErr,pos)
                end
                -- Nothing left to tokenise (also reached when the error
                -- handler did not raise)
                break
            end

            -- Handle leading text
            starttext = match
            endtext = match + string.len(text) - 1
            match = match + string.len(text)   -- now the position of '<'
            text = self:_parseEntities(self:_stripWS(text))
            if text ~= "" and self._handler.text then
                -- Report the real start of the text, not the position after it
                self._handler:text(text,nil,starttext,endtext)
            end

            -- Test for tag type
            if string.find(tagstr,"^%?xml%s") then
                -- XML Declaration
                match,endmatch,text = string.find(str,self._PI,pos)
                if not match then
                    self:_err(self._errstr.declErr,pos)
                    return
                end
                if match ~= 1 then
                    -- Must be at start of doc if present
                    self:_err(self._errstr.declStartErr,pos)
                end
                tagname,attrs = self:_parseTag(text)
                -- TODO: Check attributes are valid
                -- Check for version (mandatory); attrs is nil when the
                -- declaration carries no attributes at all
                if attrs == nil or attrs.version == nil then
                    self:_err(self._errstr.declAttrErr,pos)
                end
                if self._handler.decl then
                    self._handler:decl(tagname,attrs,match,endmatch)
                end
            elseif string.sub(tagstr,1,1) == "?" then
                -- Processing Instruction
                match,endmatch,text = string.find(str,self._PI,pos)
                if not match then
                    self:_err(self._errstr.piErr,pos)
                    return
                end
                if self._handler.pi then
                    -- Parse PI attributes & text
                    tagname,attrs = self:_parseTag(text)
                    local pi = string.sub(text,string.len(tagname)+1)
                    if pi ~= "" then
                        if attrs then
                            attrs._text = pi
                        else
                            attrs = { _text = pi }
                        end
                    end
                    self._handler:pi(tagname,attrs,match,endmatch)
                end
            elseif string.sub(tagstr,1,3) == "!--" then
                -- Comment
                match,endmatch,text = string.find(str,self._COMMENT,pos)
                if not match then
                    self:_err(self._errstr.commentErr,pos)
                    return
                end
                if self._handler.comment then
                    text = self:_parseEntities(self:_stripWS(text))
                    -- Comments carry no attribute table
                    self._handler:comment(text,nil,match,endmatch)
                end
            elseif string.sub(tagstr,1,8) == "!DOCTYPE" then
                -- DTD
                match,endmatch,attrs = self:_parseDTD(str,pos)
                if not match then
                    self:_err(self._errstr.dtdErr,pos)
                    return
                end
                if self._handler.dtd then
                    self._handler:dtd(attrs._root,attrs,match,endmatch)
                end
            elseif string.sub(tagstr,1,8) == "![CDATA[" then
                -- CDATA
                match,endmatch,text = string.find(str,self._CDATA,pos)
                if not match then
                    self:_err(self._errstr.cdataErr,pos)
                    return
                end
                if self._handler.cdata then
                    self._handler:cdata(text,nil,match,endmatch)
                end
            else
                -- Normal tag
                -- Need check for embedded '>' in attribute value and extend
                -- the match if necessary eg. <tag attr="123>456">
                -- First character of the tag body, after '<' and optional '/'
                local tagstart = match + 1 + string.len(endt1)
                while string.find(tagstr,self._ATTRERR1)
                      or string.find(tagstr,self._ATTRERR2) do
                    extstart,extend,endt2 = string.find(str,self._TAGEXT,endmatch+1)
                    if not extstart then
                        -- Quote never closed before the end of the document
                        self:_err(self._errstr.xmlErr,pos)
                        return
                    end
                    -- Re-slice from the source so nothing between the old
                    -- and the new end of the tag is lost
                    tagstr = string.sub(str,tagstart,extstart-1)
                    endmatch = extend
                end

                -- Extract tagname/attrs
                tagname,attrs = self:_parseTag(tagstr)

                if endt1 == "/" then
                    -- End tag
                    if attrs then
                        -- Shouldnt have any attributes in endtag
                        self:_err(string.format("%s (/%s)",
                                                self._errstr.endTagErr,
                                                tagname)
                                  ,pos)
                    end
                    -- Balance the stack whether or not the handler cares
                    -- about end tags (start tags are always pushed)
                    if table.remove(self._stack) ~= tagname then
                        self:_err(string.format("%s (/%s)",
                                                self._errstr.unmatchedTagErr,
                                                tagname)
                                  ,pos)
                    end
                    if self._handler.endtag then
                        self._handler:endtag(tagname,nil,match,endmatch)
                    end
                else
                    -- Start Tag
                    table.insert(self._stack,tagname)
                    if self._handler.starttag then
                        self._handler:starttag(tagname,attrs,match,endmatch)
                    end
                    -- TODO: Self-closing tags are currently returned as a
                    -- table, which complicates handling in the NCLua app.
                    -- They should be returned as an empty string field.
                    -- Self-Closing Tag
                    if endt2 == "/" then
                        table.remove(self._stack)
                        if self._handler.endtag then
                            self._handler:endtag(tagname,nil,match,endmatch)
                        end
                    end
                end
            end
            pos = endmatch + 1
        end
    end

    -- Private attributes/functions
    obj._handler = handler
    obj._stack = {}
    obj._XML = '^([^<]*)<(%/?)([^>]-)(%/?)>'
    obj._ATTR1 = '([%w-:_]+)%s*=%s*"(.-)"'
    obj._ATTR2 = '([%w-:_]+)%s*=%s*\'(.-)\''
    obj._CDATA = '<%!%[CDATA%[(.-)%]%]>'
    obj._PI = '<%?(.-)%?>'
    obj._COMMENT = '<!%-%-(.-)%-%->'
    obj._TAG = '^(.-)%s.*'
    obj._LEADINGWS = '^%s+'
    obj._TRAILINGWS = '%s+$'
    obj._WS = '^%s*$'
    obj._DTD1 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*(%b[])%s*>'
    obj._DTD2 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*(%b[])%s*>'
    obj._DTD3 = '<!DOCTYPE%s+(.-)%s*(%b[])%s*>'
    obj._DTD4 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*>'
    obj._DTD5 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*>'
    obj._ATTRERR1 = '=%s*"[^"]*$'
    obj._ATTRERR2 = '=%s*\'[^\']*$'
    obj._TAGEXT = '(%/?)>'

    -- Keys are Lua patterns describing one whole reference; values are the
    -- replacement string or a function receiving the pattern captures.
    obj._ENTITIES = { ["&lt;"] = "<",
                      ["&gt;"] = ">",
                      ["&amp;"] = "&",
                      ["&quot;"] = '"',
                      ["&apos;"] = "'",
                      ["&#(%d+);"] = function (x)
                          local d = tonumber(x)
                          if d >= 0 and d < 256 then
                              return string.char(d)
                          else
                              -- Out of single-byte range: keep the
                              -- reference exactly as written
                              return "&#"..x..";"
                          end
                      end,
                      ["&#x(%x+);"] = function (x)
                          local d = tonumber(x,16)
                          if d >= 0 and d < 256 then
                              return string.char(d)
                          else
                              return "&#x"..x..";"
                          end
                      end,
                    }

    ---Forwards an error to the configured error handler, if any.
    --@param err Error message
    --@param pos Character position the error refers to
    obj._err = function(self,err,pos)
        if self.options.errorHandler then
            self.options.errorHandler(err,pos)
        end
    end

    obj._errstr = { xmlErr = "Error Parsing XML",
                    declErr = "Error Parsing XMLDecl",
                    declStartErr = "XMLDecl not at start of document",
                    declAttrErr = "Invalid XMLDecl attributes",
                    piErr = "Error Parsing Processing Instruction",
                    commentErr = "Error Parsing Comment",
                    cdataErr = "Error Parsing CDATA",
                    dtdErr = "Error Parsing DTD",
                    endTagErr = "End Tag Attributes Invalid",
                    unmatchedTagErr = "Unbalanced Tag",
                    incompleteXmlErr = "Incomplete XML Document",
                  }

    ---Removes leading/trailing whitespace when `options.stripWS` is set.
    --@param s Input string
    --@return The (possibly trimmed) string
    obj._stripWS = function(self,s)
        if self.options.stripWS then
            s = string.gsub(s,self._LEADINGWS,'')
            s = string.gsub(s,self._TRAILINGWS,'')
        end
        return s
    end

    ---Expands entity references when `options.expandEntities` is set.
    -- The string is scanned once, so text produced by one expansion is never
    -- expanded again (`&amp;lt;` yields `&lt;`, not `<`), and the result does
    -- not depend on the iteration order of the `_ENTITIES` table.
    --@param s Input string
    --@return The string with known references replaced
    obj._parseEntities = function(self,s)
        if not self.options.expandEntities then
            return s
        end
        local entities = self._ENTITIES
        local expanded = string.gsub(s,'&[^&;%s]+;',function(ref)
            -- Fast path: reference stored verbatim as a key
            local literal = entities[ref]
            if type(literal) == "string" then
                return literal
            end
            -- Otherwise treat every key as a pattern for a whole reference
            for pattern,replacement in pairs(entities) do
                if string.find(ref,'^' .. pattern .. '$') then
                    return (string.gsub(ref,pattern,replacement))
                end
            end
            -- Unknown reference: returning nothing leaves it untouched
        end)
        return expanded
    end

    ---Matches a DOCTYPE declaration against the known DTD layouts.
    --@param s String to search
    --@param pos Position to start searching from
    --@return match start, match end and a table describing the DTD
    --(_root, _type, _name, _uri, _internal as available), or nil
    obj._parseDTD = function(self,s,pos)
        -- match,endmatch,root,type,name,uri,internal
        local m,e,r,t,n,u,i

        m,e,r,t,u,i = string.find(s,self._DTD1,pos)
        if m then
            return m,e,{_root=r,_type=t,_uri=u,_internal=i}
        end

        m,e,r,t,n,u,i = string.find(s,self._DTD2,pos)
        if m then
            return m,e,{_root=r,_type=t,_name=n,_uri=u,_internal=i}
        end

        m,e,r,i = string.find(s,self._DTD3,pos)
        if m then
            return m,e,{_root=r,_internal=i}
        end

        m,e,r,t,u = string.find(s,self._DTD4,pos)
        if m then
            return m,e,{_root=r,_type=t,_uri=u}
        end

        m,e,r,t,n,u = string.find(s,self._DTD5,pos)
        if m then
            return m,e,{_root=r,_type=t,_name=n,_uri=u}
        end

        return nil
    end

    ---Parses a string representing a tag
    --@param s String containing tag text
    --@return Returns a string containing the tagname and a table attrs
    --containing the attributes of the tag (keys lower-cased, values
    --entity-expanded), or nil when the tag has no attributes
    obj._parseTag = function(self,s)
        local attrs = {}
        -- Tracked separately so that an attribute literally named "_"
        -- is not mistaken for bookkeeping and dropped
        local found = false
        local tagname = string.gsub(s,self._TAG,'%1')
        local function collect(k,v)
            attrs[string.lower(k)] = self:_parseEntities(v)
            found = true
        end
        string.gsub(s,self._ATTR1,collect)
        string.gsub(s,self._ATTR2,collect)
        if not found then
            attrs = nil
        end
        return tagname,attrs
    end

    return obj
end
-- Module export: expose the parser constructor under the `xmlParser` key
local exports = {}
exports.xmlParser = xmlParser
return exports