mirror of
https://github.com/koreader/koreader
synced 2024-11-13 19:11:25 +00:00
d6c81f5097
* Simple News (RSS/Atom) downloader plugin * Limit news from sources * new News Downloader menu entry - goto news dir * Initial configuration logic * Download each feed source to separate folder * 2 feed sources in example config
484 lines
17 KiB
Lua
484 lines
17 KiB
Lua
---
|
|
-- Overview:
|
|
-- =========
|
|
--
|
|
-- This module provides a non-validating XML stream parser in Lua.
|
|
--
|
|
-- Features:
|
|
-- =========
|
|
--
|
|
-- * Tokenises well-formed XML (relatively robustly)
|
|
-- * Flexible handler based event API (see below)
|
|
-- * Parses all XML Infoset elements - ie.
|
|
-- - Tags
|
|
-- - Text
|
|
-- - Comments
|
|
-- - CDATA
|
|
-- - XML Decl
|
|
-- - Processing Instructions
|
|
-- - DOCTYPE declarations
|
|
-- * Provides limited well-formedness checking
|
|
-- (checks for basic syntax & balanced tags only)
|
|
-- * Flexible whitespace handling (selectable)
|
|
-- * Entity Handling (selectable)
|
|
--
|
|
-- Limitations:
|
|
-- ============
|
|
--
|
|
-- * Non-validating
|
|
-- * No charset handling
|
|
-- * No namespace support
|
|
-- * Shallow well-formedness checking only (fails
|
|
-- to detect most semantic errors)
|
|
--
|
|
-- API:
|
|
-- ====
|
|
--
|
|
-- The parser provides a partially object-oriented API with
|
|
-- functionality split into tokeniser and handler components.
|
|
--
|
|
-- The handler instance is passed to the tokeniser and receives
|
|
-- callbacks for each XML element processed (if a suitable handler
|
|
-- function is defined). The API is conceptually similar to the
|
|
-- SAX API but implemented differently.
|
|
--
|
|
-- The following events are generated by the tokeniser
|
|
--
|
|
-- handler:starttag - Start Tag
-- handler:endtag - End Tag
|
|
-- handler:text - Text
|
|
-- handler:decl - XML Declaration
|
|
-- handler:pi - Processing Instruction
|
|
-- handler:comment - Comment
|
|
-- handler:dtd - DOCTYPE definition
|
|
-- handler:cdata - CDATA
|
|
--
|
|
-- The function prototype for all the callback functions is
|
|
--
|
|
-- callback(val,attrs,start,end)
|
|
--
|
|
-- where attrs is a table and val/attrs are overloaded for
|
|
-- specific callbacks - ie.
|
|
--
|
|
-- Callback val attrs (table)
|
|
-- -------- --- -------------
|
|
-- starttag name { attributes (name=val).. }
-- endtag name nil
|
|
-- text <text> nil
|
|
-- cdata <text> nil
|
|
-- decl "xml" { attributes (name=val).. }
|
|
-- pi pi name { attributes (if present)..
|
|
-- _text = <PI Text>
|
|
-- }
|
|
-- comment <text> nil
|
|
-- dtd root element { _root = <Root Element>,
|
|
-- _type = SYSTEM|PUBLIC,
|
|
-- _name = <name>,
|
|
-- _uri = <uri>,
|
|
-- _internal = <internal dtd>
|
|
-- }
|
|
--
|
|
-- (start & end provide the character positions of the start/end
|
|
-- of the element)
|
|
--
|
|
-- XML data is passed to the parser instance through the 'parse'
|
|
-- method (Note: must be passed a single string currently)
|
|
--
|
|
-- Options
|
|
-- =======
|
|
--
|
|
-- Parser options are controlled through the 'self.options' table.
|
|
-- Available options are -
|
|
--
|
|
-- * stripWS
|
|
--
|
|
-- Strip non-significant whitespace (leading/trailing)
|
|
-- and do not generate events for empty text elements
|
|
--
|
|
-- * expandEntities
|
|
--
|
|
-- Expand entities (standard entities + single char
|
|
-- numeric entities only currently - could be extended
|
|
-- at runtime if suitable DTD parser added elements
|
|
-- to table (see obj._ENTITIES). May also be possible
|
|
-- to expand multibyte entities for UTF-8 only)
|
|
--
|
|
-- * errorHandler
|
|
--
|
|
-- Custom error handler function
|
|
--
|
|
-- NOTE: To disable a boolean option set it to nil (or false), not 0 - the number 0 is truthy in Lua
|
|
--
|
|
-- Usage
|
|
-- =====
|
|
--
|
|
-- Create a handler instance -
|
|
--
|
|
-- h = { starttag = function(self,t,a,s,e) .... end,
--       endtag = function(self,t,a,s,e) .... end,
--       text = function(self,t,a,s,e) .... end,
--       cdata = function(self,t,a,s,e) .... end }
|
|
--
|
|
-- (or use predefined handler - see handler.lua)
|
|
--
|
|
-- Create parser instance -
|
|
--
|
|
-- p = xmlParser(h)
|
|
--
|
|
-- Set options -
|
|
--
|
|
-- p.options.xxxx = nil
|
|
--
|
|
-- Parse XML data -
|
|
--
|
|
-- p:parse("<?xml... ")
|
|
-- License:
|
|
-- ========
|
|
--
|
|
-- This code is freely distributable under the terms of the Lua license
|
|
-- (http://www.lua.org/copyright.html)
|
|
--
|
|
-- History
|
|
-- =======
|
|
-- Added parameter parseAttributes (boolean) in xmlParser.parse method
|
|
-- If true (default value), tag attributes are parsed.
|
|
-- by Manoel Campos da Silva Filho
|
|
-- http://manoelcampos.com
|
|
-- http://about.me/manoelcampos
|
|
|
|
--
|
|
-- $Id: xml.lua,v 1.1.1.1 2001/11/28 06:11:33 paulc Exp $
|
|
--
|
|
-- $Log: xml.lua,v $
|
|
-- Revision 1.1.1.1 2001/11/28 06:11:33 paulc
|
|
-- Initial Import
|
|
--
|
|
--@author Paul Chakravarti (paulc@passtheaardvark.com)<p/>
|
|
|
|
|
|
---Creates a non-validating XML stream parser bound to a handler object.
--@param handler Handler object whose optional callbacks (decl, pi, comment,
--dtd, cdata, text, starttag, endtag) receive the parse events. Callbacks are
--invoked with method syntax, so each one receives the handler itself followed
--by (val, attrs, startpos, endpos). @see handler.lua
--@return Parser object exposing the `options` table and the `parse` method
local xmlParser = function(handler)
    local obj = {}

    -- Public attributes

    obj.options = {
        -- Any truthy value enables an option; set it to nil/false to disable.
        stripWS = 1,
        expandEntities = 1,
        errorHandler = function(err,pos)
            error(string.format("%s [char=%d]\n",
                                err or "Parse Error",pos))
        end,
    }

    -- Public methods

    ---Tokenises an XML document and dispatches events to the handler.
    --@param self Parser object
    --@param str Complete XML document as a single string
    --@param parseAttributes Boolean stored on the handler as
    --`handler.parseAttributes` (defaults to true when nil)
    obj.parse = function(self, str, parseAttributes)
        if parseAttributes == nil then
            parseAttributes = true
        end
        self._handler.parseAttributes = parseAttributes

        -- Start from a clean tag stack so that a parser instance can be
        -- reused after a previous (possibly failed) parse.
        self._stack = {}

        local match,endmatch,pos = 0,0,1
        local text,endt1,endt2,tagstr,tagname,attrs,starttext,endtext
        local extstart,extend,tagstart

        while true do
            -- Get next tag (first pass - fix exceptions below)
            match,endmatch,text,endt1,tagstr,endt2 = string.find(str,self._XML,pos)
            if not match then
                if string.find(str,self._WS,pos) then
                    -- No more text - check document complete
                    if #self._stack ~= 0 then
                        self:_err(self._errstr.incompleteXmlErr,pos)
                    end
                else
                    -- Unparsable text
                    self:_err(self._errstr.xmlErr,pos)
                end
                -- Nothing left to tokenise (also reached when a custom
                -- error handler chose not to raise).
                return
            end

            -- Handle leading text
            starttext = match
            endtext = match + string.len(text) - 1
            -- From here on 'match' is the position of the '<' opening the tag
            match = match + string.len(text)
            text = self:_parseEntities(self:_stripWS(text))
            if text ~= "" and self._handler.text then
                self._handler:text(text,nil,starttext,endtext)
            end

            -- Test for tag type
            if string.find(string.sub(tagstr,1,5),"?xml%s") then
                -- XML Declaration
                match,endmatch,text = string.find(str,self._PI,pos)
                if not match then
                    self:_err(self._errstr.declErr,pos)
                    return
                end
                if match ~= 1 then
                    -- Must be at start of doc if present
                    self:_err(self._errstr.declStartErr,pos)
                end
                tagname,attrs = self:_parseTag(text)
                -- TODO: Check attributes are valid
                -- Check for version (mandatory); attrs is nil when the
                -- declaration carries no attributes at all
                if attrs == nil or attrs.version == nil then
                    self:_err(self._errstr.declAttrErr,pos)
                end
                if self._handler.decl then
                    self._handler:decl(tagname,attrs,match,endmatch)
                end
            elseif string.sub(tagstr,1,1) == "?" then
                -- Processing Instruction
                match,endmatch,text = string.find(str,self._PI,pos)
                if not match then
                    self:_err(self._errstr.piErr,pos)
                    return
                end
                if self._handler.pi then
                    -- Parse PI attributes & text
                    tagname,attrs = self:_parseTag(text)
                    local pi = string.sub(text,string.len(tagname)+1)
                    if pi ~= "" then
                        if attrs then
                            attrs._text = pi
                        else
                            attrs = { _text = pi }
                        end
                    end
                    self._handler:pi(tagname,attrs,match,endmatch)
                end
            elseif string.sub(tagstr,1,3) == "!--" then
                -- Comment
                match,endmatch,text = string.find(str,self._COMMENT,pos)
                if not match then
                    self:_err(self._errstr.commentErr,pos)
                    return
                end
                if self._handler.comment then
                    text = self:_parseEntities(self:_stripWS(text))
                    self._handler:comment(text,nil,match,endmatch)
                end
            elseif string.sub(tagstr,1,8) == "!DOCTYPE" then
                -- DTD
                match,endmatch,attrs = self:_parseDTD(str,pos)
                if not match then
                    self:_err(self._errstr.dtdErr,pos)
                    return
                end
                if self._handler.dtd then
                    self._handler:dtd(attrs._root,attrs,match,endmatch)
                end
            elseif string.sub(tagstr,1,8) == "![CDATA[" then
                -- CDATA
                match,endmatch,text = string.find(str,self._CDATA,pos)
                if not match then
                    self:_err(self._errstr.cdataErr,pos)
                    return
                end
                if self._handler.cdata then
                    self._handler:cdata(text,nil,match,endmatch)
                end
            else
                -- Normal tag

                -- Need check for embedded '>' in attribute value and extend
                -- match if necessary eg. <tag attr="123>456">.
                -- The tag text is re-sliced from the document on every
                -- extension so that a '/' swallowed by the first pass
                -- (eg. <tag attr="1/>2">) is not lost.
                tagstart = match + 1 + string.len(endt1)
                while string.find(tagstr,self._ATTRERR1)
                      or string.find(tagstr,self._ATTRERR2) do
                    extstart,extend,endt2 = string.find(str,self._TAGEXT,endmatch+1)
                    if not extstart then
                        -- Unterminated attribute value / tag
                        self:_err(self._errstr.xmlErr,pos)
                        return
                    end
                    tagstr = string.sub(str,tagstart,extstart-1)
                    endmatch = extend
                end

                -- Extract tagname/attrs
                tagname,attrs = self:_parseTag(tagstr)

                if endt1 == "/" then
                    -- End tag
                    if attrs then
                        -- Shouldnt have any attributes in endtag
                        self:_err(string.format("%s (/%s)",
                                                self._errstr.endTagErr,
                                                tagname)
                                  ,pos)
                    end
                    -- Balance check is done whether or not the handler
                    -- defines 'endtag', otherwise the stack never empties
                    if table.remove(self._stack) ~= tagname then
                        self:_err(string.format("%s (/%s)",
                                                self._errstr.unmatchedTagErr,
                                                tagname)
                                  ,pos)
                    end
                    if self._handler.endtag then
                        self._handler:endtag(tagname,nil,match,endmatch)
                    end
                else
                    -- Start Tag
                    table.insert(self._stack,tagname)
                    if self._handler.starttag then
                        self._handler:starttag(tagname,attrs,match,endmatch)
                    end
                    --TODO: Self-closing tags are being returned as a table,
                    --which makes them awkward for the NCLua app to handle.
                    --An empty string field should be returned instead.
                    -- Self-Closing Tag
                    if endt2 == "/" then
                        table.remove(self._stack)
                        if self._handler.endtag then
                            self._handler:endtag(tagname,nil,match,endmatch)
                        end
                    end
                end
            end
            pos = endmatch + 1
        end
    end

    -- Private attributes/functions

    obj._handler = handler
    obj._stack = {}

    obj._XML = '^([^<]*)<(%/?)([^>]-)(%/?)>'
    obj._ATTR1 = '([%w-:_]+)%s*=%s*"(.-)"'
    obj._ATTR2 = '([%w-:_]+)%s*=%s*\'(.-)\''
    obj._CDATA = '<%!%[CDATA%[(.-)%]%]>'
    obj._PI = '<%?(.-)%?>'
    obj._COMMENT = '<!%-%-(.-)%-%->'
    obj._TAG = '^(.-)%s.*'
    obj._LEADINGWS = '^%s+'
    obj._TRAILINGWS = '%s+$'
    obj._WS = '^%s*$'
    obj._DTD1 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*(%b[])%s*>'
    obj._DTD2 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*(%b[])%s*>'
    obj._DTD3 = '<!DOCTYPE%s+(.-)%s*(%b[])%s*>'
    obj._DTD4 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*>'
    obj._DTD5 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*>'

    -- Detect an attribute value whose closing quote has not been seen yet
    obj._ATTRERR1 = '=%s*"[^"]*$'
    obj._ATTRERR2 = '=%s*\'[^\']*$'
    obj._TAGEXT = '(%/?)>'

    -- Entity table: keys are Lua patterns matching one complete reference,
    -- values are gsub replacements (string or function of the captures).
    obj._ENTITIES = { ["&lt;"] = "<",
                      ["&gt;"] = ">",
                      ["&amp;"] = "&",
                      ["&quot;"] = '"',
                      ["&apos;"] = "'",
                      ["&#(%d+);"] = function (x)
                                        local d = tonumber(x)
                                        if d >= 0 and d < 256 then
                                            return string.char(d)
                                        else
                                            -- Out of single-byte range: keep as written
                                            return "&#"..x..";"
                                        end
                                     end,
                      ["&#x(%x+);"] = function (x)
                                        local d = tonumber(x,16)
                                        if d >= 0 and d < 256 then
                                            return string.char(d)
                                        else
                                            return "&#x"..x..";"
                                        end
                                      end,
                    }

    ---Reports an error through options.errorHandler (no-op when unset).
    obj._err = function(self,err,pos)
        if self.options.errorHandler then
            self.options.errorHandler(err,pos)
        end
    end

    obj._errstr = { xmlErr = "Error Parsing XML",
                    declErr = "Error Parsing XMLDecl",
                    declStartErr = "XMLDecl not at start of document",
                    declAttrErr = "Invalid XMLDecl attributes",
                    piErr = "Error Parsing Processing Instruction",
                    commentErr = "Error Parsing Comment",
                    cdataErr = "Error Parsing CDATA",
                    dtdErr = "Error Parsing DTD",
                    endTagErr = "End Tag Attributes Invalid",
                    unmatchedTagErr = "Unbalanced Tag",
                    incompleteXmlErr = "Incomplete XML Document",
                  }

    ---Strips leading/trailing whitespace when options.stripWS is set.
    obj._stripWS = function(self,s)
        if self.options.stripWS then
            s = string.gsub(s,self._LEADINGWS,'')
            s = string.gsub(s,self._TRAILINGWS,'')
        end
        return s
    end

    ---Expands entity references when options.expandEntities is set.
    --Every reference is expanded exactly once in a single left-to-right
    --pass, so the result does not depend on the iteration order of
    --self._ENTITIES and "&amp;lt;" yields "&lt;" rather than "<".
    --@param s Text to process
    --@return Text with known references replaced; unknown ones are kept
    obj._parseEntities = function(self,s)
        if self.options.expandEntities then
            local entities = self._ENTITIES
            s = string.gsub(s,"&[^&;%s]+;",function (ref)
                    for pattern,replacement in pairs(entities) do
                        local expanded,count =
                            string.gsub(ref,"^"..pattern.."$",replacement)
                        if count > 0 then
                            return expanded
                        end
                    end
                    -- Unknown reference: returning nil keeps it unchanged
                    return nil
                end)
        end
        return s
    end

    ---Matches a DOCTYPE declaration at or after pos.
    --@return match start, match end and a table with the fields _root,
    --_type, _name, _uri, _internal (those present), or nil when no
    --DOCTYPE form matches
    obj._parseDTD = function(self,s,pos)
        -- match,endmatch,root,type,name,uri,internal
        local m,e,r,t,n,u,i
        m,e,r,t,u,i = string.find(s,self._DTD1,pos)
        if m then
            return m,e,{_root=r,_type=t,_uri=u,_internal=i}
        end
        m,e,r,t,n,u,i = string.find(s,self._DTD2,pos)
        if m then
            return m,e,{_root=r,_type=t,_name=n,_uri=u,_internal=i}
        end
        m,e,r,i = string.find(s,self._DTD3,pos)
        if m then
            return m,e,{_root=r,_internal=i}
        end
        m,e,r,t,u = string.find(s,self._DTD4,pos)
        if m then
            return m,e,{_root=r,_type=t,_uri=u}
        end
        m,e,r,t,n,u = string.find(s,self._DTD5,pos)
        if m then
            return m,e,{_root=r,_type=t,_name=n,_uri=u}
        end
        return nil
    end

    ---Parses a string representing a tag
    --@param s String containing tag text
    --@return Returns a string containing the tagname and a table attrs
    --containing the attributes of the tag (keys lower-cased, values
    --entity-expanded), or nil in place of the table when the tag has
    --no attributes
    obj._parseTag = function(self,s)
        local attrs = {}
        -- A local flag (rather than a sentinel key inside attrs) so that an
        -- attribute literally named "_" is not clobbered
        local found = false
        local tagname = string.gsub(s,self._TAG,'%1')
        local function collect(k,v)
            attrs[string.lower(k)] = self:_parseEntities(v)
            found = true
        end
        string.gsub(s,self._ATTR1,collect)
        string.gsub(s,self._ATTR2,collect)
        if not found then
            attrs = nil
        end
        return tagname,attrs
    end

    return obj

end
|
|
|
|
-- Module result: a table exposing the parser factory under the key `xmlParser`.
return { xmlParser = xmlParser }
|