Skip to content

Commit b56ea86

Browse files
author
Gavin Kistner
committed
Version 0.3; namespace support, entity testing, dom parser
1 parent 7f2b082 commit b56ea86

File tree

4 files changed

+174
-95
lines changed

4 files changed

+174
-95
lines changed

README.md

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
# SLAXML
22
SLAXML is a pure-Lua SAX-like streaming XML parser. It is more robust than
3-
many (simpler) pattern-based parsers that exist ([such as mine][1]), properly supporting code like `<expr test="5 > 7" />`, CDATA nodes, comments, and processing instructions.
3+
many (simpler) pattern-based parsers that exist ([such as mine][1]), properly
4+
supporting code like `<expr test="5 > 7" />`, CDATA nodes, comments, namespaces,
5+
and processing instructions.
46

5-
It is currently not a truly valid XML parser, however, as it allows completely invalid XML such
6-
as `<foo></bar>` to be parsed (and reported) as such. It is also not namespace-aware. See the "Limitations / TODO" section below for more details.
7+
It is currently not a truly valid XML parser, however, as it allows some invalid
8+
XML such as `<foo></bar>` to be parsed (and reported) as such.
9+
See the "Limitations / TODO" section below for more details.
710

811
[1]: http://phrogz.net/lua/AKLOMParser.lua
912

@@ -14,12 +17,13 @@ as `<foo></bar>` to be parsed (and reported) as such. It is also not namespace-a
1417

1518
-- Specify as many/few of these as you like
1619
parser = SLAXML:parser{
17-
startElement = function(name) end, -- When "<foo" is seen
18-
attribute = function(name,value) end, -- attribute found
19-
closeElement = function(name) end, -- When "</foo" or "/>" is seen
20-
text = function(text) end, -- text and CDATA nodes
21-
comment = function(content) end, -- comments
22-
pi = function(target,content) end, -- processing instructions e.g. "<?yes mon?>"
20+
startElement = function(name,nsURI) end, -- When "<foo" or "<x:foo" is seen
21+
attribute = function(name,value,nsURI) end, -- attribute found on current element
22+
closeElement = function(name) end, -- When "</foo" or "/>" is seen
23+
text = function(text) end, -- text and CDATA nodes
24+
comment = function(content) end, -- comments
25+
pi = function(target,content) end, -- processing instructions e.g. "<?yes mon?>"
26+
namespace = function(nsURI) end, -- when xmlns="..." is seen (after startElement)
2327
}
2428

2529
myxml = io.open('my.xml'):read('*all')
@@ -32,8 +36,39 @@ If you just want to see if it parses your document correctly, you can also use j
3236

3337
…which will cause SLAXML to use its built-in callbacks that print the results as seen.
3438

39+
If you want to build a table object model from your XML (with simple collections like
40+
`.kids` and `.attr` for navigating the hierarchy) then you can alternatively:
41+
42+
require 'slaxdom'
43+
local doc = SLAXML:dom(myxml)
44+
print( doc.root.name )
45+
print( doc.root.nsURI )
46+
print( doc.root.attr['version'] )
47+
for i,node in ipairs(doc.root.kids) do
48+
-- includes elements, comments, textnodes and PIs
49+
print("Child #",i,"is",node.type,node.name)
50+
end
51+
for i,el in ipairs(doc.root.el) do
52+
-- includes only elements
53+
print("Element #",i,"is",el.name)
54+
for name,value in pairs(el.attr) do
55+
print("",name,"=",value)
56+
end
57+
end
58+
59+
3560
# History
3661

62+
## v0.3 2013-Feb-15
63+
### Features
64+
+ Support namespaces for elements and attributes
65+
+ `<foo xmlns="bar">` will call `startElement("foo",nil)` followed by `namespace("bar")`
66+
+ Child elements inheriting the default namespace will call `startElement("child","bar")`
67+
+ `<xy:foo>` will call `startElement("foo","uri-for-xy-namespace")` or error if not found
68+
+ `<foo xy:bar="yay">` will call `attribute("bar","yay","uri-for-xy-namespace")` or error if not found
69+
+ Add (optional) DOM parser that validates hierarchy and supports namespaces
70+
- Except that namespaced attributes with the same name will collide
71+
3772
## v0.2 2013-Feb-15
3873
### Features
3974
+ Supports expanding numeric entities e.g. `&#34;` -> `"`
@@ -49,13 +84,8 @@ If you just want to see if it parses your document correctly, you can also use j
4984

5085
### Limitations / TODO
5186
- Does not require or enforce well-formed XML (or report/fail on invalid)
52-
- No support for namespaces:
53-
- `xmlns="…"` attributes look like any other
54-
- `xmlns:foo="…"` attributes will report name as "xmlns:foo"
55-
- `<foo:bar>` elements will report name as "foo:bar"
56-
- `foo:bar="…"` attributes will report name as "foo:bar"
5787
- No support for entity expansion other than
58-
`&lt; &gt; &quot; &apos; &amp;`
88+
`&lt; &gt; &quot; &apos; &amp;` and numeric ASCII entities like `&#10;`
5989
- XML Declarations (`<?xml version="1.x"?>`) are incorrectly reported
6090
as Processing Instructions
6191
- No support for DTDs

slaxdom.lua

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
-- Optional parser that creates a flat DOM from parsing
2+
require 'slaxml'
3+
--- Parse an XML string into a simple table-based document object model.
--
-- The returned document is `{ type="document", name="#doc", kids={...}, root=<element> }`.
-- Every element is `{ type="element", name=..., nsURI=..., attr={}, kids={}, el={} }`:
--   * `kids` holds all child nodes in order (elements, text, comments, PIs)
--   * `el`   holds only the child elements
--   * `attr` maps attribute name to value (namespaced attributes that share
--            a local name overwrite each other)
--   * `text` accumulates the concatenated text of the element's direct text nodes
-- Text that appears directly at the document level is dropped.
--
-- NOTE: this overwrites the module-wide `SLAXML.ignoreWhitespace` setting with
-- the supplied value (including nil), so it also affects later parses.
--
-- @param xml              the XML source string handed to the parser
-- @param ignoreWhitespace value stored in `SLAXML.ignoreWhitespace` before parsing
-- @return the document table described above
-- Raises (via `error`) when a second root element is seen, when a close tag does
-- not match the currently open element, or when an attribute/text notification
-- arrives outside of an element.
function SLAXML:dom(xml,ignoreWhitespace)
  SLAXML.ignoreWhitespace = ignoreWhitespace
  local push, pop = table.insert, table.remove
  local stack = {}   -- currently open elements, innermost last (never contains doc)
  local doc = { type="document", name="#doc", kids={} }
  local current = doc -- node that receives the next child/attribute
  local builder = SLAXML:parser{
    startElement = function(name,nsURI)
      local el = { type="element", name=name, kids={}, el={}, attr={}, nsURI=nsURI }
      if current==doc then
        if doc.root then
          error(("Encountered element '%s' when the document already has a root '%s' element"):format(name,doc.root.name))
        else
          doc.root = el
        end
      end
      if current.type~="element" and current.type~="document" then
        error(("Encountered an element inside of a %s"):format(current.type))
      else
        push(current.kids,el)
        -- the document has no `el` collection, only elements do
        if current.el then push(current.el,el) end
      end
      current = el
      push(stack,el)
    end,
    namespace = function(nsURI)
      -- a default xmlns="..." is reported after startElement; apply it to that element
      current.nsURI = nsURI
    end,
    attribute = function(name,value,nsURI)
      if not current or current.type~="element" then
        error(("Encountered an attribute %s=%s but I wasn't inside an element"):format(name,value))
      else
        -- TODO: differentiate namespaced attributes
        current.attr[name] = value
      end
    end,
    closeElement = function(name)
      if current.name~=name or current.type~="element" then
        error(("Received a close element notification for '%s' but was inside a '%s' %s"):format(name,current.name,current.type))
      end
      pop(stack)
      -- Once the root element is closed the stack is empty; fall back to the
      -- document so that trailing comments/PIs are recorded and a second root
      -- element raises the intended error instead of indexing nil.
      current = stack[#stack] or doc
    end,
    text = function(value)
      if current.type~='document' then
        if current.type~="element" then
          error(("Received a text notification '%s' but was inside a %s"):format(value,current.type))
        else
          push(current.kids,{type='text',name='#text',value=value,text=value})
          if current.text then current.text = current.text..value else current.text=value end
        end
      end
    end,
    comment = function(value)
      push(current.kids,{type='comment',name='#comment',value=value,text=value})
    end,
    pi = function(name,value)
      push(current.kids,{type='pi',name=name,value=value})
    end
  }
  builder:parse(xml)
  return doc
end

slaxml.lua

Lines changed: 57 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
--[=====================================================================[
2-
v0.2 Copyright © 2013 Gavin Kistner <!@phrogz.net>; MIT Licensed
2+
v0.3 Copyright © 2013 Gavin Kistner <!@phrogz.net>; MIT Licensed
33
See http://github.com/Phrogz/SLAXML for details.
44
--]=====================================================================]
55
SLAXML = {
6-
VERSION = "0.2",
6+
VERSION = "0.3",
77
ignoreWhitespace = true,
88
_call = {
99
pi = function(target,content)
@@ -12,18 +12,21 @@ SLAXML = {
1212
comment = function(content)
1313
print(string.format("<!-- %s -->",content))
1414
end,
15-
startElement = function(name)
16-
print(string.format("<%s>",name))
15+
startElement = function(name,nsURI)
16+
print(string.format("<%s %s>",name,nsURI or "-"))
1717
end,
18-
attribute = function(name,value)
19-
print(string.format(" %s=%q",name,value))
18+
attribute = function(name,value,nsURI)
19+
print(string.format(" %s=%q (%s)",name,value,nsURI or "-"))
2020
end,
2121
text = function(text)
2222
print(string.format(" text: %q",text))
2323
end,
2424
closeElement = function(name)
2525
print(string.format("</%s>",name))
2626
end,
27+
namespace = function(nsURI) -- applies a default namespace to the current element
28+
print(string.format(" (xmlns=%s)",nsURI))
29+
end,
2730
}
2831
}
2932

@@ -33,13 +36,14 @@ end
3336

3437
function SLAXML:parse(xml)
3538
-- Cache references for maximum speed
36-
local find, sub, gsub, char = string.find, string.sub, string.gsub, string.char
39+
local find, sub, gsub, char, push, pop = string.find, string.sub, string.gsub, string.char, table.insert, table.remove
3740
-- local sub, gsub, find, push, pop, unescape = string.sub, string.gsub, string.find, table.insert, table.remove, unescape
38-
local first, last, match1, match2, pos2
41+
local first, last, match1, match2, match3, pos2, nsURI
3942
local pos = 1
4043
local state = "text"
4144
local textStart = 1
4245
local currentElement
46+
local nsStack = {}
4347

4448
local entityMap = { ["lt"]="<", ["gt"]=">", ["amp"]="&", ["quot"]='"', ["apos"]="'" }
4549
local entitySwap = function(orig,n,s) return entityMap[s] or n=="#" and char(s) or orig end
@@ -79,13 +83,29 @@ function SLAXML:parse(xml)
7983
end
8084
end
8185

86+
local function nsForPrefix(prefix)
87+
for i=#nsStack,1,-1 do if nsStack[i][prefix] then return nsStack[i][prefix] end end
88+
error(("Cannot find namespace for prefix %s"):format(prefix))
89+
end
90+
8291
local function startElement()
83-
first, last, match1 = find( xml, '^<([:%a_][:%w_.-]*)', pos )
92+
first, last, match1 = find( xml, '^<([%a_][%w_.-]*)', pos )
8493
if first then
94+
nsURI = nil
8595
finishText()
86-
currentElement = match1
87-
if self._call.startElement then self._call.startElement(match1) end
8896
pos = last+1
97+
first,last,match2 = find(xml, '^:([%a_][%w_.-]*)', pos )
98+
if first then
99+
nsURI = nsForPrefix(match1)
100+
currentElement = match2
101+
match1 = match2
102+
pos = last+1
103+
else
104+
currentElement = match1
105+
for i=#nsStack,1,-1 do if nsStack[i]['!'] then nsURI = nsStack[i]['!']; break end end
106+
end
107+
if self._call.startElement then self._call.startElement(match1,nsURI) end
108+
push(nsStack,{})
89109
return true
90110
end
91111
end
@@ -96,18 +116,34 @@ function SLAXML:parse(xml)
96116
pos2 = last+1
97117
first, last, match2 = find( xml, '^"([^<"]+)"', pos2 ) -- FIXME: disallow non-entity ampersands
98118
if first then
99-
if self._call.attribute then self._call.attribute(match1,unescape(match2)) end
100119
pos = last+1
101-
return true
120+
match2 = unescape(match2)
102121
else
103122
first, last, match2 = find( xml, "^'([^<']+)'", pos2 ) -- FIXME: disallow non-entity ampersands
104123
if first then
105-
-- TODO: unescape entities in match2
106-
if self._call.attribute then self._call.attribute(match1,unescape(match2)) end
107124
pos = last+1
108-
return true
125+
match2 = unescape(match2)
126+
end
127+
end
128+
end
129+
if match1 and match2 then
130+
nsURI = nil
131+
local prefix,name = string.match(match1,'^([^:]+):([^:]+)$')
132+
if prefix then
133+
if prefix=='xmlns' then
134+
nsStack[#nsStack][name] = match2
135+
else
136+
nsURI = nsForPrefix(prefix)
137+
match1 = name
138+
end
139+
else
140+
if match1=='xmlns' then
141+
nsStack[#nsStack]['!'] = match2
142+
if self._call.namespace then self._call.namespace(match2) end
109143
end
110144
end
145+
if self._call.attribute then self._call.attribute(match1,match2,nsURI) end
146+
return true
111147
end
112148
end
113149

@@ -128,7 +164,10 @@ function SLAXML:parse(xml)
128164
state = "text"
129165
pos = last+1
130166
textStart = pos
131-
if match1=="/" and self._call.closeElement then self._call.closeElement(currentElement) end
167+
if match1=="/" then
168+
pop(nsStack)
169+
if self._call.closeElement then self._call.closeElement(currentElement) end
170+
end
132171
return true
133172
end
134173
end
@@ -140,6 +179,7 @@ function SLAXML:parse(xml)
140179
if self._call.closeElement then self._call.closeElement(match1) end
141180
pos = last+1
142181
textStart = pos
182+
pop(nsStack)
143183
return true
144184
end
145185
end
@@ -162,64 +202,4 @@ function SLAXML:parse(xml)
162202
end
163203
end
164204
end
165-
end
166-
167-
function SLAXML:dom(xml,ignoreWhitespace,slim)
168-
SLAXML.ignoreWhitespace = ignoreWhitespace
169-
local push, pop = table.insert, table.remove
170-
local stack = {}
171-
local doc = { type="document", name="#doc", kids={} }
172-
local current = doc
173-
local builder = SLAXML:parser{
174-
startElement = function(name)
175-
local el = { type="element", name=name, kids={}, el={}, attr={} }
176-
if current==doc then
177-
if doc.root then
178-
error(("Encountered element '%s' when the document already has a root '%s' element"):format(name,doc.root.name))
179-
else
180-
doc.root = el
181-
end
182-
end
183-
if current.type~="element" and current.type~="document" then
184-
error(("Encountered an element inside of a %s"):format(current.type))
185-
else
186-
push(current.kids,el)
187-
if current.el then push(current.el,el) end
188-
end
189-
current = el
190-
push(stack,el)
191-
end,
192-
attribute = function(name,value)
193-
if not current or current.type~="element" then
194-
error(("Encountered an attribute %s=%s but I wasn't inside an element"):format(name,value))
195-
else
196-
current.attr[name] = value
197-
end
198-
end,
199-
closeElement = function(name)
200-
if current.name~=name or current.type~="element" then
201-
error(("Received a close element notification for '%s' but was inside a '%s' %s"):format(name,current.name,current.type))
202-
end
203-
pop(stack)
204-
current = stack[#stack]
205-
end,
206-
text = function(value)
207-
if current.type~='document' then
208-
if current.type~="element" then
209-
error(("Received a text notification '%s' but was inside a %s"):format(value,current.type))
210-
else
211-
push(current.kids,{type='text',name='#text',value=value,text=value})
212-
if current.text then current.text = current.text..value else current.text=value end
213-
end
214-
end
215-
end,
216-
comment = function(value)
217-
push(current.kids,{type='comment',name='#comment',value=value,text=value})
218-
end,
219-
pi = function(name,value)
220-
push(current.kids,{type='pi',name=name,value=value})
221-
end
222-
}
223-
builder:parse(xml)
224-
return doc
225205
end

0 commit comments

Comments
 (0)