// Public entry points.
// sax.SAXParser exposes the constructor itself; sax.parser(strict, opt) is a
// factory that builds a parser without the caller having to write `new`.
sax.SAXParser = SAXParser;
sax.parser = function (strict, opt) {
  return new SAXParser(strict, opt);
};
// Parser constructor. Also re-invoked as SAXParser.call(parser, ...) by end()
// to reset an existing parser, so every piece of per-document state must be
// (re)initialised here.
//
//   strict - truthy to report well-formedness problems through error()
//   opt    - optional settings object; this file reads opt.lowercasetags,
//            opt.trim and opt.normalize
function SAXParser (strict, opt) {
  // Every text accumulator starts out empty.
  this.c = this.comment = this.sgmlDecl =
    this.textNode = this.tagName = this.doctype =
    this.procInstName = this.procInstBody = this.entity =
    this.attribName = this.attribValue = this.q =
    this.cdata = "";

  // Must be assigned before tagCase is derived from it.
  this.opt = opt || {};
  this.tagCase = this.opt.lowercasetags ? "toLowerCase" : "toUpperCase";

  // Stack of currently open tags (pushed by openTag, popped by closeTag).
  this.tags = [];
  this.closed = this.closedRoot = this.sawRoot = false;
  this.tag = this.error = null;
  this.strict = !!strict;

  // NOTE(review): assumes the first entry of sax.STATE is named BEGIN (the
  // state that precedes TEXT); confirm against the enum declaration.
  this.state = S.BEGIN;

  // Per-parser entity table that inherits the shared defaults.
  this.ENTITIES = Object.create(sax.ENTITIES);

  // just for error reporting
  this.position = this.line = this.column = 0;
  emit(this, "onready");
}
// Instance API shared by all parsers.
SAXParser.prototype = {
  // Feed a chunk of text to the parser; `write` is the function declared
  // further down in this file (declarations are hoisted). close() depends on
  // this entry being present.
  write : write,
  // Clear a previously recorded error so that writing may continue.
  resume : function () { this.error = null; return this },
  // Signal end of input by writing null.
  close : function () { return this.write(null) }
};
// character classes and tokens
// Each class is a plain string of member characters, tested with is()/not().
var whitespace = "\r\n\t ", // XML white space: CR, LF, TAB, SPACE
  // this really needs to be replaced with character classes.
  // XML allows all manner of ridiculous numbers and digits.
  number = "0124356789",
  letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
  // (Letter | '_' | ':')
  nameStart = letter+"_:",
  nameBody = nameStart+number+"-.",
  // characters that open/close a quoted run (the active one is kept in parser.q)
  quote = "'\"",
  // characters allowed between "&" and ";"
  entity = number+letter+"#",
  // markers compared against the upper-cased text that follows "<!"
  CDATA = "[CDATA[",
  DOCTYPE = "DOCTYPE";
// Membership test: does the string `charclass` contain `c`?
function is (charclass, c) {
  var index = charclass.indexOf(c);
  return index !== -1;
}
// Negated membership test: true when `c` does not occur in `charclass`.
function not (charclass, c) {
  return charclass.indexOf(c) === -1;
}
// Parser states. `S` is first used as a counter so that every state name gets
// a distinct integer, and is rebound to the finished table at the bottom so
// the rest of the file can write S.TEXT, S.OPEN_TAG, ...
var S = 0;
sax.STATE =
{ BEGIN                     : S++ // NOTE(review): entry name assumed; state before any text or tag has been seen
, TEXT                      : S++ // general stuff
, TEXT_ENTITY               : S++ // & and such.
, OPEN_WAKA                 : S++ // <
, SGML_DECL                 : S++ // <!BLARG
, SGML_DECL_QUOTED          : S++ // <!BLARG foo "bar
, DOCTYPE                   : S++ // <!DOCTYPE
, DOCTYPE_QUOTED            : S++ // <!DOCTYPE "//blah
, DOCTYPE_DTD               : S++ // <!DOCTYPE "//blah" [ ...
, DOCTYPE_DTD_QUOTED        : S++ // <!DOCTYPE "//blah" [ "foo
, COMMENT_STARTING          : S++ // <!-
, COMMENT                   : S++ // <!--
, COMMENT_ENDING            : S++ // <!-- blah -
, COMMENT_ENDED             : S++ // <!-- blah --
, CDATA                     : S++ // <![CDATA[ something
, CDATA_ENDING              : S++ // ]
, CDATA_ENDING_2            : S++ // ]]
, PROC_INST                 : S++ // <?hi
, PROC_INST_BODY            : S++ // <?hi there
, PROC_INST_QUOTED          : S++ // <?hi "there
, PROC_INST_ENDING          : S++ // <?hi there ?
, OPEN_TAG                  : S++ // <strong
, OPEN_TAG_SLASH            : S++ // <strong /
, ATTRIB                    : S++ // <a   (inside a tag, between attributes)
, ATTRIB_NAME               : S++ // <a foo
, ATTRIB_NAME_SAW_WHITE     : S++ // <a foo _
, ATTRIB_VALUE              : S++ // <a foo=
, ATTRIB_VALUE_QUOTED       : S++ // <a foo="bar
, ATTRIB_VALUE_UNQUOTED     : S++ // <a foo=bar
, ATTRIB_VALUE_ENTITY_Q     : S++ // <foo bar="&quot;
, ATTRIB_VALUE_ENTITY_U     : S++ // <foo bar=&quot;
, CLOSE_TAG                 : S++ // </a
, CLOSE_TAG_SAW_WHITE       : S++ // </a   >
};

// Reverse lookup (number -> name), e.g. for "Unknown state" diagnostics.
// Iterate over a snapshot of the names so the numeric keys added here are
// never themselves visited.
Object.keys(sax.STATE).forEach(function (name) {
  sax.STATE[sax.STATE[name]] = name;
});

// shorthand used throughout the state machine
S = sax.STATE;
// Names of the events a parser can emit, exported for discoverability.
// A handler for event "x" is looked up on the parser as "onx".
sax.EVENTS = [
  "text",
  "processinginstruction",
  "sgmldeclaration",
  "doctype",
  "comment",
  "attribute",
  "opentag",
  "closetag",
  "cdata",
  "error",
  "end",
  "ready"
];
// Invoke the handler stored on the parser under `event` (e.g. "ontext"),
// passing `data`. Does nothing when no handler is installed.
function emit (parser, event, data) {
  if (!parser[event]) return;
  parser[event](data);
}
// Emit a structural event (tag, comment, cdata, ...). Any text buffered so far
// is flushed first so that handlers observe events in document order.
function emitNode (parser, nodeType, data) {
  var hasPendingText = Boolean(parser.textNode);
  if (hasPendingText) {
    closeText(parser);
  }
  emit(parser, nodeType, data);
}
// Flush the buffered text node: apply the text options, emit "ontext" when
// anything is left, then empty the buffer.
function closeText (parser) {
  var processed = textopts(parser.opt, parser.textNode);
  // The processed text is stored back before emitting, so a handler that
  // inspects parser.textNode sees the same value it is handed.
  parser.textNode = processed;
  if (processed) {
    emit(parser, "ontext", processed);
  }
  parser.textNode = "";
}
// Apply the text-related parser options to a string and return the result.
//   opt.trim      - strip leading and trailing white space
//   opt.normalize - collapse every run of white space into one space
// Callers assign the return value back to their buffer, so the processed
// string must always be returned, including when no option is set.
function textopts (opt, text) {
  var result = text;
  if (opt.trim) result = result.trim();
  if (opt.normalize) result = result.replace(/\s+/g, " ");
  return result;
}
// Record and report a parse error.
//   parser - the parser the error belongs to
//   er     - human-readable description; position details are appended
// The error is stored on parser.error (write() refuses to continue while it
// is set, resume() clears it) and handed to the "onerror" handler.
// Returns the parser, since write() returns this function's result.
function error (parser, er) {
  var message = er +
    "\nLine: " + parser.line +
    "\nColumn: " + parser.column +
    "\nChar: " + parser.c; // parser.c is the character write() was processing
  // Wrap in an Error so that whatever write() later throws carries a stack.
  var failure = new Error(message);
  parser.error = failure;
  emit(parser, "onerror", failure);
  return parser;
}
// Finish the current document (reached through write(null) / close()).
// Reports "Unexpected end" when input stops anywhere but in plain text,
// flushes text that is still buffered, emits "onend", and then re-runs the
// constructor on the same object so the parser can be reused.
// Returns the parser.
function end (parser) {
  if (parser.state !== S.TEXT) error(parser, "Unexpected end");
  // Without this flush, trailing text would be discarded by the reset below.
  closeText(parser);
  parser.closed = true;
  emit(parser, "onend");
  // Resets all state, including `closed`, and emits "onready" again.
  SAXParser.call(parser, parser.strict, parser.opt);
  return parser;
}
// Report `message` through error(), but only for parsers in strict mode;
// non-strict parsers carry on silently.
// NOTE(review): the parser must be the first argument. Some call sites in
// write() pass only the message string, in which case `parser.strict` is
// undefined and the failure is silently dropped.
function strictFail (parser, message) {
  if (!parser.strict) return;
  error(parser, message);
}
// Start a fresh tag record from the tag name collected so far.
// Outside strict mode the name is first case-folded with the string method
// named by parser.tagCase, and the folded name is written back to the parser.
function newTag (parser) {
  var name = parser.tagName;
  if (!parser.strict) {
    name = name[parser.tagCase]();
    parser.tagName = name;
  }
  parser.tag = { name : name, attributes : {} };
}
// Complete the opening tag under construction: mark that a root element has
// been seen, push the tag on the open-tag stack, emit "onopentag", clear the
// name/attribute scratch buffers and go back to reading text.
function openTag (parser) {
  var tag = parser.tag;
  parser.sawRoot = true;
  parser.tags.push(tag);
  emitNode(parser, "onopentag", tag);

  parser.attribValue = "";
  parser.attribName = "";
  parser.tagName = "";
  parser.state = S.TEXT;
}
// Handle a complete closing tag whose name is in parser.tagName.
//
// - Empty name ("</>"): strict failure; the literal text is kept.
// - Name matches no open tag: strict failure; the tag is kept as literal text
//   and the open-tag stack is left untouched.
// - Otherwise every tag down to and including the nearest match is popped and
//   an "onclosetag" is emitted for each, innermost first. Tags closed
//   implicitly on the way are reported as "Unexpected close tag." in strict
//   mode.
//
// The target name is computed once, before unwinding, so that closing an
// outer element over unclosed children stops at the right stack entry.
function closeTag (parser) {
  if (!parser.tagName) {
    strictFail(parser, "Weird empty close tag.");
    parser.textNode += "</>";
    parser.state = S.TEXT;
    return;
  }

  var rawName = parser.tagName;
  if (!parser.strict) parser.tagName = parser.tagName[parser.tagCase]();
  var closeTo = parser.tagName;

  // Locate the innermost open tag with the requested name.
  var matchIndex = parser.tags.length - 1;
  while (matchIndex >= 0 && parser.tags[matchIndex].name !== closeTo) {
    matchIndex--;
  }

  if (matchIndex < 0) {
    strictFail(parser, "Unmatched closing tag: " + rawName);
    parser.textNode += "</" + rawName + ">";
    parser.tagName = parser.attribValue = parser.attribName = "";
    parser.state = S.TEXT;
    return;
  }

  // Unwind the stack down to (and including) the matching tag.
  while (parser.tags.length > matchIndex) {
    var close = parser.tags.pop();
    if (closeTo !== close.name) strictFail(parser, "Unexpected close tag.");
    parser.tagName = close.name;
    emitNode(parser, "onclosetag", parser.tagName);
  }

  if (parser.tags.length === 0) parser.closedRoot = true;
  parser.tagName = parser.attribValue = parser.attribName = "";
  parser.state = S.TEXT;
}
// Resolve the entity name collected in parser.entity (the text between "&"
// and ";") to its replacement text.
//
// - Named entities are looked up, lower-cased, in parser.ENTITIES.
// - "#NNN" (decimal) and "#xHHH" (hex) are numeric character references;
//   leading zeros are accepted and code points above U+FFFF are returned as a
//   surrogate pair.
// - Anything else is a strict failure and the original "&name;" is returned
//   unchanged.
function parseEntity (parser) {
  var entity = parser.entity.toLowerCase();

  // ENTITIES inherits from a plain object, so names such as "constructor"
  // would otherwise resolve to members of Object.prototype.
  var named = parser.ENTITIES[entity];
  if (named && named !== Object.prototype[entity]) return named;

  var num = NaN;
  var digits;
  if (entity.charAt(0) === "#") {
    if (entity.charAt(1) === "x") {
      digits = entity.slice(2);
      if (/^[0-9a-f]+$/.test(digits)) num = parseInt(digits, 16);
    } else {
      digits = entity.slice(1);
      if (/^[0-9]+$/.test(digits)) num = parseInt(digits, 10);
    }
  }

  // Not a number at all, or beyond the last Unicode code point.
  if (isNaN(num) || num > 0x10FFFF) {
    strictFail(parser, "Invalid character entity");
    return "&" + parser.entity + ";";
  }

  if (num > 0xFFFF) {
    // String.fromCharCode works on 16-bit units; encode as a surrogate pair.
    var offset = num - 0x10000;
    return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
  }
  return String.fromCharCode(num);
}
193 function write (chunk
) {
195 if (this.error
) throw this.error
;
196 if (parser
.closed
) return error(parser
,
197 "Cannot write after close. Assign an onready handler.");
198 if (chunk
=== null) return end(parser
);
200 while (parser
.c
= c
= chunk
.charAt(i
++)) {
205 } else parser
.column
++;
206 switch (parser
.state
) {
208 if (c
=== "<") parser
.state
= S
.OPEN_WAKA
;
209 else if (not(whitespace
,c
)) {
210 // have to process this as a text node.
211 // weird, but happens.
212 strictFail(parser
, "Non-whitespace before first tag.");
218 if (c
=== "<") parser
.state
= S
.OPEN_WAKA
;
220 if (not(whitespace
, c
) && (!parser
.sawRoot
|| parser
.closedRoot
))
221 strictFail("Text data outside of root node.");
222 if (c
=== "&") parser
.state
= S
.TEXT_ENTITY
;
223 else parser
.textNode
+= c
;
227 // either a /, ?, !, or text is coming next.
229 parser
.state
= S
.SGML_DECL
;
230 parser
.sgmlDecl
= "";
231 } else if (is(whitespace
, c
)) {
233 } else if (is(nameStart
,c
)) {
234 parser
.state
= S
.OPEN_TAG
;
236 } else if (c
=== "/") {
237 parser
.state
= S
.CLOSE_TAG
;
239 } else if (c
=== "?") {
240 parser
.state
= S
.PROC_INST
;
241 parser
.procInstName
= parser
.procInstBody
= "";
243 strictFail(parser
, "Unencoded <");
244 parser
.textNode
+= "<" + c
;
245 parser
.state
= S
.TEXT
;
249 if ((parser
.sgmlDecl
+c
).toUpperCase() === CDATA
) {
250 parser
.state
= S
.CDATA
;
251 parser
.sgmlDecl
= "";
253 } else if (parser
.sgmlDecl
+c
=== "--") {
254 parser
.state
= S
.COMMENT
;
256 parser
.sgmlDecl
= "";
257 } else if ((parser
.sgmlDecl
+c
).toUpperCase() === DOCTYPE
) {
258 parser
.state
= S
.DOCTYPE
;
259 if (parser
.doctype
|| parser
.sawRoot
) strictFail(parser
,
260 "Inappropriately located doctype declaration");
262 parser
.sgmlDecl
= "";
263 } else if (c
=== ">") {
264 emitNode(parser
, "onsgmldeclaration", parser
.sgmlDecl
);
265 parser
.sgmlDecl
= "";
266 parser
.state
= S
.TEXT
;
267 } else if (is(quote
, c
)) {
268 parser
.state
= S
.SGML_DECL_QUOTED
;
269 parser
.sgmlDecl
+= c
;
270 } else parser
.sgmlDecl
+= c
;
272 case S
.SGML_DECL_QUOTED
:
273 if (c
=== parser
.q
) {
274 parser
.state
= S
.SGML_DECL
;
277 parser
.sgmlDecl
+= c
;
281 parser
.state
= S
.TEXT
;
282 emitNode(parser
, "ondoctype", parser
.doctype
);
283 parser
.doctype
= true; // just remember that we saw it.
286 if (c
=== "[") parser
.state
= S
.DOCTYPE_DTD
;
287 else if (is(quote
, c
)) {
288 parser
.state
= S
.DOCTYPE_QUOTED
;
293 case S
.DOCTYPE_QUOTED
:
295 if (c
=== parser
.q
) {
297 parser
.state
= S
.DOCTYPE
;
302 if (c
=== "]") parser
.state
= S
.DOCTYPE
;
303 else if (is(quote
,c
)) {
304 parser
.state
= S
.DOCTYPE_DTD_QUOTED
;
308 case S
.DOCTYPE_DTD_QUOTED
:
310 if (c
=== parser
.q
) {
311 parser
.state
= S
.DOCTYPE_DTD
;
316 if (c
=== "-") parser
.state
= S
.COMMENT_ENDING
;
317 else parser
.comment
+= c
;
319 case S
.COMMENT_ENDING
:
321 parser
.state
= S
.COMMENT_ENDED
;
322 parser
.comment
= textopts(parser
.opt
, parser
.comment
);
323 if (parser
.comment
) emitNode(parser
, "oncomment", parser
.comment
);
326 strictFail(parser
, "Invalid comment");
327 parser
.comment
+= "-" + c
;
330 case S
.COMMENT_ENDED
:
331 if (c
!== ">") strictFail(parser
, "Malformed comment");
332 else parser
.state
= S
.TEXT
;
335 if (c
=== "]") parser
.state
= S
.CDATA_ENDING
;
336 else parser
.cdata
+= c
;
339 if (c
=== "]") parser
.state
= S
.CDATA_ENDING_2
;
341 parser
.cdata
+= "]" + c
;
342 parser
.state
= S
.CDATA
;
345 case S
.CDATA_ENDING_2
:
347 emitNode(parser
, "oncdata", parser
.cdata
);
349 parser
.state
= S
.TEXT
;
351 parser
.cdata
+= "]]" + c
;
352 parser
.state
= S
.CDATA
;
356 if (c
=== "?") parser
.state
= S
.PROC_INST_ENDING
;
357 else if (is(whitespace
, c
)) parser
.state
= S
.PROC_INST_BODY
;
358 else parser
.procInstName
+= c
;
360 case S
.PROC_INST_BODY
:
361 if (!parser
.procInstBody
&& is(whitespace
, c
)) continue;
362 else if (c
=== "?") parser
.state
= S
.PROC_INST_ENDING
;
363 else if (is(quote
, c
)) {
364 parser
.state
= S
.PROC_INST_QUOTED
;
366 parser
.procInstBody
+= c
;
367 } else parser
.procInstBody
+= c
;
369 case S
.PROC_INST_ENDING
:
371 emitNode(parser
, "onprocessinginstruction", {
372 name
: parser
.procInstName
,
373 body
: parser
.procInstBody
375 parser
.procInstName
= parser
.procInstBody
= "";
376 parser
.state
= S
.TEXT
;
378 parser
.procInstBody
+= "?" + c
;
379 parser
.state
= S
.PROC_INST_BODY
;
382 case S
.PROC_INST_QUOTED
:
383 parser
.procInstBody
+= c
;
384 if (c
=== parser
.q
) {
385 parser
.state
= S
.PROC_INST_BODY
;
390 if (is(nameBody
, c
)) parser
.tagName
+= c
;
393 if (c
=== ">") openTag(parser
);
394 else if (c
=== "/") parser
.state
= S
.OPEN_TAG_SLASH
;
396 if (not(whitespace
, c
)) strictFail(
397 parser
, "Invalid character in tag name");
398 parser
.state
= S
.ATTRIB
;
402 case S
.OPEN_TAG_SLASH
:
407 strictFail(parser
, "Forward-slash in opening tag not followed by >");
408 parser
.state
= S
.ATTRIB
;
412 // haven't read the attribute name yet.
413 if (is(whitespace
, c
)) continue;
414 else if (c
=== ">") openTag(parser
);
415 else if (is(nameStart
, c
)) {
416 parser
.attribName
= c
;
417 parser
.attribValue
= "";
418 parser
.state
= S
.ATTRIB_NAME
;
419 } else strictFail(parser
, "Invalid attribute name");
422 if (c
=== "=") parser
.state
= S
.ATTRIB_VALUE
;
423 else if (is(whitespace
, c
)) parser
.state
= S
.ATTRIB_NAME_SAW_WHITE
;
424 else if (is(nameBody
, c
)) parser
.attribName
+= c
;
425 else strictFail(parser
, "Invalid attribute name");
427 case S
.ATTRIB_NAME_SAW_WHITE
:
428 if (c
=== "=") parser
.state
= S
.ATTRIB_VALUE
;
429 else if (is(whitespace
, c
)) continue;
431 strictFail(parser
, "Attribute without value");
432 parser
.tag
.attributes
[parser
.attribName
] = "";
433 parser
.attribValue
= "";
434 emitNode(parser
, "onattribute", { name
: parser
.attribName
, value
: "" });
435 parser
.attribName
= "";
436 if (c
=== ">") openTag(parser
);
437 else if (is(nameStart
, c
)) {
438 parser
.attribName
= c
;
439 parser
.state
= S
.ATTRIB_NAME
;
441 strictFail(parser
, "Invalid attribute name");
442 parser
.state
= S
.ATTRIB
;
449 parser
.state
= S
.ATTRIB_VALUE_QUOTED
;
451 strictFail(parser
, "Unquoted attribute value");
452 parser
.state
= S
.ATTRIB_VALUE_UNQUOTED
;
453 parser
.attribValue
= c
;
456 case S
.ATTRIB_VALUE_QUOTED
:
457 if (c
!== parser
.q
) {
458 if (c
=== "&") parser
.state
= S
.ATTRIB_VALUE_ENTITY_Q
;
459 else parser
.attribValue
+= c
;
462 parser
.tag
.attributes
[parser
.attribName
] = parser
.attribValue
;
463 emitNode(parser
, "onattribute", {
464 name
:parser
.attribName
, value
:parser
.attribValue
});
465 parser
.attribName
= parser
.attribValue
= "";
467 parser
.state
= S
.ATTRIB
;
469 case S
.ATTRIB_VALUE_UNQUOTED
:
470 if (not(whitespace
+">",c
)) {
471 if (c
=== "&") parser
.state
= S
.ATTRIB_VALUE_ENTITY_U
;
472 else parser
.attribValue
+= c
;
475 emitNode(parser
, "onattribute", {
476 name
:parser
.attribName
, value
:parser
.attribValue
});
477 parser
.attribName
= parser
.attribValue
= "";
478 if (c
=== ">") openTag(parser
);
479 else parser
.state
= S
.ATTRIB
;
482 if (!parser
.tagName
) {
483 if (is(whitespace
, c
)) continue;
484 else if (not(nameStart
, c
)) strictFail(parser
,
485 "Invalid tagname in closing tag.");
486 else parser
.tagName
= c
;
488 else if (c
=== ">") closeTag(parser
);
489 else if (is(nameBody
, c
)) parser
.tagName
+= c
;
491 if (not(whitespace
, c
)) strictFail(parser
,
492 "Invalid tagname in closing tag");
493 parser
.state
= S
.CLOSE_TAG_SAW_WHITE
;
496 case S
.CLOSE_TAG_SAW_WHITE
:
497 if (is(whitespace
, c
)) continue;
498 if (c
=== ">") closeTag(parser
);
499 else strictFail("Invalid characters in closing tag");
502 case S
.ATTRIB_VALUE_ENTITY_Q
:
503 case S
.ATTRIB_VALUE_ENTITY_U
:
504 switch(parser
.state
) {
506 var returnState
= S
.TEXT
, buffer
= "textNode";
508 case S
.ATTRIB_VALUE_ENTITY_Q
:
509 var returnState
= S
.ATTRIB_VALUE_QUOTED
, buffer
= "attribValue";
511 case S
.ATTRIB_VALUE_ENTITY_U
:
512 var returnState
= S
.ATTRIB_VALUE_UNQUOTED
, buffer
= "attribValue";
516 parser
[buffer
] += parseEntity(parser
);
518 parser
.state
= returnState
;
520 else if (is(entity
, c
)) parser
.entity
+= c
;
522 strictFail("Invalid character entity");
523 parser
[buffer
] += "&" + parser
.entity
;
525 parser
.state
= returnState
;
529 throw "Unknown state: " + parser
.state
;