// initial commit
// [JIRC.git] / node_modules / jsdom / example / pure / sax.js
// `sax` is an alias for this module's `exports` object: every property
// assigned to it in this file becomes part of the public API.
var sax = exports;

// Convenience factory so callers can create a parser without `new`.
sax.parser = function (strict, opt) {
  return new SAXParser(strict, opt);
};

// The constructor itself is exported as well.
sax.SAXParser = SAXParser;
4
/**
 * Parser state holder. Also used by end() to re-initialise an existing
 * instance in place (via SAXParser.call(parser, ...)).
 *
 * @param {*} strict - coerced to a boolean and stored as `this.strict`.
 * @param {Object} [opt] - options object; `lowercasetags` is read here, the
 *   rest is kept on `this.opt` for other functions in this file.
 * @returns {SAXParser|undefined} a new instance when invoked without `new`.
 */
function SAXParser (strict, opt) {
  // Tolerate a missing `new`. Re-initialisation through SAXParser.call() on an
  // existing instance passes this check and falls through.
  if (!(this instanceof SAXParser)) return new SAXParser(strict, opt);

  // Every string scratch buffer starts out empty.
  this.c = this.comment = this.sgmlDecl =
    this.textNode = this.tagName = this.doctype =
    this.procInstName = this.procInstBody = this.entity =
    this.attribName = this.attribValue = this.q =
    this.cdata = "";

  this.opt = opt || {};
  // Name of the String method used to fold tag names.
  this.tagCase = this.opt.lowercasetags ? "toLowerCase" : "toUpperCase";
  this.tags = [];
  this.closed = this.closedRoot = this.sawRoot = false;
  this.tag = this.error = null;
  this.strict = !!strict;
  this.state = S.BEGIN;
  // Per-parser entity table that inherits the shared defaults, so entries
  // added on one parser do not leak into sax.ENTITIES.
  this.ENTITIES = Object.create(sax.ENTITIES);

  // just for error reporting
  this.position = this.line = this.column = 0;
  emit(this, "onready");
}
// Instance methods. Both resume() and write() return the parser.
SAXParser.prototype = {
  // Feed a chunk of text to the parser; see write().
  write : write,

  // Clear the stored error so that write() stops re-throwing it.
  resume : function () {
    this.error = null;
    return this;
  },

  // Signal end of input: write() treats a null chunk as "finish up".
  close : function () {
    return this.write(null);
  }
};
29
// character classes and tokens
// "\r" is included so that CRLF input is treated as whitespace.
var whitespace = "\r\n\t ",
    // this really needs to be replaced with character classes.
    // XML allows all manner of ridiculous numbers and digits.
    number = "0123456789",
    letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
    // (Letter | '_' | ':')
    nameStart = letter + "_:",
    nameBody = nameStart + number + "-.",
    quote = "'\"",
    // characters allowed between "&" and ";"
    entity = number + letter + "#",
    CDATA = "[CDATA[",
    DOCTYPE = "DOCTYPE";

// True when character `c` occurs in the string `charclass`.
function is (charclass, c) { return charclass.indexOf(c) !== -1; }

// Negation of is().
function not (charclass, c) { return !is(charclass, c); }
45
// Tokenizer states. `S` is used as a counter here so that each state gets a
// distinct small integer in declaration order.
var S = 0;
sax.STATE = {
  BEGIN                 : S++, // start of input, nothing but whitespace seen yet
  TEXT                  : S++, // general stuff
  TEXT_ENTITY           : S++, // &amp and such, inside text
  OPEN_WAKA             : S++, // <
  SGML_DECL             : S++, // <!BLARG
  SGML_DECL_QUOTED      : S++, // <!BLARG foo "bar
  DOCTYPE               : S++, // <!DOCTYPE
  DOCTYPE_QUOTED        : S++, // <!DOCTYPE "//blah
  DOCTYPE_DTD           : S++, // <!DOCTYPE "//blah" [ ...
  DOCTYPE_DTD_QUOTED    : S++, // <!DOCTYPE "//blah" [ "foo
  COMMENT_STARTING      : S++, // <!-
  COMMENT               : S++, // <!--
  COMMENT_ENDING        : S++, // <!-- blah -
  COMMENT_ENDED         : S++, // <!-- blah --
  CDATA                 : S++, // <![CDATA[ something
  CDATA_ENDING          : S++, // ]
  CDATA_ENDING_2        : S++, // ]]
  PROC_INST             : S++, // <?hi
  PROC_INST_BODY        : S++, // <?hi there
  PROC_INST_QUOTED      : S++, // <?hi "there
  PROC_INST_ENDING      : S++, // <?hi there ?
  OPEN_TAG              : S++, // <strong
  OPEN_TAG_SLASH        : S++, // <strong /
  ATTRIB                : S++, // <a
  ATTRIB_NAME           : S++, // <a foo
  ATTRIB_NAME_SAW_WHITE : S++, // <a foo _
  ATTRIB_VALUE          : S++, // <a foo=
  ATTRIB_VALUE_QUOTED   : S++, // <a foo="bar
  ATTRIB_VALUE_UNQUOTED : S++, // <a foo=bar
  ATTRIB_VALUE_ENTITY_Q : S++, // <foo bar="&quot;"
  ATTRIB_VALUE_ENTITY_U : S++, // <foo bar=&quot;
  CLOSE_TAG             : S++, // </a
  CLOSE_TAG_SAW_WHITE   : S++  // </a   >
};
82
// Default named entities. Each parser gets its own table that inherits from
// this one (see the SAXParser constructor).
sax.ENTITIES = {
  "apos" : "'",
  "quot" : '"',
  "amp"  : "&",
  "gt"   : ">",
  "lt"   : "<"
};
90
// Make sax.STATE two-way: besides NAME -> number it also maps number -> NAME.
// The key list is snapshotted by Object.keys() first, so the numeric keys
// added inside the loop can never be visited and turned back into entries.
Object.keys(sax.STATE).forEach(function (name) {
  sax.STATE[sax.STATE[name]] = name;
});

// shorthand
var S = sax.STATE;

// Event names, for discoverability. The functions in this file look handlers
// up on the parser as "on" + name (e.g. "ontext", "onopentag").
sax.EVENTS = [
  "text", "processinginstruction", "sgmldeclaration",
  "doctype", "comment", "attribute", "opentag", "closetag",
  "cdata", "error", "end", "ready"
];
99
/**
 * Invoke the handler stored on the parser under `event`, if one is set.
 *
 * @param {Object} parser - object carrying the handler properties.
 * @param {string} event - property name of the handler, e.g. "ontext".
 * @param {*} [data] - single argument passed to the handler.
 */
function emit (parser, event, data) {
  // Called as a method so that `this` inside the handler is the parser.
  if (parser[event]) parser[event](data);
}
/**
 * Emit a node-level event, first flushing any buffered text so that the text
 * event is delivered before the node that follows it.
 *
 * @param {Object} parser
 * @param {string} nodeType - handler property name, e.g. "onopentag".
 * @param {*} [data] - payload for the handler.
 */
function emitNode (parser, nodeType, data) {
  if (parser.textNode) closeText(parser);
  emit(parser, nodeType, data);
}
/**
 * Flush the buffered text node: apply the text options, emit "ontext" when
 * anything is left, then empty the buffer.
 *
 * @param {Object} parser
 */
function closeText (parser) {
  parser.textNode = textopts(parser.opt, parser.textNode);
  // Text that the options reduce to "" is dropped without an event.
  if (parser.textNode) emit(parser, "ontext", parser.textNode);
  parser.textNode = "";
}
/**
 * Apply the text-related options to a string.
 *
 * @param {Object} opt - `trim` strips leading/trailing whitespace;
 *   `normalize` collapses every whitespace run to a single space.
 * @param {string} text
 * @returns {string} the processed text.
 */
function textopts (opt, text) {
  // Trimming runs first, so normalising never leaves a lone edge space behind
  // when both options are set.
  if (opt.trim) text = text.trim();
  if (opt.normalize) text = text.replace(/\s+/g, " ");
  return text;
}
/**
 * Record and report a parse error. Buffered text is flushed first, the
 * message is extended with the current line, column and character, and the
 * resulting Error is stored on `parser.error` and passed to "onerror".
 *
 * @param {Object} parser
 * @param {string} er - error message.
 * @returns {Object} the parser.
 */
function error (parser, er) {
  closeText(parser);
  er += "\nLine: " + parser.line +
        "\nColumn: " + parser.column +
        "\nChar: " + parser.c;
  er = new Error(er);
  // write() throws this stored error on its next call until resume() clears it.
  parser.error = er;
  emit(parser, "onerror", er);
  return parser;
}
/**
 * Finish parsing: report an error unless the parser is in the TEXT state,
 * flush buffered text, emit "onend", then reset the parser for reuse.
 *
 * @param {Object} parser
 * @returns {Object} the parser.
 */
function end (parser) {
  if (parser.state !== S.TEXT) error(parser, "Unexpected end");
  closeText(parser);
  parser.c = "";
  // `closed` is true only while the "onend" handler runs; the constructor
  // call below resets it to false.
  parser.closed = true;
  emit(parser, "onend");
  // Re-run the constructor on the same object so the parser can be reused.
  SAXParser.call(parser, parser.strict, parser.opt);
  return parser;
}
/**
 * Report `message` as an error, but only when the parser is in strict mode;
 * otherwise the problem is ignored.
 *
 * @param {Object} parser
 * @param {string} message
 */
function strictFail (parser, message) {
  if (parser.strict) error(parser, message);
}
/**
 * Start a new tag record from the name collected in `parser.tagName`.
 * Outside strict mode the name is first case-folded with the String method
 * named by `parser.tagCase`.
 *
 * @param {Object} parser
 */
function newTag (parser) {
  if (!parser.strict) parser.tagName = parser.tagName[parser.tagCase]();
  parser.tag = { name : parser.tagName, attributes : {} };
}
/**
 * Finish an opening tag: push it on the open-element stack, emit
 * "onopentag", clear the tag scratch state and return to TEXT.
 *
 * @param {Object} parser
 */
function openTag (parser) {
  parser.sawRoot = true;
  parser.tags.push(parser.tag);
  emitNode(parser, "onopentag", parser.tag);
  parser.tag = null;
  parser.tagName = parser.attribName = parser.attribValue = "";
  parser.state = S.TEXT;
}
/**
 * Handle a complete closing tag whose name is in `parser.tagName`.
 *
 * Every element opened after the matching one is closed too, innermost
 * first, with "onclosetag" emitted for each. A closing tag that matches no
 * open element is reported in strict mode and otherwise kept as literal text;
 * the open-element stack is left untouched in that case.
 *
 * @param {Object} parser
 */
function closeTag (parser) {
  if (!parser.tagName) {
    strictFail(parser, "Weird empty close tag.");
    parser.textNode += "</>";
    parser.state = S.TEXT;
    return;
  }

  // Fix the target name once, up front. Re-reading it from parser.tagName
  // after each pop would compare against the element just closed.
  var closeTo = parser.tagName;
  if (!parser.strict) closeTo = closeTo[parser.tagCase]();

  // Locate the innermost open element with that name before touching the
  // stack, so an unmatched closing tag cannot unwind the whole document.
  var target = parser.tags.length - 1;
  while (target >= 0 && parser.tags[target].name !== closeTo) target--;

  if (target < 0) {
    strictFail(parser, "Unmatched closing tag: " + parser.tagName);
    parser.textNode += "</" + parser.tagName + ">";
    parser.tagName = "";
    parser.state = S.TEXT;
    return;
  }

  while (parser.tags.length > target) {
    var close = parser.tags.pop();
    // Elements closed implicitly on the way down are errors in strict mode.
    if (close.name !== closeTo) strictFail(parser, "Unexpected close tag.");
    parser.tag = close;
    parser.tagName = close.name;
    emitNode(parser, "onclosetag", parser.tagName);
  }

  if (parser.tags.length === 0) parser.closedRoot = true;
  parser.tagName = parser.attribValue = parser.attribName = "";
  parser.tag = null;
  parser.state = S.TEXT;
}
/**
 * Resolve the entity collected in `parser.entity` (the text between "&" and
 * ";") to its replacement string.
 *
 * Named entities are looked up, lower-cased, in `parser.ENTITIES`. Numeric
 * references ("#65", "#x41") are decoded, with leading zeros allowed and code
 * points above 0xFFFF returned as a surrogate pair. Anything else is reported
 * through strictFail() and returned unchanged as "&...;".
 *
 * @param {Object} parser
 * @returns {string}
 */
function parseEntity (parser) {
  var entity = parser.entity.toLowerCase();

  // Only string entries count: the table inherits from Object.prototype, so a
  // bare truthiness test would resolve "&constructor;" to a function.
  var named = parser.ENTITIES[entity];
  if (typeof named === "string") return named;

  var num = NaN, numStr = "";
  if (entity.charAt(0) === "#") {
    var radix = 10;
    if (entity.charAt(1) === "x") {
      radix = 16;
      entity = entity.slice(2);
    } else {
      entity = entity.slice(1);
    }
    // "&#065;" is as valid as "&#65;"; strip the zeros so the round-trip
    // comparison below still detects trailing garbage.
    entity = entity.replace(/^0+/, "");
    num = parseInt(entity, radix);
    numStr = num.toString(radix);
  }

  // The range test also rejects NaN: without it "&#nan;" round-trips through
  // "NaN" and would decode to U+0000.
  if (!(num >= 0 && num <= 0x10FFFF) || numStr.toLowerCase() !== entity) {
    strictFail(parser, "Invalid character entity");
    return "&" + parser.entity + ";";
  }

  if (num > 0xFFFF) {
    // String.fromCharCode truncates to 16 bits, so encode astral code points
    // as a UTF-16 surrogate pair by hand.
    num -= 0x10000;
    return String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
  }
  return String.fromCharCode(num);
}
192
// Store the attribute currently being parsed on the open tag, announce it,
// and reset the attribute scratch buffers.
function recordAttribute (parser, value) {
  parser.tag.attributes[parser.attribName] = value;
  emitNode(parser, "onattribute", { name : parser.attribName, value : value });
  parser.attribName = parser.attribValue = "";
}

/**
 * Feed a chunk of text to the parser (called with the parser as `this`).
 *
 * The chunk is consumed one character at a time by a state machine keyed on
 * `parser.state`; state is kept on the parser between calls, so a construct
 * may be split across chunks.
 *
 * @param {?string} chunk - text to parse, or null to finish (see end()).
 * @returns {Object} the parser.
 * @throws the Error stored in `this.error`, when a previous error has not
 *   been cleared; an Error when `parser.state` is not a known state.
 */
function write (chunk) {
  var parser = this;
  if (this.error) throw this.error;
  if (parser.closed) return error(parser,
    "Cannot write after close. Assign an onready handler.");
  if (chunk === null) return end(parser);
  var i = 0, c = "", returnState, buffer, selfClosingName;
  // charAt() yields "" past the end of the chunk, which ends the loop.
  while (parser.c = c = chunk.charAt(i++)) {
    parser.position ++;
    if (c === "\n") {
      parser.line ++;
      parser.column = 0;
    } else parser.column ++;
    switch (parser.state) {
      case S.BEGIN:
        if (c === "<") parser.state = S.OPEN_WAKA;
        else if (not(whitespace, c)) {
          // have to process this as a text node.
          // weird, but happens.
          strictFail(parser, "Non-whitespace before first tag.");
          parser.textNode = c;
          parser.state = S.TEXT;
        }
        continue;
      case S.TEXT:
        if (c === "<") parser.state = S.OPEN_WAKA;
        else {
          if (not(whitespace, c) && (!parser.sawRoot || parser.closedRoot))
            strictFail(parser, "Text data outside of root node.");
          if (c === "&") parser.state = S.TEXT_ENTITY;
          else parser.textNode += c;
        }
        continue;
      case S.OPEN_WAKA:
        // either a /, ?, !, or text is coming next.
        if (c === "!") {
          parser.state = S.SGML_DECL;
          parser.sgmlDecl = "";
        } else if (is(whitespace, c)) {
          // wait for it...
        } else if (is(nameStart, c)) {
          parser.state = S.OPEN_TAG;
          parser.tagName = c;
        } else if (c === "/") {
          parser.state = S.CLOSE_TAG;
          parser.tagName = "";
        } else if (c === "?") {
          parser.state = S.PROC_INST;
          parser.procInstName = parser.procInstBody = "";
        } else {
          strictFail(parser, "Unencoded <");
          parser.textNode += "<" + c;
          parser.state = S.TEXT;
        }
        continue;
      case S.SGML_DECL:
        if ((parser.sgmlDecl + c).toUpperCase() === CDATA) {
          parser.state = S.CDATA;
          parser.sgmlDecl = "";
          parser.cdata = "";
        } else if (parser.sgmlDecl + c === "--") {
          parser.state = S.COMMENT;
          parser.comment = "";
          parser.sgmlDecl = "";
        } else if ((parser.sgmlDecl + c).toUpperCase() === DOCTYPE) {
          parser.state = S.DOCTYPE;
          if (parser.doctype || parser.sawRoot) strictFail(parser,
            "Inappropriately located doctype declaration");
          parser.doctype = "";
          parser.sgmlDecl = "";
        } else if (c === ">") {
          emitNode(parser, "onsgmldeclaration", parser.sgmlDecl);
          parser.sgmlDecl = "";
          parser.state = S.TEXT;
        } else if (is(quote, c)) {
          parser.state = S.SGML_DECL_QUOTED;
          // Remember which quote opened the string so that
          // SGML_DECL_QUOTED can recognise the matching one.
          parser.q = c;
          parser.sgmlDecl += c;
        } else parser.sgmlDecl += c;
        continue;
      case S.SGML_DECL_QUOTED:
        if (c === parser.q) {
          parser.state = S.SGML_DECL;
          parser.q = "";
        }
        parser.sgmlDecl += c;
        continue;
      case S.DOCTYPE:
        if (c === ">") {
          parser.state = S.TEXT;
          emitNode(parser, "ondoctype", parser.doctype);
          parser.doctype = true; // just remember that we saw it.
        } else {
          parser.doctype += c;
          if (c === "[") parser.state = S.DOCTYPE_DTD;
          else if (is(quote, c)) {
            parser.state = S.DOCTYPE_QUOTED;
            parser.q = c;
          }
        }
        continue;
      case S.DOCTYPE_QUOTED:
        parser.doctype += c;
        if (c === parser.q) {
          parser.q = "";
          parser.state = S.DOCTYPE;
        }
        continue;
      case S.DOCTYPE_DTD:
        parser.doctype += c;
        if (c === "]") parser.state = S.DOCTYPE;
        else if (is(quote, c)) {
          parser.state = S.DOCTYPE_DTD_QUOTED;
          parser.q = c;
        }
        continue;
      case S.DOCTYPE_DTD_QUOTED:
        parser.doctype += c;
        if (c === parser.q) {
          parser.state = S.DOCTYPE_DTD;
          parser.q = "";
        }
        continue;
      case S.COMMENT:
        if (c === "-") parser.state = S.COMMENT_ENDING;
        else parser.comment += c;
        continue;
      case S.COMMENT_ENDING:
        if (c === "-") {
          parser.state = S.COMMENT_ENDED;
          parser.comment = textopts(parser.opt, parser.comment);
          if (parser.comment) emitNode(parser, "oncomment", parser.comment);
          parser.comment = "";
        } else {
          strictFail(parser, "Invalid comment");
          parser.comment += "-" + c;
        }
        continue;
      case S.COMMENT_ENDED:
        if (c !== ">") strictFail(parser, "Malformed comment");
        else parser.state = S.TEXT;
        continue;
      case S.CDATA:
        if (c === "]") parser.state = S.CDATA_ENDING;
        else parser.cdata += c;
        continue;
      case S.CDATA_ENDING:
        if (c === "]") parser.state = S.CDATA_ENDING_2;
        else {
          parser.cdata += "]" + c;
          parser.state = S.CDATA;
        }
        continue;
      case S.CDATA_ENDING_2:
        if (c === ">") {
          emitNode(parser, "oncdata", parser.cdata);
          parser.cdata = "";
          parser.state = S.TEXT;
        } else {
          parser.cdata += "]]" + c;
          parser.state = S.CDATA;
        }
        continue;
      case S.PROC_INST:
        if (c === "?") parser.state = S.PROC_INST_ENDING;
        else if (is(whitespace, c)) parser.state = S.PROC_INST_BODY;
        else parser.procInstName += c;
        continue;
      case S.PROC_INST_BODY:
        // skip whitespace between the name and the body
        if (!parser.procInstBody && is(whitespace, c)) continue;
        else if (c === "?") parser.state = S.PROC_INST_ENDING;
        else if (is(quote, c)) {
          parser.state = S.PROC_INST_QUOTED;
          parser.q = c;
          parser.procInstBody += c;
        } else parser.procInstBody += c;
        continue;
      case S.PROC_INST_ENDING:
        if (c === ">") {
          emitNode(parser, "onprocessinginstruction", {
            name : parser.procInstName,
            body : parser.procInstBody
          });
          parser.procInstName = parser.procInstBody = "";
          parser.state = S.TEXT;
        } else {
          parser.procInstBody += "?" + c;
          parser.state = S.PROC_INST_BODY;
        }
        continue;
      case S.PROC_INST_QUOTED:
        parser.procInstBody += c;
        if (c === parser.q) {
          parser.state = S.PROC_INST_BODY;
          parser.q = "";
        }
        continue;
      case S.OPEN_TAG:
        if (is(nameBody, c)) parser.tagName += c;
        else {
          newTag(parser);
          if (c === ">") openTag(parser);
          else if (c === "/") parser.state = S.OPEN_TAG_SLASH;
          else {
            if (not(whitespace, c)) strictFail(
              parser, "Invalid character in tag name");
            parser.state = S.ATTRIB;
          }
        }
        continue;
      case S.OPEN_TAG_SLASH:
        if (c === ">") {
          // openTag() clears parser.tagName, but closeTag() needs it to know
          // which element to close, so carry the name across the call.
          selfClosingName = parser.tagName;
          openTag(parser);
          parser.tagName = selfClosingName;
          closeTag(parser);
        } else {
          strictFail(parser, "Forward-slash in opening tag not followed by >");
          parser.state = S.ATTRIB;
        }
        continue;
      case S.ATTRIB:
        // haven't read the attribute name yet.
        if (is(whitespace, c)) continue;
        else if (c === ">") openTag(parser);
        // self-closing tag that carries attributes: <a b="c"/>
        else if (c === "/") parser.state = S.OPEN_TAG_SLASH;
        else if (is(nameStart, c)) {
          parser.attribName = c;
          parser.attribValue = "";
          parser.state = S.ATTRIB_NAME;
        } else strictFail(parser, "Invalid attribute name");
        continue;
      case S.ATTRIB_NAME:
        if (c === "=") parser.state = S.ATTRIB_VALUE;
        else if (is(whitespace, c)) parser.state = S.ATTRIB_NAME_SAW_WHITE;
        else if (c === ">") {
          // <option selected> -- same treatment as the value-less
          // attribute branch in ATTRIB_NAME_SAW_WHITE.
          strictFail(parser, "Attribute without value");
          recordAttribute(parser, "");
          openTag(parser);
        }
        else if (is(nameBody, c)) parser.attribName += c;
        else strictFail(parser, "Invalid attribute name");
        continue;
      case S.ATTRIB_NAME_SAW_WHITE:
        if (c === "=") parser.state = S.ATTRIB_VALUE;
        else if (is(whitespace, c)) continue;
        else {
          strictFail(parser, "Attribute without value");
          recordAttribute(parser, "");
          if (c === ">") openTag(parser);
          else if (c === "/") parser.state = S.OPEN_TAG_SLASH;
          else if (is(nameStart, c)) {
            parser.attribName = c;
            parser.state = S.ATTRIB_NAME;
          } else {
            strictFail(parser, "Invalid attribute name");
            parser.state = S.ATTRIB;
          }
        }
        continue;
      case S.ATTRIB_VALUE:
        if (is(quote, c)) {
          parser.q = c;
          parser.state = S.ATTRIB_VALUE_QUOTED;
        } else {
          strictFail(parser, "Unquoted attribute value");
          parser.state = S.ATTRIB_VALUE_UNQUOTED;
          parser.attribValue = c;
        }
        continue;
      case S.ATTRIB_VALUE_QUOTED:
        if (c !== parser.q) {
          if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_Q;
          else parser.attribValue += c;
          continue;
        }
        recordAttribute(parser, parser.attribValue);
        parser.q = "";
        parser.state = S.ATTRIB;
        continue;
      case S.ATTRIB_VALUE_UNQUOTED:
        if (not(whitespace + ">", c)) {
          if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_U;
          else parser.attribValue += c;
          continue;
        }
        // Stored on the tag as well as emitted, like quoted values.
        recordAttribute(parser, parser.attribValue);
        if (c === ">") openTag(parser);
        else parser.state = S.ATTRIB;
        continue;
      case S.CLOSE_TAG:
        if (!parser.tagName) {
          if (is(whitespace, c)) continue;
          else if (not(nameStart, c)) strictFail(parser,
            "Invalid tagname in closing tag.");
          else parser.tagName = c;
        }
        else if (c === ">") closeTag(parser);
        else if (is(nameBody, c)) parser.tagName += c;
        else {
          if (not(whitespace, c)) strictFail(parser,
            "Invalid tagname in closing tag");
          parser.state = S.CLOSE_TAG_SAW_WHITE;
        }
        continue;
      case S.CLOSE_TAG_SAW_WHITE:
        if (is(whitespace, c)) continue;
        if (c === ">") closeTag(parser);
        else strictFail(parser, "Invalid characters in closing tag");
        continue;
      case S.TEXT_ENTITY:
      case S.ATTRIB_VALUE_ENTITY_Q:
      case S.ATTRIB_VALUE_ENTITY_U:
        // Pick the buffer the entity belongs to and the state to resume.
        switch (parser.state) {
          case S.TEXT_ENTITY:
            returnState = S.TEXT; buffer = "textNode";
            break;
          case S.ATTRIB_VALUE_ENTITY_Q:
            returnState = S.ATTRIB_VALUE_QUOTED; buffer = "attribValue";
            break;
          case S.ATTRIB_VALUE_ENTITY_U:
            returnState = S.ATTRIB_VALUE_UNQUOTED; buffer = "attribValue";
            break;
        }
        if (c === ";") {
          parser[buffer] += parseEntity(parser);
          parser.entity = "";
          parser.state = returnState;
        }
        else if (is(entity, c)) parser.entity += c;
        else {
          strictFail(parser, "Invalid character entity");
          parser[buffer] += "&" + parser.entity;
          parser.entity = "";
          parser.state = returnState;
          // `c` is not part of the entity. Rewind one character so that it is
          // processed again in the resumed state instead of being dropped
          // (it may be plain text, a closing quote, "<", ...). The counters
          // are undone here because the next pass increments them again.
          i --;
          parser.position --;
          if (c === "\n") parser.line --;
          else parser.column --;
        }
        continue;
      default:
        throw new Error("Unknown state: " + parser.state);
    }
  }
  return parser;
}
535