39c8b14f |
1 | /*********************************************** |
2 | Copyright 2010, Chris Winberry <chris@winberry.net>. All rights reserved. |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy |
4 | of this software and associated documentation files (the "Software"), to |
5 | deal in the Software without restriction, including without limitation the |
6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
7 | sell copies of the Software, and to permit persons to whom the Software is |
8 | furnished to do so, subject to the following conditions: |
9 | |
10 | The above copyright notice and this permission notice shall be included in |
11 | all copies or substantial portions of the Software. |
12 | |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
19 | IN THE SOFTWARE. |
20 | ***********************************************/ |
21 | /* v1.6.3 */ |
22 | |
23 | (function () { |
24 | |
/**
 * Reports whether the script looks like it is executing as a Node.js
 * (CommonJS) module, judged by the five identifiers Node injects into
 * every module scope.
 *
 * @returns {boolean} true only when require is a function, exports and
 *     module are objects, and __filename and __dirname are strings.
 */
function runningInNode () {
    //typeof never throws on an undeclared identifier, so each probe is safe
    if ((typeof require) != "function") {
        return(false);
    }
    if ((typeof exports) != "object" || (typeof module) != "object") {
        return(false);
    }
    return((typeof __filename) == "string" && (typeof __dirname) == "string");
}
38 | |
39 | if (!runningInNode()) { |
40 | if (!this.Tautologistics) |
41 | this.Tautologistics = {}; |
42 | else if (this.Tautologistics.NodeHtmlParser) |
43 | return; //NodeHtmlParser already defined! |
44 | this.Tautologistics.NodeHtmlParser = {}; |
45 | exports = this.Tautologistics.NodeHtmlParser; |
46 | } |
47 | |
//Enumeration of the element types that can appear in the DOM
var ElementType = {
    Text: "text", //Plain text
    Directive: "directive", //Special tag <!...>
    Comment: "comment", //Special tag <!--...-->
    Script: "script", //Special tag <script>...</script>
    Style: "style", //Special tag <style>...</style>
    Tag: "tag" //Any tag that isn't special
};
57 | |
/**
 * Creates a parser that reports everything it finds to the given handler.
 *
 * @param {Object} handler Object checked by validateHandler before it is stored.
 * @throws {Error} Whatever validateHandler throws for an unusable handler.
 */
function Parser (handler) {
    //Validate before storing so an unusable handler is never kept or reset
    this.validateHandler(handler);
    this._handler = handler;
    this.reset();
}

//**"Static"**//
//Stateless regular expressions (cleaning up and parsing)
//Leading/trailing whitespace
Parser._reTrim = /(^\s+|\s+$)/g;
//Comment tag markup at either end of a comment's contents
Parser._reTrimComment = /(^\!--|--$)/g;
//Any whitespace character, used as a split point
Parser._reWhitespace = /\s/g;
//Optional leading slash followed by the tag name of an element
Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/;

//Stateful regular expressions (global flag, so lastIndex carries over between calls)
//Attributes in a tag: name="value", name='value', name=value or a bare name
Parser._reAttrib =
    /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
//Tag markers
Parser._reTags = /[\<\>]/g;
75 | |
//**Public**//
//Methods//
/**
 * Parses a complete HTML document in one call: resets the parser, feeds
 * the data through parseChunk and then signals completion with done().
 *
 * @param {string} data The whole document.
 */
Parser.prototype.parseComplete = function Parser$parseComplete (data) {
    var parser = this;
    parser.reset();
    parser.parseChunk(data);
    parser.done();
};
84 | |
/**
 * Parses a piece of an HTML document: the data is appended to the buffer
 * of unparsed text and parseTags is run over the result.
 *
 * @param {string} data The next piece of the document.
 */
Parser.prototype.parseChunk = function Parser$parseChunk (data) {
    if (this._done) {
        //The error is reported through handleError; if that call returns,
        //the chunk is still buffered and parsed below
        this.handleError(new Error("Attempted to parse chunk after parsing already done"));
    }
    this._buffer = this._buffer + data; //FIXME: this can be a bottleneck
    this.parseTags();
};
92 | |
/**
 * Tells the parser that the HTML being parsed is complete. Any text still
 * sitting in the buffer becomes one final element, the queued elements are
 * written to the handler and the handler's done() is called. Calling it a
 * second time does nothing.
 */
Parser.prototype.done = function Parser$done () {
    if (this._done) {
        return;
    }
    this._done = true;

    //Push any unparsed text into a final element in the element list
    var leftover = this._buffer;
    if (leftover.length) {
        this._buffer = "";
        var state = this._parseState;
        var element = {
            raw: leftover,
            //Text keeps its whitespace; everything else is trimmed
            data: (state == ElementType.Text) ? leftover : leftover.replace(Parser._reTrim, ""),
            type: state
        };
        var isTagLike = (
            state == ElementType.Tag
            || state == ElementType.Script
            || state == ElementType.Style
        );
        if (isTagLike) {
            element.name = this.parseTagName(element.data);
        }
        this.parseAttribs(element);
        this._elements.push(element);
    }

    this.writeHandler();
    this._handler.done();
};
117 | |
/**
 * Resets the parser to a blank state, ready to parse a new HTML document,
 * and asks the handler to reset itself as well.
 */
Parser.prototype.reset = function Parser$reset () {
    var parser = this;
    parser._buffer = ""; //Nothing waiting to be parsed
    parser._done = false;
    parser._elements = []; //Nothing queued for the handler
    parser._elementsCurrent = 0;
    parser._current = 0;
    parser._next = 0;
    parser._parseState = ElementType.Text; //A document starts out as plain text
    parser._prevTagSep = '';
    parser._tagStack = []; //Not inside any script/comment/style tag
    parser._handler.reset();
};
131 | |
//**Private**//
//Properties//
//Prototype-level defaults; reset() gives each instance its own values
//Handler that receives the parsed elements
Parser.prototype._handler = null;
//Text received but not yet parsed
Parser.prototype._buffer = null;
//Set once done() has been called
Parser.prototype._done = false;
//Parsed elements waiting to be written to the handler
Parser.prototype._elements = null;
//Pointer to last element in _elements that has been processed
Parser.prototype._elementsCurrent = 0;
//Position in the buffer up to which data has already been parsed
Parser.prototype._current = 0;
//Position in the buffer of the next tag marker (<>)
Parser.prototype._next = 0;
//Type of the element currently being parsed
Parser.prototype._parseState = ElementType.Text;
//Tag marker found before the current one
Parser.prototype._prevTagSep = '';
//Stack of special element types (script/comment/style) the parser is
//currently inside of
Parser.prototype._tagStack = null;
146 | |
//Methods//
/**
 * Runs parseAttribs over every tag, script and style element of a list.
 *
 * @param {Array} elements Elements to process; modified in place.
 * @returns {Array} The same array that was passed in.
 */
Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) {
    for (var idx = 0, idxEnd = elements.length; idx < idxEnd; idx++) {
        var element = elements[idx];
        //ElementType.Style (capital S): the lower-case spelling is undefined,
        //which made style elements fall through this filter
        if (
            element.type == ElementType.Tag
            || element.type == ElementType.Script
            || element.type == ElementType.Style
        ) {
            this.parseAttribs(element);
        }
    }

    return(elements);
};
161 | |
/**
 * Adds an "attribs" object to a tag, script or style element for the
 * attributes found after the tag name in element.data. Elements of other
 * types, and elements with nothing after the tag name, are left alone.
 *
 * @param {Object} element Element whose data is inspected; modified in place.
 */
Parser.prototype.parseAttribs = function Parser$parseAttribs (element) {
    //Only parse attributes for tags
    var isTagLike = (
        element.type == ElementType.Script
        || element.type == ElementType.Style
        || element.type == ElementType.Tag
    );
    if (!isTagLike) {
        return;
    }

    //Everything after the first whitespace-delimited word is attribute text
    var tagName = element.data.split(Parser._reWhitespace, 1)[0];
    var attribRaw = element.data.substring(tagName.length);
    if (attribRaw.length < 1) {
        return;
    }

    //[name group, value group] for each alternative of _reAttrib, in the
    //order they are tried; a bare attribute uses its name as its value
    var groupPairs = [[1, 2], [3, 4], [5, 6], [7, 7]];
    var match;
    Parser._reAttrib.lastIndex = 0; //The regex is global, so always start from the beginning
    while (match = Parser._reAttrib.exec(attribRaw)) {
        if (element.attribs == undefined) {
            element.attribs = {};
        }

        for (var p = 0; p < groupPairs.length; p++) {
            var attribName = match[groupPairs[p][0]];
            if (typeof attribName == "string" && attribName.length) {
                element.attribs[attribName] = match[groupPairs[p][1]];
                break;
            }
        }
    }
};
190 | |
/**
 * Extracts the base tag name from the data value of an element.
 *
 * @param {string} data Element data, e.g. 'a href="x"' or "/div".
 * @returns {string} The tag name, prefixed with "/" for a closing tag, or
 *     "" when data is empty or no name can be found.
 */
Parser.prototype.parseTagName = function Parser$parseTagName (data) {
    var hasData = !(data == null || data == "");
    var match = hasData ? Parser._reTagName.exec(data) : null;
    if (!match) {
        return("");
    }
    var prefix = match[1] ? "/" : "";
    return(prefix + match[2]);
};
200 | |
/**
 * Parses through the buffered HTML text and appends the elements found to
 * the element list. The buffer is cut at every tag marker (< or >); the
 * text between two markers becomes one element whose type depends on the
 * current parse state and on whether the parser is inside a script, style
 * or comment section. Text after the last marker stays in the buffer.
 * Finishes by calling writeHandler().
 *
 * I admit, this function is rather large but splitting up had an noticeable impact on speed
 */
Parser.prototype.parseTags = function Parser$parseTags () {
    var bufferEnd = this._buffer.length - 1;
    var prevElement; //Last queued element; always looked up right before it is used
    var rawLen;

    //_reTags is global (stateful) and shared; make sure the scan starts at the front of the buffer
    Parser._reTags.lastIndex = 0;
    while (Parser._reTags.test(this._buffer)) {
        this._next = Parser._reTags.lastIndex - 1;
        var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
        var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse

        //A new element to eventually be appended to the element list
        var element = {
            raw: rawData,
            data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, ""),
            type: this._parseState
        };

        var elementName = this.parseTagName(element.data);

        //This section inspects the current tag stack and modifies the current
        //element if we're actually parsing a special area (script/comment/style tag)
        if (this._tagStack.length) { //We're parsing inside a script/comment/style tag
            if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag
                if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack
                    this._tagStack.pop();
                else { //Not a closing script tag
                    if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
                        //All data from here to script close is now a text element
                        element.type = ElementType.Text;
                        //If the previous element is text, append the current text to it
                        if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
                            prevElement = this._elements[this._elements.length - 1];
                            prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
                            element.raw = element.data = ""; //This causes the current element to not be added to the element list
                        }
                    }
                }
            }
            else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag
                if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack
                    this._tagStack.pop();
                else {
                    if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
                        //All data from here to style close is now a text element
                        element.type = ElementType.Text;
                        //If the previous element is text, append the current text to it
                        if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
                            //Look the previous element up before BOTH branches; the
                            //empty-element branch used to see a stale or undefined value
                            prevElement = this._elements[this._elements.length - 1];
                            if (element.raw != "") {
                                prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
                                element.raw = element.data = ""; //This causes the current element to not be added to the element list
                            }
                            else { //Element is empty, so just append the last tag marker found
                                prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep;
                            }
                        }
                        else //The previous element was not text
                            if (element.raw != "")
                                element.raw = element.data = element.raw;
                    }
                }
            }
            else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag
                rawLen = element.raw.length;
                if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") {
                    //Actually, we're no longer in a comment tag, so pop it off the stack
                    this._tagStack.pop();
                    //If the previous element is a comment, append the current text to it
                    if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
                        prevElement = this._elements[this._elements.length - 1];
                        prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
                        element.raw = element.data = ""; //This causes the current element to not be added to the element list
                        element.type = ElementType.Text;
                    }
                    else //Previous element not a comment
                        element.type = ElementType.Comment; //Change the current element's type to a comment
                }
                else { //Still in a comment tag
                    element.type = ElementType.Comment;
                    //If the previous element is a comment, append the current text to it
                    if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
                        prevElement = this._elements[this._elements.length - 1];
                        prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep;
                        element.raw = element.data = ""; //This causes the current element to not be added to the element list
                        element.type = ElementType.Text;
                    }
                    else
                        element.raw = element.data = element.raw + tagSep;
                }
            }
        }

        //Processing of non-special tags
        if (element.type == ElementType.Tag) {
            element.name = elementName;

            if (element.raw.indexOf("!--") == 0) { //This tag is really comment
                element.type = ElementType.Comment;
                delete element["name"];
                rawLen = element.raw.length;
                //Check if the comment is terminated in the current element
                if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">")
                    element.raw = element.data = element.raw.replace(Parser._reTrimComment, "");
                else { //It's not so push the comment onto the tag stack
                    element.raw += tagSep;
                    this._tagStack.push(ElementType.Comment);
                }
            }
            else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
                element.type = ElementType.Directive;
                //TODO: what about CDATA?
            }
            else if (element.name == "script") {
                element.type = ElementType.Script;
                //Special tag, push onto the tag stack if not terminated
                if (element.data.charAt(element.data.length - 1) != "/")
                    this._tagStack.push(ElementType.Script);
            }
            else if (element.name == "/script")
                element.type = ElementType.Script;
            else if (element.name == "style") {
                element.type = ElementType.Style;
                //Special tag, push onto the tag stack if not terminated
                if (element.data.charAt(element.data.length - 1) != "/")
                    this._tagStack.push(ElementType.Style);
            }
            else if (element.name == "/style")
                element.type = ElementType.Style;
            //A closing tag carries nothing but its name
            if (element.name && element.name.charAt(0) == "/")
                element.data = element.name;
        }

        //Add all tags and non-empty text elements to the element list
        if (element.raw != "" || element.type != ElementType.Text) {
            this.parseAttribs(element);
            this._elements.push(element);
            //If tag self-terminates, add an explicit, separate closing tag
            if (
                element.type != ElementType.Text
                &&
                element.type != ElementType.Comment
                &&
                element.type != ElementType.Directive
                &&
                element.data.charAt(element.data.length - 1) == "/"
                )
                this._elements.push({
                    raw: "/" + element.name,
                    data: "/" + element.name,
                    name: "/" + element.name,
                    type: element.type
                });
        }
        //What follows "<" is tag content, what follows ">" is text
        this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text;
        this._current = this._next + 1;
        this._prevTagSep = tagSep;
    }

    //Keep only the text after the last tag marker for the next call
    this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : "";
    this._current = 0;

    this.writeHandler();
};
365 | |
/**
 * Checks that the handler is an object with the right "interface".
 *
 * @param {Object} handler Candidate handler; must provide the methods reset,
 *     done, writeTag, writeText, writeComment and writeDirective.
 * @throws {Error} "Handler is not an object" for null and non-objects, or
 *     "Handler method '<name>' is invalid" for the first missing method.
 */
Parser.prototype.validateHandler = function Parser$validateHandler (handler) {
    //typeof null is "object" too, so null needs its own check to get the
    //documented error instead of a TypeError on the property access below
    if (handler === null || (typeof handler) != "object")
        throw new Error("Handler is not an object");

    var required = ["reset", "done", "writeTag", "writeText", "writeComment", "writeDirective"];
    for (var i = 0; i < required.length; i++) {
        if ((typeof handler[required[i]]) != "function")
            throw new Error("Handler method '" + required[i] + "' is invalid");
    }
};
383 | |
/**
 * Writes the queued elements out to the handler, oldest first, emptying
 * the queue. Nothing is written while the parser is inside a
 * script/comment/style section unless forceFlush is truthy.
 *
 * @param {boolean} [forceFlush] Write even while the tag stack is not empty.
 */
Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) {
    forceFlush = !!forceFlush;
    var insideSpecial = this._tagStack.length > 0;
    if (insideSpecial && !forceFlush) {
        return;
    }
    while (this._elements.length) {
        var element = this._elements.shift();
        var kind = element.type;
        if (kind === ElementType.Comment) {
            this._handler.writeComment(element);
        } else if (kind === ElementType.Directive) {
            this._handler.writeDirective(element);
        } else if (kind === ElementType.Text) {
            this._handler.writeText(element);
        } else {
            //Tags, scripts and styles all go through writeTag
            this._handler.writeTag(element);
        }
    }
};
407 | |
/**
 * Reports an error: it is passed to the handler's error() method when the
 * handler has one, and thrown otherwise.
 *
 * @param {Error} error The error to report.
 * @throws {Error} The given error, when the handler has no error() method.
 */
Parser.prototype.handleError = function Parser$handleError (error) {
    if ((typeof this._handler.error) != "function") {
        throw error;
    }
    this._handler.error(error);
};
414 | |
//TODO: make this a truly streamable handler
/**
 * Handler for RSS/Atom feeds. Invokes the super constructor with a fixed
 * set of options: whitespace-only text is ignored, output is not verbose
 * and empty tags are not enforced.
 *
 * @param {Function} callback Passed through to the super constructor.
 */
function RssHandler (callback) {
    var feedOptions = { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false };
    RssHandler.super_.call(this, callback, feedOptions);
}
inherits(RssHandler, DefaultHandler);
420 | |
/**
 * Signals that parsing is done. When the top level of the parsed DOM holds
 * an <rss> or <feed> element, this.dom is replaced by a plain feed object
 * (type, id, title, link, description, updated, author, items) before the
 * super class's done() is invoked; otherwise this.dom is left untouched.
 * Every field is optional: a field whose source element cannot be found is
 * simply not set.
 */
RssHandler.prototype.done = function RssHandler$done () {
    var feed = { };
    var feedRoot;

    function toDate (text) {
        return(new Date(text));
    }

    //Copies the data of the first child of the first element called tagName
    //below parent into target[key]; best effort, target is untouched on failure
    function copyText (target, key, tagName, parent, recurse, convert) {
        try {
            var text = DomUtils.getElementsByTagName(tagName, parent.children, recurse)[0].children[0].data;
            target[key] = convert ? convert(text) : text;
        } catch (ex) { } //Deliberately ignored: the field is optional
    }

    //Same as copyText, but takes the href attribute of the element itself
    function copyHref (target, key, tagName, parent) {
        try {
            target[key] = DomUtils.getElementsByTagName(tagName, parent.children, false)[0].attribs.href;
        } catch (ex) { } //Deliberately ignored: the field is optional
    }

    var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
    if (found.length) {
        feedRoot = found[0];
    }
    if (feedRoot) {
        if (feedRoot.name == "rss") {
            feed.type = "rss";
            //<channel/> is normally the first child; look it up by name so that
            //a leading comment or a childless <rss/> cannot derail the parse
            var channels = DomUtils.getElementsByTagName("channel", feedRoot.children, false);
            if (channels.length) {
                feedRoot = channels[0];
            } else if (feedRoot.children && feedRoot.children.length) {
                feedRoot = feedRoot.children[0];
            } else {
                feedRoot = { };
            }
            feed.id = "";
            copyText(feed, "title", "title", feedRoot, false);
            copyText(feed, "link", "link", feedRoot, false);
            copyText(feed, "description", "description", feedRoot, false);
            copyText(feed, "updated", "lastBuildDate", feedRoot, false, toDate);
            copyText(feed, "author", "managingEditor", feedRoot, false);
            feed.items = [];
            DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item) {
                var entry = {};
                copyText(entry, "id", "guid", item, false);
                copyText(entry, "title", "title", item, false);
                copyText(entry, "link", "link", item, false);
                copyText(entry, "description", "description", item, false);
                copyText(entry, "pubDate", "pubDate", item, false, toDate);
                feed.items.push(entry);
            });
        } else {
            feed.type = "atom";
            copyText(feed, "id", "id", feedRoot, false);
            copyText(feed, "title", "title", feedRoot, false);
            copyHref(feed, "link", "link", feedRoot);
            copyText(feed, "description", "subtitle", feedRoot, false);
            copyText(feed, "updated", "updated", feedRoot, false, toDate);
            //The author's email may sit at any depth, hence the recursive search
            copyText(feed, "author", "email", feedRoot, true);
            feed.items = [];
            DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item) {
                var entry = {};
                copyText(entry, "id", "id", item, false);
                copyText(entry, "title", "title", item, false);
                copyHref(entry, "link", "link", item);
                copyText(entry, "description", "summary", item, false);
                copyText(entry, "pubDate", "updated", item, false, toDate);
                feed.items.push(entry);
            });
        }

        this.dom = feed;
    }
    RssHandler.super_.prototype.done.call(this);
};
515 | |
516 | /////////////////////////////////////////////////// |
517 | |
/**
 * Handler that builds a hierarchical DOM from the elements the parser
 * writes to it.
 *
 * @param {Function} [callback] Stored as the completion callback when it is a function.
 * @param {Object} [options] Used as-is (not copied); missing settings are
 *     filled in on this object: ignoreWhitespace (false), verbose (true),
 *     enforceEmptyTags (true).
 */
function DefaultHandler (callback, options) {
    this.reset();

    var defaults = {
        ignoreWhitespace: false, //Keep whitespace-only text nodes
        verbose: true, //Keep data property for tags and raw property for all
        enforceEmptyTags: true //Don't allow children for HTML tags defined as empty in spec
    };
    var settings = options ? options : { };
    for (var key in defaults) {
        //Loose comparison on purpose: null counts as "not set" as well
        if (settings[key] == undefined) {
            settings[key] = defaults[key];
        }
    }
    this._options = settings;

    if ((typeof callback) == "function") {
        this._callback = callback;
    }
}
530 | |
//**"Static"**//
//Names of the HTML tags that shouldn't contain child nodes
DefaultHandler._emptyTags = {
    area: 1,
    base: 1,
    basefont: 1,
    br: 1,
    col: 1,
    frame: 1,
    hr: 1,
    img: 1,
    input: 1,
    isindex: 1,
    link: 1,
    meta: 1,
    param: 1,
    embed: 1
};
//Matches text made up of nothing but whitespace (or nothing at all)
DefaultHandler.reWhitespace = /^\s*$/;
551 | |
//**Public**//
//Properties//
//The hierarchical object containing the parsed HTML
DefaultHandler.prototype.dom = null;
//Methods//
/**
 * Resets the handler back to its starting state: an empty DOM, not done,
 * and an empty tag stack. The tag stack is an array with an extra last()
 * method that returns its top entry, or null when the stack is empty.
 */
DefaultHandler.prototype.reset = function DefaultHandler$reset() {
    var stack = [];
    stack.last = function DefaultHandler$_tagStack$last () {
        if (!this.length) {
            return(null);
        }
        return(this[this.length - 1]);
    };
    this.dom = [];
    this._done = false;
    this._tagStack = stack;
};
//Signals the handler that parsing is done; the callback is invoked without an error
DefaultHandler.prototype.done = function DefaultHandler$done () {
    this._done = true;
    this.handleCallback(null);
};
//Receives a tag, script or style element from the parser
DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
    this.handleElement(element);
};
//Receives a text element; whitespace-only text is dropped when the
//ignoreWhitespace option is set
DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
    var skip = this._options.ignoreWhitespace && DefaultHandler.reWhitespace.test(element.data);
    if (skip) {
        return;
    }
    this.handleElement(element);
};
//Receives a comment element
DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) {
    this.handleElement(element);
};
//Receives a directive element
DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) {
    this.handleElement(element);
};
//Receives an error and forwards it to the callback handling
DefaultHandler.prototype.error = function DefaultHandler$error (error) {
    this.handleCallback(error);
};
588 | |
//**Private**//
//Properties//
//Prototype-level defaults
//Options controlling how the handler behaves
DefaultHandler.prototype._options = null;
//Function to call when parsing is done or an error is reported
DefaultHandler.prototype._callback = null;
//Set once the handler has been told that parsing is complete
DefaultHandler.prototype._done = false;
//Parents of the element currently being processed
DefaultHandler.prototype._tagStack = null;
//Methods//
/**
 * Reports the outcome to the callback as callback(error, dom). Without a
 * callback an error is thrown instead and success is silently ignored.
 *
 * @param {Error} error The error to report, or null when there is none.
 * @throws {Error} The given error, when there is no callback to take it.
 */
DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) {
    if ((typeof this._callback) == "function") {
        this._callback(error, this.dom);
        return;
    }
    if (error) {
        throw error;
    }
};
/**
 * Places an element in the DOM being built. Text, comments and directives
 * are appended to the element on top of the tag stack (or to the top level
 * when the stack is empty). An opening tag is appended the same way and
 * then pushed on the tag stack, unless it is an empty tag and empty tags
 * are enforced. A closing tag is not stored; it pops the stack down to the
 * parent of its matching opening tag, or is ignored when nothing matches.
 *
 * @param {Object} element Element written by the parser; with the verbose
 *     option off its raw property (and data, for tags) is deleted.
 */
DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) {
    var options = this._options;
    var tagStack = this._tagStack;

    //Whether name is a tag that may not have children. Only own entries of
    //_emptyTags count, so names such as "constructor" or "toString", which
    //every object inherits, are not mistaken for empty tags
    function isEmptyTag (name) {
        return(
            !!options.enforceEmptyTags
            && Object.prototype.hasOwnProperty.call(DefaultHandler._emptyTags, name)
            && !!DefaultHandler._emptyTags[name]
        );
    }

    //Adds child to the children of parentElement, creating the list on first use
    function appendChild (parentElement, child) {
        if (!parentElement.children)
            parentElement.children = [];
        parentElement.children.push(child);
    }

    if (this._done)
        this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));
    if (!options.verbose) {
//      element.raw = null; //FIXME: Not clean
        //FIXME: Serious performance problem using delete
        delete element.raw;
        if (element.type == "tag" || element.type == "script" || element.type == "style")
            delete element.data;
    }

    var isContainer = (
        element.type != ElementType.Text
        && element.type != ElementType.Comment
        && element.type != ElementType.Directive
    );
    var parent = tagStack.last();

    if (!parent) { //There are no parent elements
        if (!isContainer) {
            this.dom.push(element);
        }
        else if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag
            this.dom.push(element);
            if (!isEmptyTag(element.name)) //Don't add tags to the tag stack that can't have children
                tagStack.push(element);
        }
        return;
    }

    //There are parent elements
    if (!isContainer) {
        appendChild(parent, element);
        return;
    }
    if (element.name.charAt(0) == "/") {
        //This is a closing tag, scan the tagStack to find the matching opening tag
        //and pop the stack up to the opening tag's parent
        var baseName = element.name.substring(1);
        if (!isEmptyTag(baseName)) {
            var pos = tagStack.length - 1;
            while (pos > -1 && tagStack[pos].name != baseName)
                pos--;
            if (pos > -1)
                while (tagStack.length > pos)
                    tagStack.pop();
        }
        return;
    }
    //This is not a closing tag
    appendChild(parent, element);
    if (!isEmptyTag(element.name)) //Don't add tags to the tag stack that can't have children
        tagStack.push(element);
};
658 | |
//Helpers for searching the element tree built by a handler
var DomUtils = {
    /**
     * Checks a single element against every test in options.
     *
     * @param {Object} options Map of test functions. "tag_name" tests the name
     *     of tag/script/style elements, "tag_type" tests the element type,
     *     "tag_contains" tests the data of text/comment/directive elements;
     *     any other key tests the attribute of that name.
     * @param {Object} element The element to check.
     * @returns {boolean} true when the element exists and passes every test.
     */
    testElement: function DomUtils$testElement (options, element) {
        if (!element) {
            return false;
        }

        for (var key in options) {
            if (key == "tag_name") {
                if (element.type != "tag" && element.type != "script" && element.type != "style") {
                    return false;
                }
                if (!options["tag_name"](element.name)) {
                    return false;
                }
            } else if (key == "tag_type") {
                if (!options["tag_type"](element.type)) {
                    return false;
                }
            } else if (key == "tag_contains") {
                if (element.type != "text" && element.type != "comment" && element.type != "directive") {
                    return false;
                }
                if (!options["tag_contains"](element.data)) {
                    return false;
                }
            } else {
                //Attribute test; an element without attributes never matches
                if (!element.attribs || !options[key](element.attribs[key])) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Collects the elements that pass all tests in options, in document order.
     *
     * @param {Object} options Tests as understood by testElement; a value that
     *     is not a function is compared to the tested value with ==. The
     *     object itself is not modified.
     * @param {Object|Array} currentElement Element or list of elements to search.
     * @param {boolean} [recurse=true] Whether to descend into children.
     * @param {number} [limit] Maximum number of results; anything that does not
     *     parse as an integer, or a negative number, means no limit.
     * @returns {Array} The matching elements, never more than limit.
     */
    , getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {
        recurse = (recurse === undefined || recurse === null) || !!recurse;
        limit = parseInt(limit, 10);
        if (isNaN(limit)) {
            limit = -1;
        }

        if (!currentElement || limit == 0) {
            return([]);
        }

        function getTest (checkVal) {
            return(function (value) { return(value == checkVal); });
        }
        //Normalise into a private copy so the caller's options object is left alone
        var tests = {};
        for (var key in options) {
            tests[key] = ((typeof options[key]) == "function") ? options[key] : getTest(options[key]);
        }

        var found = [];
        if (DomUtils.testElement(tests, currentElement)) {
            found.push(currentElement);
        }

        if (limit >= 0 && found.length >= limit) {
            return(found);
        }

        var elementList;
        if (recurse && currentElement.children) {
            elementList = currentElement.children;
        } else if (currentElement instanceof Array) {
            elementList = currentElement;
        } else {
            return(found);
        }

        for (var i = 0; i < elementList.length; i++) {
            //Hand down only what is left of the limit; passing the full limit
            //let a single subtree push the total past it
            var remaining = (limit >= 0) ? limit - found.length : -1;
            found = found.concat(DomUtils.getElements(tests, elementList[i], recurse, remaining));
            if (limit >= 0 && found.length >= limit) {
                break;
            }
        }

        return(found);
    }

    /**
     * Finds the first element whose id attribute equals id.
     *
     * @returns {Object|null} The element, or null when there is none.
     */
    , getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
        var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
        return(result.length ? result[0] : null);
    }

    //Finds the tag/script/style elements whose name matches name (a value or a test function)
    , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) {
        return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit));
    }

    //Finds the elements whose type matches type (a value or a test function)
    , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) {
        return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit));
    }
};
753 | |
/**
 * Makes ctor inherit from superCtor: ctor.prototype becomes a fresh object
 * whose prototype is superCtor.prototype (created without running
 * superCtor), and the super constructor is exposed as ctor.super_.
 *
 * @param {Function} ctor Constructor that should inherit.
 * @param {Function} superCtor Constructor to inherit from.
 */
function inherits (ctor, superCtor) {
    //Empty stand-in constructor, so building the prototype has no side effects
    function Surrogate () { }
    Surrogate.prototype = superCtor.prototype;

    ctor.super_ = superCtor;
    ctor.prototype = new Surrogate();
    ctor.prototype.constructor = ctor;
}
761 | |
//Public API of the module
//The parser and the handlers that can be plugged into it
exports.Parser = Parser;
exports.DefaultHandler = DefaultHandler;
exports.RssHandler = RssHandler;
//Supporting definitions: element type names and DOM search helpers
exports.ElementType = ElementType;
exports.DomUtils = DomUtils;
771 | |
772 | })(); |