node_modules/jsdom/node_modules/htmlparser/README.md

   1 #NodeHtmlParser
   2 A forgiving HTML/XML/RSS parser written in JS for both the browser and NodeJS (yes, despite the name it works just fine in any modern browser). The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output.
   3
   4 ##Installing
   5
   6         npm install htmlparser
   7
   8 ##Running Tests
   9
  10 ###Run tests under node:
  11         node runtests.js
  12
  13 ###Run tests in browser:
  14 View runtests.html in any browser
  15
  16 ##Usage In Node
  17
  18 ```javascript
  19 var htmlparser = require("htmlparser");
  20 var rawHtml = "Xyz <script language= javascript>var foo = '<<bar>>';< /  script><!--<!-- Waah! -- -->";
  21 var handler = new htmlparser.DefaultHandler(function (error, dom) {
  22         if (error)
  23                 [...do something for errors...]
  24         else
  25                 [...parsing done, do something...]
  26 });
  27 var parser = new htmlparser.Parser(handler);
  28 parser.parseComplete(rawHtml);
  29 sys.puts(sys.inspect(handler.dom, false, null));
  30 ```
  31
  32 ##Usage In Browser
  33
  34 ```javascript
  35 var handler = new Tautologistics.NodeHtmlParser.DefaultHandler(function (error, dom) {
  36         if (error)
  37                 [...do something for errors...]
  38         else
  39                 [...parsing done, do something...]
  40 });
  41 var parser = new Tautologistics.NodeHtmlParser.Parser(handler);
  42 parser.parseComplete(document.body.innerHTML);
  43 alert(JSON.stringify(handler.dom, null, 2));
  44 ```
  45
  46 ##Example output
  47
  48 ```javascript
  49 [ { raw: 'Xyz ', data: 'Xyz ', type: 'text' }
  50   , { raw: 'script language= javascript'
  51   , data: 'script language= javascript'
  52   , type: 'script'
  53   , name: 'script'
  54   , attribs: { language: 'javascript' }
  55   , children:
  56      [ { raw: 'var foo = \'<bar>\';<'
  57        , data: 'var foo = \'<bar>\';<'
  58        , type: 'text'
  59        }
  60      ]
  61   }
  62 , { raw: '<!-- Waah! -- '
  63   , data: '<!-- Waah! -- '
  64   , type: 'comment'
  65   }
  66 ]
  67 ```
  68
  69 ##Streaming To Parser
  70
  71 ```javascript
  72 while (...) {
  73         ...
  74         parser.parseChunk(chunk);
  75 }
  76 parser.done();
  77 ```
  78
  79 ##Parsing RSS/Atom Feeds
  80
  81 ```javascript
  82 new htmlparser.RssHandler(function (error, dom) {
  83         ...
  84 });
  85 ```
  86
  87 ##DefaultHandler Options
  88
  89 ###Usage
  90
  91 ```javascript
  92 var handler = new htmlparser.DefaultHandler(
  93           function (error) { ... }
  94         , { verbose: false, ignoreWhitespace: true }
  95         );
  96 ```
  97
  98 ###Option: ignoreWhitespace
   99 Indicates whether the DOM should exclude text nodes that consist solely of whitespace. The default value is "false".
 100
 101 ####Example: true
 102
 103 The following HTML:
 104
 105 ```html
 106 <font>
 107         <br>this is the text
 108 <font>
 109 ```
 110
 111 becomes:
 112
 113 ```javascript
 114 [ { raw: 'font'
 115   , data: 'font'
 116   , type: 'tag'
 117   , name: 'font'
 118   , children:
 119      [ { raw: 'br', data: 'br', type: 'tag', name: 'br' }
 120      , { raw: 'this is the text\n'
 121        , data: 'this is the text\n'
 122        , type: 'text'
 123        }
 124      , { raw: 'font', data: 'font', type: 'tag', name: 'font' }
 125      ]
 126   }
 127 ]
 128 ```
 129
 130 ####Example: false
 131
 132 The following HTML:
 133
 134 ```html
 135 <font>
 136         <br>this is the text
 137 <font>
 138 ```
 139
 140 becomes:
 141
 142 ```javascript
 143 [ { raw: 'font'
 144   , data: 'font'
 145   , type: 'tag'
 146   , name: 'font'
 147   , children:
 148      [ { raw: '\n\t', data: '\n\t', type: 'text' }
 149      , { raw: 'br', data: 'br', type: 'tag', name: 'br' }
 150      , { raw: 'this is the text\n'
 151        , data: 'this is the text\n'
 152        , type: 'text'
 153        }
 154      , { raw: 'font', data: 'font', type: 'tag', name: 'font' }
 155      ]
 156   }
 157 ]
 158 ```
 159
 160 ###Option: verbose
 161 Indicates whether to include extra information on each node in the DOM. This information consists of the "raw" attribute (original, unparsed text found between "<" and ">") and the "data" attribute on "tag", "script", and "comment" nodes. The default value is "true".
 162
 163 ####Example: true
 164 The following HTML:
 165
 166 ```html
 167 <a href="test.html">xxx</a>
 168 ```
 169
 170 becomes:
 171
 172 ```javascript
 173 [ { raw: 'a href="test.html"'
 174   , data: 'a href="test.html"'
 175   , type: 'tag'
 176   , name: 'a'
 177   , attribs: { href: 'test.html' }
 178   , children: [ { raw: 'xxx', data: 'xxx', type: 'text' } ]
 179   }
 180 ]
 181 ```
 182
 183 ####Example: false
 184 The following HTML:
 185
  186 ```html
 187 <a href="test.html">xxx</a>
 188 ```
 189
 190 becomes:
 191
 192 ```javascript
 193 [ { type: 'tag'
 194   , name: 'a'
 195   , attribs: { href: 'test.html' }
 196   , children: [ { data: 'xxx', type: 'text' } ]
 197   }
 198 ]
 199 ```
 200
 201 ###Option: enforceEmptyTags
  202 Indicates whether the DOM should prevent children on tags marked as empty in the HTML spec. Typically this should be set to "true" for HTML parsing and "false" for XML parsing. The default value is "true".
 203
 204 ####Example: true
 205 The following HTML:
 206
 207 ```html
 208 <link>text</link>
 209 ```
 210
 211 becomes:
 212
 213 ```javascript
 214 [ { raw: 'link', data: 'link', type: 'tag', name: 'link' }
 215 , { raw: 'text', data: 'text', type: 'text' }
 216 ]
 217 ```
 218
 219 ####Example: false
 220 The following HTML:
 221
 222 ```html
 223 <link>text</link>
 224 ```
 225
 226 becomes:
 227
 228 ```javascript
 229 [ { raw: 'link'
 230   , data: 'link'
 231   , type: 'tag'
 232   , name: 'link'
 233   , children: [ { raw: 'text', data: 'text', type: 'text' } ]
 234   }
 235 ]
 236 ```
 237
 238 ##DomUtils
 239
 240 ###TBD (see utils_example.js for now)
 241
 242 ##Related Projects
 243
 244 Looking for CSS selectors to search the DOM? Try Node-SoupSelect, a port of SoupSelect to NodeJS: http://github.com/harryf/node-soupselect
 245
 246 There's also a port of hpricot to NodeJS that uses HtmlParser for HTML parsing: http://github.com/silentrob/Apricot
 247