// Public API of this module: the streaming tokenizer constructor.
// (`Parser` is a function declaration, so it is hoisted and already defined here.)
module.exports = Parser;
2+
var WritableStream = require("stream").Writable,

    // Counter used only to number the states below. Once this statement has
    // run it holds the total number of states.
    i = 0,

    /*
     * Tokenizer states. Each one names what has been consumed so far; the
     * trailing comment shows the input that leads into the state.
     */
    TEXT = i++,                   // outside of any markup (initial state)
    TAG_START = i++,              // after "<"
    IN_TAG_NAME = i++,            // inside an opening tag's name
    CLOSING_TAG_START = i++,      // after "</"
    IN_CLOSING_TAG_NAME = i++,    // inside a closing tag's name
    AFTER_CLOSING_TAG_NAME = i++, // skipping everything up to the next ">"

    //attributes
    BEFORE_ATTRIBUTE_NAME = i++,
    IN_ATTRIBUTE_NAME = i++,
    AFTER_ATTRIBUTE_NAME = i++,
    BEFORE_ATTRIBUTE_VALUE = i++,             // after "="
    IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES = i++,   // after an opening "
    IN_ATTRIBUTE_VALUE_SINGLE_QUOTES = i++,   // after an opening '
    IN_ATTRIBUTE_VALUE_NO_QUOTES = i++,       // unquoted value

    //declarations
    DECLARATION_START = i++, // after "<!"
    IN_DECLARATION = i++,    // "<!..." that is neither a comment nor CDATA

    //processing instructions
    IN_PROCESSING_INSTRUCTION = i++, // after "<?"

    //comments
    BEFORE_COMMENT = i++, // after "<!-"
    IN_COMMENT = i++,     // after "<!--"
    COMMENT_END_1 = i++,  // after "-" inside a comment
    COMMENT_END_2 = i++,  // after "--" inside a comment

    //cdata
    CDATA_1 = i++,     // after "<!["
    CDATA_2 = i++,     // after "<![C"
    CDATA_3 = i++,     // after "<![CD"
    CDATA_4 = i++,     // after "<![CDA"
    CDATA_5 = i++,     // after "<![CDAT"
    CDATA_6 = i++,     // after "<![CDATA"
    IN_CDATA = i++,    // after "<![CDATA["
    CDATA_END_1 = i++, // after "]" inside CDATA
    CDATA_END_2 = i++; // after "]]" inside CDATA
46+
// TODO: add logic to handle special tags
48+
/**
 * Returns the UTF-16 code unit of the first character of `c`.
 *
 * Used to compare raw buffer bytes against readable character literals.
 *
 * @param {string} c A string; only its first character is looked at.
 * @returns {number} `c.charCodeAt(0)` (NaN for an empty string).
 */
function code(c) {
    return c.charCodeAt(0);
}
52+
/**
 * Tells whether a byte is ASCII whitespace: space, tab, line feed,
 * form feed or carriage return.
 *
 * The byte values are written as literals so that this check, which runs
 * once per input byte, does not recompute character codes on every call.
 *
 * @param {number} c A byte value (0-255), as read from a Buffer.
 * @returns {boolean} true when `c` is one of the five whitespace bytes.
 */
function whitespace(c) {
    return c === 0x20 || // " "
        c === 0x09 ||    // "\t"
        c === 0x0a ||    // "\n"
        c === 0x0c ||    // "\f"
        c === 0x0d;      // "\r"
}
56+
/**
 * Streaming markup tokenizer, implemented as a Writable stream.
 *
 * Data written to the stream is scanned byte by byte in `_write`, and each
 * recognised token is emitted as an event on the instance.
 *
 * @constructor
 * @param {Object} [options] Kept on the instance as `_options` and passed
 *     unchanged to the Writable constructor.
 */
function Parser(options) {
    // Support being called without `new`.
    if (!(this instanceof Parser)) return new Parser(options);

    this._state = TEXT;      // current tokenizer state
    this._buffer = null;     // bytes still being scanned; null when nothing is pending
    this._sectionStart = 0;  // offset in _buffer where the current token starts; -1 when none
    this._index = 0;         // offset in _buffer of the next byte to examine
    this._options = options;

    WritableStream.call(this, options);
}

// Must run before any method is attached to Parser.prototype.
require("util").inherits(Parser, WritableStream);
68+
/**
 * Writable-stream hook: scans `chunk` byte by byte and emits one event per
 * recognised token.
 *
 * Events emitted (each with the token text as a UTF-8 string, except
 * "selfclosingtag", which has no argument): "text", "opentagname",
 * "selfclosingtag", "closetag", "attribname", "attribvalue", "declaration",
 * "processinginstruction", "comment", "cdata".
 *
 * A token that is still open when the chunk ends is kept in `_buffer` and
 * completed by a later call. Text is only emitted once the next "<" is seen,
 * so trailing text stays buffered; nothing in this function flushes it.
 *
 * NOTE(review): assumes `chunk` is a Buffer, since bytes are read by numeric
 * index. That holds unless string decoding is disabled via the stream
 * options; confirm against callers.
 *
 * @param {Buffer} chunk Newly written data.
 * @param {string} encoding Unused.
 * @param {Function} cb Called with no arguments once the chunk is consumed.
 * @throws {Error} If `_state` holds a value that is not a known state.
 */
Parser.prototype._write = function (chunk, encoding, cb) {
    // Continue after whatever part of an unfinished token the last call kept.
    if (this._buffer === null) this._buffer = chunk;
    else this._buffer = Buffer.concat([this._buffer, chunk]);

    while (this._index < this._buffer.length) {
        var c = this._buffer[this._index];

        switch (this._state) {
        case TEXT:
            if (c === code("<")) {
                this._emitIfToken("text");
                this._state = TAG_START;
            }
            break;

        case TAG_START:
            if (c === code("!")) {
                this._state = DECLARATION_START;
                this._sectionStart = this._index + 1;
            } else if (c === code("?")) {
                this._state = IN_PROCESSING_INSTRUCTION;
                this._sectionStart = this._index + 1;
            } else if (c === code("/")) {
                this._state = CLOSING_TAG_START;
            } else if (!whitespace(c)) {
                this._state = IN_TAG_NAME;
                this._sectionStart = this._index;
            }
            //TODO handle ">"
            break;

        case IN_TAG_NAME:
            if (c === code("/")) {
                this._emitToken("opentagname");
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if (c === code(">")) {
                this._emitToken("opentagname");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if (whitespace(c)) {
                this._emitToken("opentagname");
                this._state = BEFORE_ATTRIBUTE_NAME;
            }
            break;

        case CLOSING_TAG_START:
            if (!whitespace(c)) {
                this._state = IN_CLOSING_TAG_NAME;
                this._sectionStart = this._index;
            }
            //TODO handle ">"
            break;

        case IN_CLOSING_TAG_NAME:
            if (c === code(">")) {
                this._emitToken("closetag");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if (whitespace(c)) {
                this._emitToken("closetag");
                this._state = AFTER_CLOSING_TAG_NAME;
            }
            break;

        case AFTER_CLOSING_TAG_NAME:
            //skip everything until ">"
            if (c === code(">")) {
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            }
            break;

        /*
         * attributes
         */
        case BEFORE_ATTRIBUTE_NAME:
            if (c === code("/")) {
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if (c === code(">")) {
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if (!whitespace(c)) {
                this._state = IN_ATTRIBUTE_NAME;
                this._sectionStart = this._index;
            }
            break;

        case IN_ATTRIBUTE_NAME:
            if (c === code("=")) {
                this._emitIfToken("attribname");
                this._state = BEFORE_ATTRIBUTE_VALUE;
            } else if (c === code("/")) {
                this._emitIfToken("attribname");
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if (c === code(">")) {
                this._emitIfToken("attribname");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if (whitespace(c)) {
                this._emitIfToken("attribname");
                this._state = AFTER_ATTRIBUTE_NAME;
            }
            break;

        case AFTER_ATTRIBUTE_NAME:
            if (c === code("=")) {
                this._state = BEFORE_ATTRIBUTE_VALUE;
            } else if (c === code("/")) {
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if (c === code(">")) {
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if (!whitespace(c)) {
                this._state = IN_ATTRIBUTE_NAME;
                this._sectionStart = this._index;
            }
            break;

        case BEFORE_ATTRIBUTE_VALUE:
            if (c === code("\"")) {
                this._state = IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES;
                this._sectionStart = this._index + 1;
            } else if (c === code("'")) {
                this._state = IN_ATTRIBUTE_VALUE_SINGLE_QUOTES;
                this._sectionStart = this._index + 1;
            } else if (!whitespace(c)) {
                this._state = IN_ATTRIBUTE_VALUE_NO_QUOTES;
                this._sectionStart = this._index;
            }
            break;

        case IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES:
            if (c === code("\"")) {
                this._emitToken("attribvalue");
                this._state = BEFORE_ATTRIBUTE_NAME;
            }
            break;

        case IN_ATTRIBUTE_VALUE_SINGLE_QUOTES:
            if (c === code("'")) {
                this._emitToken("attribvalue");
                this._state = BEFORE_ATTRIBUTE_NAME;
            }
            break;

        case IN_ATTRIBUTE_VALUE_NO_QUOTES:
            if (c === code("/")) {
                this._emitToken("attribvalue");
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if (c === code(">")) {
                this._emitToken("attribvalue");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if (whitespace(c)) {
                this._emitToken("attribvalue");
                this._state = BEFORE_ATTRIBUTE_NAME;
            }
            break;

        /*
         * declarations
         *
         * Whenever a "<!-" or "<![CDATA[" prefix turns out not to match, the
         * offending byte is itself declaration content. The state is switched
         * and the same byte is examined again (`continue` skips the index
         * increment) so that a ">" at that position still ends the
         * declaration.
         */
        case DECLARATION_START:
            if (c === code("[")) {
                this._state = CDATA_1;
            } else if (c === code("-")) {
                this._state = BEFORE_COMMENT;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
            break;

        case IN_DECLARATION:
            if (c === code(">")) {
                this._emitToken("declaration");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            }
            break;

        /*
         * processing instructions
         */
        case IN_PROCESSING_INSTRUCTION:
            if (c === code(">")) {
                this._emitToken("processinginstruction");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            }
            break;

        /*
         * comments
         */
        case BEFORE_COMMENT:
            if (c === code("-")) {
                this._state = IN_COMMENT;
                this._sectionStart = this._index + 1;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
            break;

        case IN_COMMENT:
            if (c === code("-")) this._state = COMMENT_END_1;
            break;

        case COMMENT_END_1:
            if (c === code("-")) this._state = COMMENT_END_2;
            else this._state = IN_COMMENT;
            break;

        case COMMENT_END_2:
            if (c === code(">")) {
                //remove 2 trailing chars
                this.emit("comment", this._buffer.toString("utf8", this._sectionStart, this._index - 2));
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if (c !== code("-")) {
                this._state = IN_COMMENT;
            }
            // else: another "-" - the last two bytes are still "--", so stay
            // here; this is what lets "--->" close the comment.
            break;

        /*
         * cdata
         */
        case CDATA_1:
            if (c === code("C")) {
                this._state = CDATA_2;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
            break;

        case CDATA_2:
            if (c === code("D")) {
                this._state = CDATA_3;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
            break;

        case CDATA_3:
            if (c === code("A")) {
                this._state = CDATA_4;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
            break;

        case CDATA_4:
            if (c === code("T")) {
                this._state = CDATA_5;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
            break;

        case CDATA_5:
            if (c === code("A")) {
                this._state = CDATA_6;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
            break;

        case CDATA_6:
            if (c === code("[")) {
                this._state = IN_CDATA;
                this._sectionStart = this._index + 1;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
            break;

        case IN_CDATA:
            if (c === code("]")) this._state = CDATA_END_1;
            break;

        case CDATA_END_1:
            if (c === code("]")) this._state = CDATA_END_2;
            else this._state = IN_CDATA;
            break;

        case CDATA_END_2:
            if (c === code(">")) {
                //remove 2 trailing chars
                this.emit("cdata", this._buffer.toString("utf8", this._sectionStart, this._index - 2));
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if (c !== code("]")) {
                this._state = IN_CDATA;
            }
            // else: another "]" - the last two bytes are still "]]", so stay
            // here; this is what lets "]]]>" close the section.
            break;

        default:
            throw new Error("unknown state " + this._state);
        }

        this._index++;
    }

    //cleanup
    if (this._sectionStart === -1) {
        // No token is open: nothing has to survive until the next chunk.
        this._buffer = null;
        this._index = 0;
    } else {
        // A token is still open: keep only its bytes and rebase both offsets
        // so they stay valid once the next chunk is appended.
        this._buffer = this._buffer.slice(this._sectionStart);
        this._index -= this._sectionStart;
        this._sectionStart = 0;

        if (this._buffer.length === 0) this._buffer = null;
    }

    cb();
};
322+
/**
 * Emits the current section of the buffer as an event and closes the section.
 *
 * The event argument is the UTF-8 decoding of the bytes from `_sectionStart`
 * up to, but not including, `_index`. Afterwards `_sectionStart` is -1,
 * meaning that no token is currently open.
 *
 * @param {string} name Event name to emit.
 */
Parser.prototype._emitToken = function (name) {
    var text = this._buffer.toString("utf8", this._sectionStart, this._index);

    this.emit(name, text);
    this._sectionStart = -1;
};
327+
/**
 * Like `_emitToken`, but stays silent when the current section is empty.
 *
 * If at least one byte lies between `_sectionStart` and `_index`, the section
 * is emitted under `name`. In both cases `_sectionStart` ends up as -1.
 *
 * @param {string} name Event name to emit.
 */
Parser.prototype._emitIfToken = function (name) {
    if (this._index > this._sectionStart) {
        // _emitToken emits the section and resets _sectionStart itself.
        this._emitToken(name);
    } else {
        this._sectionStart = -1;
    }
};
334+
/*
//overwritten for better debuggability
Parser.prototype.emit = function(){
	process.stdout.write("[" + this._state + "]\t");
	console.log.apply(null, [].map.call(arguments, Function.prototype.call, String.prototype.trim));
	WritableStream.prototype.emit.apply(this, arguments);
};

Parser.prototype.end = function(){
	WritableStream.prototype.end.apply(this, arguments);

	if(this._state === TEXT) return;
	console.log("the game must go on!", this._state);
};
*/