Skip to content

Commit 5e6fcb3

Browse files
committed
Landed first version of FSM-based tokenizer
FSM style taken from creationix/jsonparse. Support for special tags (<script> & <style>) is still missing.
1 parent 8756001 commit 5e6fcb3

File tree

1 file changed

+349
-0
lines changed

1 file changed

+349
-0
lines changed

lib/Tokenizer.js

Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
module.exports = Parser;

var WritableStream = require("stream").Writable;

// Running counter; every state below takes the next integer, starting at 0.
var i = 0;

var TEXT = i++;
var TAG_START = i++; // entered after "<"
var IN_TAG_NAME = i++;
var CLOSING_TAG_START = i++;
var IN_CLOSING_TAG_NAME = i++;
var AFTER_CLOSING_TAG_NAME = i++;

// attributes
var BEFORE_ATTRIBUTE_NAME = i++;
var IN_ATTRIBUTE_NAME = i++;
var AFTER_ATTRIBUTE_NAME = i++;
var BEFORE_ATTRIBUTE_VALUE = i++;
var IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES = i++; // entered after "
var IN_ATTRIBUTE_VALUE_SINGLE_QUOTES = i++; // entered after '
var IN_ATTRIBUTE_VALUE_NO_QUOTES = i++;

// declarations
var DECLARATION_START = i++; // entered after !
var IN_DECLARATION = i++;

// processing instructions
var IN_PROCESSING_INSTRUCTION = i++; // entered after ?

// comments
var BEFORE_COMMENT = i++;
var IN_COMMENT = i++;
var COMMENT_END_1 = i++;
var COMMENT_END_2 = i++;

// cdata: one state per character of the "[CDATA[" opener
var CDATA_1 = i++; // entered after [
var CDATA_2 = i++; // entered after C
var CDATA_3 = i++; // entered after D
var CDATA_4 = i++; // entered after A
var CDATA_5 = i++; // entered after T
var CDATA_6 = i++; // entered after A
var IN_CDATA = i++; // entered after [
var CDATA_END_1 = i++; // entered after ]
var CDATA_END_2 = i++; // entered after ]
46+
47+
//TODO add logic to handle special tags
48+
49+
/**
 * Returns the UTF-16 code unit of the first character of a string.
 *
 * @param {string} c - string whose first character is inspected
 * @returns {number} char code at index 0 (NaN for an empty string)
 */
function code(c){
    var firstUnit = c.charCodeAt(0);
    return firstUnit;
}
52+
53+
/**
 * Tests whether a character code is HTML whitespace.
 *
 * Recognizes space, tab, line feed, form feed and carriage return. The
 * codes are written as literals so that no strings are created per call.
 *
 * @param {number} c - character code (a byte read from the input buffer)
 * @returns {boolean} true if `c` is a whitespace character
 */
function whitespace(c){
    return c === 0x20 || // space
        c === 0x09 ||    // \t
        c === 0x0A ||    // \n
        c === 0x0C ||    // \f
        c === 0x0D;      // \r
}
56+
57+
/**
 * Writable stream that tokenizes the data written to it.
 *
 * @constructor
 * @param {Object} [options] - stored on the instance and passed through
 *                             unchanged to the Writable constructor
 */
function Parser(options){
    // current state of the state machine
    this._state = TEXT;

    // bytes not yet fully processed (null while nothing is buffered)
    this._buffer = null;

    // offset in _buffer where the current section begins
    this._sectionStart = 0;

    // offset in _buffer of the next byte to look at
    this._index = 0;

    this._options = options;

    WritableStream.call(this, options);
}

require("util").inherits(Parser, WritableStream);
68+
69+
/**
 * Consumes one chunk of input, runs the state machine over every byte and
 * emits an event for each section that is completed.
 *
 * Events emitted (all but "selfclosingtag" carry the section as a string):
 * "text", "opentagname", "selfclosingtag", "closetag", "attribname",
 * "attribvalue", "declaration", "processinginstruction", "comment", "cdata".
 *
 * A section that is still open when the chunk ends is kept in `_buffer`
 * (rebased so it starts at offset 0) and completed by a later call. Text is
 * only emitted once the next "<" is seen; this method never flushes it.
 *
 * @param {Buffer} chunk - bytes to tokenize
 * @param {string} encoding - unused
 * @param {Function} cb - called once the chunk has been processed
 * @throws {Error} if `_state` holds a value that is not a known state
 */
Parser.prototype._write = function(chunk, encoding, cb){
    if(this._buffer === null) this._buffer = chunk;
    else this._buffer = Buffer.concat([this._buffer, chunk]);

    while(this._index < this._buffer.length){
        var c = this._buffer[this._index];
        if(this._state === TEXT){
            if(c === code("<")){
                this._emitIfToken("text");
                this._state = TAG_START;
            }
        } else if(this._state === TAG_START){
            if(c === code("!")){
                this._state = DECLARATION_START;
                this._sectionStart = this._index + 1;
            } else if(c === code("?")){
                this._state = IN_PROCESSING_INSTRUCTION;
                this._sectionStart = this._index + 1;
            } else if(c === code("/")){
                this._state = CLOSING_TAG_START;
            } else if(!whitespace(c)){
                this._state = IN_TAG_NAME;
                this._sectionStart = this._index;
            }
            //TODO handle ">"
        } else if(this._state === IN_TAG_NAME){
            if(c === code("/")){
                this._emitToken("opentagname");
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if(c === code(">")){
                this._emitToken("opentagname");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if(whitespace(c)){
                this._emitToken("opentagname");
                this._state = BEFORE_ATTRIBUTE_NAME;
            }
        } else if(this._state === CLOSING_TAG_START){
            if(!whitespace(c)){
                this._state = IN_CLOSING_TAG_NAME;
                this._sectionStart = this._index;
            }
            // TODO handle ">"
        } else if(this._state === IN_CLOSING_TAG_NAME){
            if(c === code(">")){
                this._emitToken("closetag");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if(whitespace(c)){
                this._emitToken("closetag");
                this._state = AFTER_CLOSING_TAG_NAME;
            }
        } else if(this._state === AFTER_CLOSING_TAG_NAME){
            //skip everything until ">"
            if(c === code(">")){
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            }
        }

        /*
        *	attributes
        */
        else if(this._state === BEFORE_ATTRIBUTE_NAME){
            if(c === code("/")){
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if(c === code(">")){
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if(!whitespace(c)){
                this._state = IN_ATTRIBUTE_NAME;
                this._sectionStart = this._index;
            }
        } else if(this._state === IN_ATTRIBUTE_NAME){
            if(c === code("=")){
                this._emitIfToken("attribname");
                this._state = BEFORE_ATTRIBUTE_VALUE;
            } else if(c === code("/")){
                this._emitIfToken("attribname");
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if(c === code(">")){
                this._emitIfToken("attribname");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if(whitespace(c)){
                this._emitIfToken("attribname");
                this._state = AFTER_ATTRIBUTE_NAME;
            }
        } else if(this._state === AFTER_ATTRIBUTE_NAME){
            if(c === code("=")){
                this._state = BEFORE_ATTRIBUTE_VALUE;
            } else if(c === code("/")){
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if(c === code(">")){
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if(!whitespace(c)){
                this._state = IN_ATTRIBUTE_NAME;
                this._sectionStart = this._index;
            }
        } else if(this._state === BEFORE_ATTRIBUTE_VALUE){
            if(c === code("\"")){
                this._state = IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES;
                this._sectionStart = this._index + 1;
            } else if(c === code("'")){
                this._state = IN_ATTRIBUTE_VALUE_SINGLE_QUOTES;
                this._sectionStart = this._index + 1;
            } else if(!whitespace(c)){
                this._state = IN_ATTRIBUTE_VALUE_NO_QUOTES;
                this._sectionStart = this._index;
            }
        } else if(this._state === IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES){
            if(c === code("\"")){
                this._emitToken("attribvalue");
                this._state = BEFORE_ATTRIBUTE_NAME;
            }
        } else if(this._state === IN_ATTRIBUTE_VALUE_SINGLE_QUOTES){
            if(c === code("'")){
                this._emitToken("attribvalue");
                this._state = BEFORE_ATTRIBUTE_NAME;
            }
        } else if(this._state === IN_ATTRIBUTE_VALUE_NO_QUOTES){
            if(c === code("/")){
                this._emitToken("attribvalue");
                this.emit("selfclosingtag");
                this._state = AFTER_CLOSING_TAG_NAME;
            } else if(c === code(">")){
                this._emitToken("attribvalue");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if(whitespace(c)){
                this._emitToken("attribvalue");
                this._state = BEFORE_ATTRIBUTE_NAME;
            }
        }

        /*
        *	declarations
        *
        *	Whenever a comment/cdata opener turns out not to match, the state
        *	falls back to IN_DECLARATION and the current character is looked
        *	at again (`continue` skips the index increment), so that a ">"
        *	at that position still closes the declaration.
        */
        else if(this._state === DECLARATION_START){
            if(c === code("[")) this._state = CDATA_1;
            else if(c === code("-")) this._state = BEFORE_COMMENT;
            else {
                this._state = IN_DECLARATION;
                continue;
            }
        } else if(this._state === IN_DECLARATION){
            if(c === code(">")){
                this._emitToken("declaration");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            }
        }

        /*
        *	processing instructions
        */
        else if(this._state === IN_PROCESSING_INSTRUCTION){
            if(c === code(">")){
                this._emitToken("processinginstruction");
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            }
        }

        /*
        *	comments
        */
        else if(this._state === BEFORE_COMMENT){
            if(c === code("-")){
                this._state = IN_COMMENT;
                this._sectionStart = this._index + 1;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
        } else if(this._state === IN_COMMENT){
            if(c === code("-")) this._state = COMMENT_END_1;
        } else if(this._state === COMMENT_END_1){
            if(c === code("-")) this._state = COMMENT_END_2;
            else this._state = IN_COMMENT;
        } else if(this._state === COMMENT_END_2){
            if(c === code(">")){
                //remove 2 trailing chars
                this.emit("comment", this._buffer.toString("utf8", this._sectionStart, this._index - 2));
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if(c !== code("-")){
                this._state = IN_COMMENT;
            }
            //else: another "-"; the last two chars are still "--", so stay
        }

        /*
        *	cdata
        */
        else if(this._state === CDATA_1){
            if(c === code("C")) this._state = CDATA_2;
            else {
                this._state = IN_DECLARATION;
                continue;
            }
        } else if(this._state === CDATA_2){
            if(c === code("D")) this._state = CDATA_3;
            else {
                this._state = IN_DECLARATION;
                continue;
            }
        } else if(this._state === CDATA_3){
            if(c === code("A")) this._state = CDATA_4;
            else {
                this._state = IN_DECLARATION;
                continue;
            }
        } else if(this._state === CDATA_4){
            if(c === code("T")) this._state = CDATA_5;
            else {
                this._state = IN_DECLARATION;
                continue;
            }
        } else if(this._state === CDATA_5){
            if(c === code("A")) this._state = CDATA_6;
            else {
                this._state = IN_DECLARATION;
                continue;
            }
        } else if(this._state === CDATA_6){
            if(c === code("[")){
                this._state = IN_CDATA;
                this._sectionStart = this._index + 1;
            } else {
                this._state = IN_DECLARATION;
                continue;
            }
        } else if(this._state === IN_CDATA){
            if(c === code("]")) this._state = CDATA_END_1;
        } else if(this._state === CDATA_END_1){
            if(c === code("]")) this._state = CDATA_END_2;
            else this._state = IN_CDATA;
        } else if(this._state === CDATA_END_2){
            if(c === code(">")){
                //remove 2 trailing chars
                this.emit("cdata", this._buffer.toString("utf8", this._sectionStart, this._index - 2));
                this._state = TEXT;
                this._sectionStart = this._index + 1;
            } else if(c !== code("]")){
                this._state = IN_CDATA;
            }
            //else: another "]"; the last two chars are still "]]", so stay
        } else {
            throw new Error("unknown state " + this._state);
        }

        this._index++;
    }

    //cleanup: drop the bytes that are no longer needed and rebase the
    //offsets, so that they stay valid once the next chunk is appended
    var start = this._sectionStart;

    if(start === -1 || start >= this._buffer.length){
        //no open section, or it starts with the next chunk
        this._buffer = null;
        this._index = 0;
        if(start !== -1) this._sectionStart = 0;
    } else if(start > 0){
        //keep only the open section
        this._buffer = this._buffer.slice(start);
        this._index -= start;
        this._sectionStart = 0;
    }

    cb();
};
322+
323+
/**
 * Emits the current section (the bytes from `_sectionStart` up to, but not
 * including, `_index`, decoded as UTF-8) under the given event name and
 * marks the section as consumed by setting `_sectionStart` to -1.
 *
 * @param {string} name - name of the event to emit
 */
Parser.prototype._emitToken = function(name){
    var from = this._sectionStart,
        to = this._index,
        value = this._buffer.toString("utf8", from, to);

    this.emit(name, value);
    this._sectionStart = -1;
};
327+
328+
/**
 * Like `_emitToken`, but stays silent when the current section is empty
 * (i.e. `_index` is not past `_sectionStart`). `_sectionStart` is set to -1
 * in either case.
 *
 * @param {string} name - name of the event to emit
 */
Parser.prototype._emitIfToken = function(name){
    var from = this._sectionStart,
        to = this._index;

    if(to > from){
        this.emit(name, this._buffer.toString("utf8", from, to));
    }

    this._sectionStart = -1;
};
334+
335+
/*
336+
//overwritten for better debuggability
337+
Parser.prototype.emit = function(){
338+
process.stdout.write("[" + this._state + "]\t");
339+
console.log.apply(null, [].map.call(arguments, Function.prototype.call, String.prototype.trim));
340+
WritableStream.prototype.emit.apply(this, arguments);
341+
};
342+
343+
Parser.prototype.end = function(){
344+
WritableStream.prototype.end.apply(this, arguments);
345+
346+
if(this._state === TEXT) return;
347+
console.log("the game must go on!", this._state);
348+
};
349+
*/

0 commit comments

Comments
 (0)