dlangui.dml.tokenizer source code

1 module dlangui.dml.tokenizer;
2 
3 import dlangui.core.types;
4 import dlangui.core.linestream;
5 
6 import std.conv : to;
7 import std.utf : toUTF32;
8 import std.algorithm : equal, min, max;
9 
/// Kinds of tokens produced by the DML tokenizer.
///
/// The first group covers structural tokens and literals, the second group
/// (starting at `colon`) covers single-character operators and brackets.
enum TokenType : ushort {
    // ---- structural tokens and literals ----

    /// end of input reached
    eof,
    /// end of a source line
    eol,
    /// run of spaces and/or tabs
    whitespace,
    /// quoted string literal
    str,
    /// integer literal (decimal or hexadecimal)
    integer,
    /// floating point literal
    floating,
    /// single-line or multi-line comment
    comment,
    /// identifier
    ident,
    /// text which could not be recognized as a valid token
    error,

    // ---- operators and brackets ----

    /// `:` operator
    colon,
    /// `.` operator
    dot,
    /// `;` operator
    semicolon,
    /// `/` operator
    divide,
    /// `,` operator
    comma,
    /// `-` operator
    minus,
    /// `+` operator
    plus,
    /// `{`
    curlyOpen,
    /// `}`
    curlyClose,
    /// `(`
    open,
    /// `)`
    close,
    /// `[`
    squareOpen,
    /// `]`
    squareClose,
}
57 
/// A single lexical token together with its position in the source.
struct Token {
    /// kind of token
    TokenType type;
    /// line counter value of the tokenizer at the token start
    ushort line;
    /// character position inside the line at the token start
    ushort pos;
    /// for comment tokens: true if it is a `/* ... */` comment
    bool multiline;
    /// string/identifier content, or the unit suffix (e.g. "%", "px") of a numeric literal
    string text;
    /// numeric payload; which member is valid depends on `type`
    union {
        /// value of an `integer` token
        int intvalue;
        /// value of a `floating` token
        double floatvalue;
    }

    /// Human readable dump: `line:pos type value`, where value is the number
    /// for numeric tokens and the quoted text for everything else.
    public @property string toString() const {
        // location and type prefix is shared by all three forms
        string prefix = to!string(line) ~ ":" ~ to!string(pos) ~ " " ~ to!string(type) ~ " ";
        if (type == TokenType.integer)
            return prefix ~ to!string(intvalue);
        if (type == TokenType.floating)
            return prefix ~ to!string(floatvalue);
        return prefix ~ "\"" ~ text ~ "\"";
    }

    /// True if this token is a multi-line comment.
    ///
    /// Declared `const` so it can be called on tokens obtained through a
    /// `ref const(Token)`.
    @property bool isMultilineComment() const {
        return type == TokenType.comment && multiline;
    }
}
80 
/// Exception describing a parse error together with its source location.
///
/// The message passed to the base `Exception` has the location appended
/// ("... at FILE line N column M"); the original parts stay available through
/// the properties below.
class ParserException : Exception {
    protected string _msg;
    protected string _file;
    protected int _line;
    protected int _pos;

    /// Creates the exception from the bare message and the error location.
    this(string msg, string file, int line, int pos) {
        string location = " at " ~ file ~ " line " ~ to!string(line) ~ " column " ~ to!string(pos);
        super(msg ~ location);
        _msg = msg;
        _file = file;
        _line = line;
        _pos = pos;
    }

    /// error message without the location suffix
    @property string msg() {
        return _msg;
    }

    /// name of the file being parsed
    @property string file() {
        return _file;
    }

    /// line of the error
    @property int line() {
        return _line;
    }

    /// column of the error
    @property int pos() {
        return _pos;
    }
}
100 
/// Simple tokenizer for DlangUI ML.
///
/// The source is read line by line from a LineStream. Each call to nextToken()
/// recognizes one token starting at the current position and returns a reference
/// to an internal Token instance which is overwritten by the next call.
class Tokenizer {

    /// prefixes which start a comment running to the end of the line
    protected string[] _singleLineCommentPrefixes = ["//"];
    /// line source
    protected LineStream  _lines;
    /// text of the current line; null once the stream is exhausted
    protected dchar[] _lineText;
    /// counter of the current line (starts at 0)
    protected ushort _line;
    /// index of the current character inside _lineText
    protected ushort _pos;
    /// length of _lineText
    protected int _len;
    /// last character returned by nextChar()
    protected dchar _prevChar;
    /// file name used for error reporting
    protected string _filename;
    /// token being built / last token returned
    protected Token _token;

    /// pseudo characters reported at the end of a line and at the end of the input
    enum : int {
        EOF_CHAR = 0x001A,
        EOL_CHAR = 0x000A
    }

    /// Creates a tokenizer over `source`.
    /// Params:
    ///     source = text to tokenize
    ///     filename = name reported in error messages
    ///     singleLineCommentPrefixes = prefixes starting a single line comment
    this(string source, string filename = "", string[] singleLineCommentPrefixes = ["//"]) {
        _singleLineCommentPrefixes = singleLineCommentPrefixes;
        _filename = filename;
        _lines = LineStream.create(source, filename);
        _lineText = _lines.readLine();
        _len = cast(int)_lineText.length;
        _line = 0;
        _pos = 0;
        _prevChar = 0;
    }

    // NOTE(review): this destroys a member object from the destructor; confirm this
    // is safe for LineStream when the tokenizer is finalized by the GC.
    ~this() {
        destroy(_lines);
        _lines = null;
    }

    /// Returns the current character without consuming it
    /// (EOL_CHAR at the end of a line, EOF_CHAR when there are no more lines).
    protected dchar peekChar() {
        if (_pos < _len)
            return _lineText[_pos];
        else if (_lineText is null)
            return EOF_CHAR;
        return EOL_CHAR;
    }

    /// Returns the character following the current one without consuming anything
    /// (EOL_CHAR if the current character is the last one of the line, EOF_CHAR at end of input).
    protected dchar peekNextChar() {
        if (_pos < _len - 1)
            return _lineText[_pos + 1];
        else if (_lineText is null)
            return EOF_CHAR;
        return EOL_CHAR;
    }

    /// Consumes and returns the current character.
    /// Consuming the end of a line loads the next line and returns EOL_CHAR.
    protected dchar nextChar() {
        if (_pos < _len)
            _prevChar = _lineText[_pos++];
        else if (_lineText is null)
            _prevChar = EOF_CHAR;
        else {
            _lineText = _lines.readLine();
            _len = cast(int)_lineText.length;
            _line++;
            _pos = 0;
            _prevChar = EOL_CHAR;
        }
        return _prevChar;
    }

    /// Consumes the current character and returns the one which becomes current.
    protected dchar skipChar() {
        nextChar();
        return peekChar();
    }

    /// Marks the current position as the start of a new token and clears per-token state.
    protected void setTokenStart() {
        _token.pos = _pos;
        _token.line = _line;
        _token.text = null;
        _token.multiline = false; // do not leak the flag of a previous comment token
        _token.floatvalue = 0; // clears the whole value union (intvalue shares its storage)
    }

    /// Produces the end of file token; nothing is consumed.
    protected ref const(Token) parseEof() {
        _token.type = TokenType.eof;
        return _token;
    }

    /// Produces the end of line token and moves to the next line.
    protected ref const(Token) parseEol() {
        _token.type = TokenType.eol;
        nextChar();
        return _token;
    }

    /// Consumes a run of spaces and tabs.
    protected ref const(Token) parseWhiteSpace() {
        _token.type = TokenType.whitespace;
        for(;;) {
            dchar ch = skipChar();
            if (ch != ' ' && ch != '\t')
                break;
        }
        return _token;
    }

    /// true for ASCII letters and underscore
    static bool isAlpha(dchar ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
    }

    /// true for ASCII decimal digits
    static bool isNum(dchar ch) {
        return (ch >= '0' && ch <= '9');
    }

    /// true for ASCII letters, digits and underscore
    static bool isAlphaNum(dchar ch) {
        return isNum(ch) || isAlpha(ch);
    }

    /// reusable buffer for collecting string and identifier text
    private char[] _stringbuf;

    /// Parses a quoted string; the current character is the opening quote and the
    /// same character closes the string.
    ///
    /// Backslash escapes: `\n` and `\t` are translated, any other escaped character
    /// (including the quote and the backslash itself) stands for itself.
    /// An unterminated string ends at the end of the line.
    protected ref const(Token) parseString() {
        _token.type = TokenType.str;
        bool lastBackslash = false;
        _stringbuf.length = 0;
        dchar quoteChar = peekChar();
        for(;;) {
            dchar ch = skipChar(); // first iteration skips the opening quote
            if (ch == quoteChar) {
                if (lastBackslash) {
                    // escaped quote is part of the string
                    _stringbuf ~= ch;
                    lastBackslash = false;
                } else {
                    skipChar(); // consume closing quote
                    break;
                }
            } else if (ch == '\\') {
                if (lastBackslash) {
                    _stringbuf ~= ch;
                    lastBackslash = false;
                } else {
                    lastBackslash = true;
                }
            } else if (ch == EOL_CHAR) {
                // unterminated string: stop at the end of the line
                skipChar();
                break;
            } else if (lastBackslash) {
                if (ch == 'n')
                    ch = '\n';
                else if (ch == 't')
                    ch = '\t';
                _stringbuf ~= ch;
                lastBackslash = false;
            } else {
                _stringbuf ~= ch;
                lastBackslash = false;
            }
        }
        _token.text = _stringbuf.idup;
        return _token;
    }

    /// Parses an identifier: the current character followed by letters, digits and underscores.
    protected ref const(Token) parseIdent() {
        _token.type = TokenType.ident;
        _stringbuf.length = 0;
        _stringbuf ~= peekChar();
        for(;;) {
            dchar ch = skipChar();
            if (!isAlphaNum(ch))
                break;
            _stringbuf ~= ch;
        }
        _token.text = _stringbuf.idup;
        return _token;
    }

    /// Reads the optional unit suffix of a numeric literal starting at the current
    /// character: either a single '%' or a run of lowercase ASCII letters.
    /// Params:
    ///     suffix = receives the suffix text (null if there is none)
    /// Returns: false if the literal is directly followed by an alphanumeric
    ///     character or '.', which makes it malformed
    private bool parseNumberSuffix(out string suffix) {
        dchar ch = peekChar();
        if (ch == '%') {
            suffix ~= ch;
            ch = skipChar();
        } else {
            while (ch >= 'a' && ch <= 'z') {
                suffix ~= ch;
                ch = skipChar();
            }
        }
        return !(isAlphaNum(ch) || ch == '.');
    }

    /// Parses the fractional part of a floating point literal; the current character is the '.'.
    /// Params:
    ///     n = already parsed integer part
    protected ref const(Token) parseFloating(int n) {
        _token.type = TokenType.floating;
        // Only the first 18 fraction digits are accumulated: 10^18 still fits into
        // a ulong, so neither the digits nor the divider can overflow. Extra digits
        // are consumed but ignored.
        enum int MAX_FRACTION_DIGITS = 18;
        ulong fraction = 0;
        ulong divider = 1;
        int digits = 0;
        for (;;) {
            dchar ch = skipChar(); // first iteration skips the '.'
            if (!isNum(ch))
                break;
            if (digits < MAX_FRACTION_DIGITS) {
                fraction = fraction * 10 + (ch - '0');
                divider *= 10;
                digits++;
            }
        }
        _token.floatvalue = cast(double)n + cast(double)fraction / cast(double)divider;
        string suffix;
        if (!parseNumberSuffix(suffix))
            return parseError();
        _token.text = suffix;
        return _token;
    }

    /// Parses a hexadecimal integer literal.
    /// Params:
    ///     prefixLen = number of prefix characters to skip (2 for "0x", 1 for "#")
    protected ref const(Token) parseHex(int prefixLen) {
        dchar ch = 0;
        foreach(i; 0 .. prefixLen)
            ch = skipChar();

        // at least one hex digit is required after the prefix
        uint n = parseHexDigit(ch);
        if (n == uint.max)
            return parseError();

        for(;;) {
            ch = skipChar();
            uint digit = parseHexDigit(ch);
            if (digit == uint.max)
                break;
            n = (n << 4) + digit;
        }
        string suffix;
        if (!parseNumberSuffix(suffix))
            return parseError();
        _token.type = TokenType.integer;
        _token.intvalue = n;
        _token.text = suffix;
        return _token;
    }

    /// Parses a decimal literal; switches to parseFloating() when a '.' follows the digits.
    protected ref const(Token) parseNumber() {
        dchar ch = peekChar();
        uint n = ch - '0';
        for(;;) {
            ch = skipChar();
            if (!isNum(ch))
                break;
            n = n * 10 + (ch - '0');
        }
        if (ch == '.')
            return parseFloating(n);
        string suffix;
        if (!parseNumberSuffix(suffix))
            return parseError();
        _token.type = TokenType.integer;
        _token.intvalue = n;
        _token.text = suffix;
        return _token;
    }

    /// Consumes everything up to (not including) the end of the line as a comment.
    protected ref const(Token) parseSingleLineComment() {
        for(;;) {
            dchar ch = skipChar();
            if (ch == EOL_CHAR || ch == EOF_CHAR)
                break;
        }
        _token.type = TokenType.comment;
        _token.multiline = false;
        return _token;
    }

    /// Consumes a `/* ... */` comment, which may span several lines.
    /// An unterminated comment ends at the end of the input.
    protected ref const(Token) parseMultiLineComment() {
        skipChar(); // skip '/', the '*' is skipped by the first loop iteration
        for(;;) {
            dchar ch = skipChar();
            if (ch == '*' && peekNextChar() == '/') {
                skipChar();
                skipChar();
                break;
            }
            if (ch == EOF_CHAR)
                break;
        }
        _token.type = TokenType.comment;
        _token.multiline = true;
        return _token;
    }

    /// Produces an error token, consuming characters up to the next space, tab,
    /// end of line or end of input (at least one character is consumed).
    protected ref const(Token) parseError() {
        _token.type = TokenType.error;
        for(;;) {
            dchar ch = skipChar();
            if (ch == ' ' || ch == '\t' || ch == EOL_CHAR || ch == EOF_CHAR)
                break;
        }
        return _token;
    }

    /// Produces a single character operator token of the given type.
    protected ref const(Token) parseOp(TokenType op) {
        _token.type = op;
        skipChar();
        return _token;
    }

    /// Get next token.
    ///
    /// The returned reference points to an internal token which is overwritten
    /// by the next call; copy it if it has to be kept.
    ref const(Token) nextToken() {
        setTokenStart();
        dchar ch = peekChar();
        if (ch == EOF_CHAR)
            return parseEof();
        if (ch == EOL_CHAR)
            return parseEol();
        if (ch == ' ' || ch == '\t')
            return parseWhiteSpace();
        if (ch == '\"' || ch == '\'' || ch == '`')
            return parseString();
        if (isAlpha(ch))
            return parseIdent();
        if (ch == '0' && peekNextChar() == 'x')
            return parseHex(2);
        if (ch == '#')
            return parseHex(1);
        if (isNum(ch))
            return parseNumber();
        if (ch == '.' && isNum(peekNextChar()))
            return parseFloating(0);
        foreach(prefix; _singleLineCommentPrefixes) {
            // an empty prefix can never match; indexing it would be a range violation
            if (prefix.length == 0)
                continue;
            // only the first two characters of a prefix are compared
            if (ch == prefix[0] && (prefix.length == 1 || peekNextChar() == prefix[1]))
                return parseSingleLineComment();
        }
        if (ch == '/' && peekNextChar() == '*')
            return parseMultiLineComment();
        switch (ch) {
            case '.': return parseOp(TokenType.dot);
            case ':': return parseOp(TokenType.colon);
            case ';': return parseOp(TokenType.semicolon);
            case ',': return parseOp(TokenType.comma);
            case '-': return parseOp(TokenType.minus);
            case '+': return parseOp(TokenType.plus);
            case '{': return parseOp(TokenType.curlyOpen);
            case '}': return parseOp(TokenType.curlyClose);
            case '(': return parseOp(TokenType.open);
            case ')': return parseOp(TokenType.close);
            case '[': return parseOp(TokenType.squareOpen);
            case ']': return parseOp(TokenType.squareClose);
            case '/': return parseOp(TokenType.divide);
            default:
                return parseError();
        }
    }

    /// Returns a fragment of the current line around the current position, with
    /// `^^^` marking the position, for use in error messages.
    ///
    /// The line is sliced as UTF-32 and converted to UTF-8 afterwards, so the
    /// character based position stays valid for non-ASCII text.
    string getContextSource() {
        immutable size_t len = _lineText.length;
        immutable size_t cursor = _pos < len ? _pos : len;
        if (cursor == 0) {
            // at the line start: show up to 30 chars after the marker
            immutable size_t headEnd = len < 30 ? len : 30;
            return " near `^^^" ~ to!string(_lineText[0 .. headEnd]) ~ "`";
        }
        if (cursor >= len) {
            // at the line end: show up to 30 chars before the marker
            immutable size_t tailStart = len > 30 ? len - 30 : 0;
            return " near `" ~ to!string(_lineText[tailStart .. len]) ~ "^^^`";
        }
        // in the middle: up to 15 chars on each side of the marker
        immutable size_t from = cursor > 15 ? cursor - 15 : 0;
        immutable size_t upTo = cursor + 15 < len ? cursor + 15 : len;
        return " near `" ~ to!string(_lineText[from .. cursor]) ~ "^^^" ~ to!string(_lineText[cursor .. upTo]) ~ "`";
    }

    /// name of the file being tokenized, as passed to the constructor
    @property string filename() {
        return _filename;
    }
    /// line of the current token
    @property int line() {
        return _token.line;
    }
    /// position of the current token inside its line
    @property int pos() {
        return _token.pos;
    }

    /// Throws a ParserException located at the current token, with the
    /// surrounding source fragment appended to the message.
    void emitError(string msg) {
        throw new ParserException(msg ~ getContextSource(), _filename, _token.line, _token.pos);
    }

    /// Throws a ParserException located at the given token.
    void emitError(string msg, ref const Token token) {
        throw new ParserException(msg, _filename, token.line, token.pos);
    }
}
479 
/// Tokenizes `code` into an array of tokens; the terminating EOF token is not included.
/// Params:
///     code = source text
///     _singleLineCommentPrefixes = prefixes starting a single line comment
///     skipSpace = drop whitespace tokens
///     skipEols = drop end of line tokens
///     skipComments = drop comment tokens
public Token[] tokenize(string code, string[] _singleLineCommentPrefixes = ["//"], bool skipSpace = false, bool skipEols = false, bool skipComments = false) {
    auto lexer = new Tokenizer(code, "", _singleLineCommentPrefixes);
    Token[] collected;
    for (;;) {
        auto current = lexer.nextToken();
        if (current.type == TokenType.eof)
            break;
        // a token is dropped when its kind is one of the kinds the caller asked to skip
        immutable bool dropped =
            (current.type == TokenType.whitespace && skipSpace)
            || (current.type == TokenType.eol && skipEols)
            || (current.type == TokenType.comment && skipComments);
        if (!dropped)
            collected ~= current;
    }
    return collected;
}
498 
/// Returns the slice of `tokens` without whitespace tokens at its beginning
/// and/or end; the array itself is not modified.
Token[] trimSpaceTokens(Token[] tokens, bool trimBeginning = true, bool trimEnd = true) {
    size_t first = 0;
    size_t afterLast = tokens.length;
    if (trimBeginning) {
        while (first < afterLast && tokens[first].type == TokenType.whitespace)
            first++;
    }
    if (trimEnd) {
        while (afterLast > first && tokens[afterLast - 1].type == TokenType.whitespace)
            afterLast--;
    }
    return tokens[first .. afterLast];
}