module dlangui.dml.tokenizer;

import dlangui.core.types;
import dlangui.core.linestream;

import std.conv : to;
import std.utf : toUTF8, toUTF32;
import std.algorithm : equal, min, max;

enum TokenType : ushort {
    /// end of file
    eof,
    /// end of line
    eol,
    /// whitespace
    whitespace,
    /// string literal
    str,
    /// integer literal
    integer,
    /// floating point literal
    floating,
    /// comment
    comment,
    /// ident
    ident,
    /// error
    error,
    // operators
    /// : operator
    colon,
    /// . operator
    dot,
    /// ; operator
    semicolon,
    /// / operator
    divide,
    /// , operator
    comma,
    /// - operator
    minus,
    /// + operator
    plus,
    /// {
    curlyOpen,
    /// }
    curlyClose,
    /// (
    open,
    /// )
    close,
    /// [
    squareOpen,
    /// ]
    squareClose,
}

struct Token {
    TokenType type;
    ushort line;
    ushort pos;
    bool multiline;
    string text;
    union {
        int intvalue;
        double floatvalue;
    }
    public @property string toString() const {
        if (type == TokenType.integer)
            return to!string(line) ~ ":" ~ to!string(pos) ~ " " ~ to!string(type) ~ " " ~ to!string(intvalue);
        else if (type == TokenType.floating)
            return to!string(line) ~ ":" ~ to!string(pos) ~ " " ~ to!string(type) ~ " " ~ to!string(floatvalue);
        else
            return to!string(line) ~ ":" ~ to!string(pos) ~ " " ~ to!string(type) ~ " \"" ~ text ~ "\"";
    }
    @property bool isMultilineComment() {
        return type == TokenType.comment && multiline;
    }
}

class ParserException : Exception {
    protected string _msg;
    protected string _file;
    protected int _line;
    protected int _pos;

    @property string file() { return _file; }
    @property string msg() { return _msg; }
    @property int line() { return _line; }
    @property int pos() { return _pos; }

    this(string msg, string file, int line, int pos) {
        super(msg ~ " at " ~ file ~ " line " ~ to!string(line) ~ " column " ~ to!string(pos));
        _msg = msg;
        _file = file;
        _line = line;
        _pos = pos;
    }
}

/// simple tokenizer for DlangUI ML
class Tokenizer {

    protected string[] _singleLineCommentPrefixes = ["//"];
    protected LineStream _lines;
    protected dchar[] _lineText;
    protected ushort _line;
    protected ushort _pos;
    protected int _len;
    protected dchar _prevChar;
    protected string _filename;
    protected Token _token;

    enum : int {
        EOF_CHAR = 0x001A,
        EOL_CHAR = 0x000A
    }

    this(string source, string filename = "", string[] singleLineCommentPrefixes = ["//"]) {
        _singleLineCommentPrefixes = singleLineCommentPrefixes;
        _filename = filename;
        _lines = LineStream.create(source, filename);
        _lineText = _lines.readLine();
        _len = cast(int)_lineText.length;
        _line = 0;
        _pos = 0;
        _prevChar = 0;
    }

    ~this() {
        destroy(_lines);
        _lines = null;
    }

    protected dchar peekChar() {
        if (_pos < _len)
            return _lineText[_pos];
        else if (_lineText is null)
            return EOF_CHAR;
        return EOL_CHAR;
    }

    protected dchar peekNextChar() {
        if (_pos < _len - 1)
            return _lineText[_pos + 1];
        else if (_lineText is null)
            return EOF_CHAR;
        return EOL_CHAR;
    }

    protected dchar nextChar() {
        if (_pos < _len)
            _prevChar = _lineText[_pos++];
        else if (_lineText is null)
            _prevChar = EOF_CHAR;
        else {
            _lineText = _lines.readLine();
            _len = cast(int)_lineText.length;
            _line++;
            _pos = 0;
            _prevChar = EOL_CHAR;
        }
        return _prevChar;
    }

    protected dchar skipChar() {
        nextChar();
        return peekChar();
    }

    protected void setTokenStart() {
        _token.pos = _pos;
        _token.line = _line;
        _token.text = null;
        _token.intvalue = 0;
    }

    protected ref const(Token) parseEof() {
        _token.type = TokenType.eof;
        return _token;
    }

    protected ref const(Token) parseEol() {
        _token.type = TokenType.eol;
        nextChar();
        return _token;
    }

    protected ref const(Token) parseWhiteSpace() {
        _token.type = TokenType.whitespace;
        for (;;) {
            dchar ch = skipChar();
            if (ch != ' ' && ch != '\t')
                break;
        }
        return _token;
    }

    static bool isAlpha(dchar ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
    }

    static bool isNum(dchar ch) {
        return (ch >= '0' && ch <= '9');
    }

    static bool isAlphaNum(dchar ch) {
        return isNum(ch) || isAlpha(ch);
    }

    private char[] _stringbuf;
    protected ref const(Token) parseString() {
        _token.type = TokenType.str;
        // current char is the opening quote; it is consumed by the first skipChar() below
        bool lastBackslash = false;
        _stringbuf.length = 0;
        dchar quoteChar = peekChar();
        for (;;) {
            dchar ch = skipChar();
            if (ch == quoteChar) {
                if (lastBackslash) {
                    // escaped quote - part of the string
                    _stringbuf ~= ch;
                    lastBackslash = false;
                } else {
                    // closing quote
                    skipChar();
                    break;
                }
            } else if (ch == '\\') {
                if (lastBackslash) {
                    // escaped backslash
                    _stringbuf ~= ch;
                    lastBackslash = false;
                } else {
                    lastBackslash = true;
                }
            } else if (ch == EOL_CHAR) {
                // unterminated string - stop at end of line
                skipChar();
                break;
            } else if (lastBackslash) {
                if (ch == 'n')
                    ch = '\n';
                else if (ch == 't')
                    ch = '\t';
                _stringbuf ~= ch;
                lastBackslash = false;
            } else {
                _stringbuf ~= ch;
                lastBackslash = false;
            }
        }
        _token.text = _stringbuf.dup;
        return _token;
    }
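
    // Illustrative check of the escape handling in parseString ('\n', '\t',
    // escaped quote), driven through the tokenize() helper defined at module
    // scope below; this unittest is an added example, not original code.
    unittest {
        auto ts = tokenize(`text: "line1\nline2"`, ["//"], true);
        assert(ts[2].type == TokenType.str);
        assert(ts[2].text == "line1\nline2");
        assert(tokenize(`"a\"b"`)[0].text == `a"b`);
    }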

    protected ref const(Token) parseIdent() {
        _token.type = TokenType.ident;
        _stringbuf.length = 0;
        _stringbuf ~= peekChar();
        for (;;) {
            dchar ch = skipChar();
            if (!isAlphaNum(ch))
                break;
            _stringbuf ~= ch;
        }
        _token.text = _stringbuf.dup;
        return _token;
    }

    protected ref const(Token) parseFloating(int n) {
        _token.type = TokenType.floating;
        dchar ch = peekChar();
        // floating point
        int div = 1;
        int n2 = 0;
        for (;;) {
            ch = skipChar();
            if (!isNum(ch))
                break;
            n2 = n2 * 10 + (ch - '0');
            div *= 10;
        }
        _token.floatvalue = cast(double)n + (div > 0 ? cast(double)n2 / div : 0.0);
        string suffix;
        if (ch == '%') {
            suffix ~= ch;
            ch = skipChar();
        } else {
            while (ch >= 'a' && ch <= 'z') {
                suffix ~= ch;
                ch = skipChar();
            }
        }
        if (isAlphaNum(ch) || ch == '.')
            return parseError();
        _token.text = suffix;
        return _token;
    }

    /// parse single hex digit; returns uint.max if the character is not a hex digit
    protected uint parseHexDigit(dchar ch) {
        if (ch >= '0' && ch <= '9')
            return ch - '0';
        if (ch >= 'a' && ch <= 'f')
            return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F')
            return ch - 'A' + 10;
        return uint.max;
    }

    protected ref const(Token) parseHex(int prefixLen) {
        dchar ch = 0;
        foreach (i; 0 .. prefixLen)
            ch = skipChar();

        uint n = parseHexDigit(ch);
        if (n == uint.max)
            return parseError();

        for (;;) {
            ch = skipChar();
            uint digit = parseHexDigit(ch);
            if (digit == uint.max)
                break;
            n = (n << 4) + digit;
        }
        string suffix;
        if (ch == '%') {
            suffix ~= ch;
            ch = skipChar();
        } else {
            while (ch >= 'a' && ch <= 'z') {
                suffix ~= ch;
                ch = skipChar();
            }
        }
        if (isAlphaNum(ch) || ch == '.')
            return parseError();
        _token.type = TokenType.integer;
        _token.intvalue = n;
        _token.text = suffix;
        return _token;
    }

    protected ref const(Token) parseNumber() {
        dchar ch = peekChar();
        uint n = ch - '0';
        for (;;) {
            ch = skipChar();
            if (!isNum(ch))
                break;
            n = n * 10 + (ch - '0');
        }
        if (ch == '.')
            return parseFloating(n);
        string suffix;
        if (ch == '%') {
            suffix ~= ch;
            ch = skipChar();
        } else {
            while (ch >= 'a' && ch <= 'z') {
                suffix ~= ch;
                ch = skipChar();
            }
        }
        if (isAlphaNum(ch) || ch == '.')
            return parseError();
        _token.type = TokenType.integer;
        _token.intvalue = n;
        _token.text = suffix;
        return _token;
    }
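
    // Illustrative checks for the numeric literal parsers above (integer with
    // suffix, percent, floating point and hex), driven through the tokenize()
    // helper defined at module scope below; added as an example, not original code.
    unittest {
        auto ts = tokenize("width: 120px; opacity: 0.75", ["//"], true, true);
        assert(ts[2].type == TokenType.integer && ts[2].intvalue == 120 && ts[2].text == "px");
        assert(ts[6].type == TokenType.floating && ts[6].floatvalue == 0.75);
        auto pc = tokenize("50%", ["//"], true, true);
        assert(pc[0].type == TokenType.integer && pc[0].intvalue == 50 && pc[0].text == "%");
        auto hex = tokenize("#ff9900 0x1A", ["//"], true, true);
        assert(hex[0].intvalue == 0xff9900 && hex[1].intvalue == 0x1A);
    }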

    protected ref const(Token) parseSingleLineComment() {
        for (;;) {
            dchar ch = skipChar();
            if (ch == EOL_CHAR || ch == EOF_CHAR)
                break;
        }
        _token.type = TokenType.comment;
        _token.multiline = false;
        return _token;
    }

    protected ref const(Token) parseMultiLineComment() {
        skipChar();
        for (;;) {
            dchar ch = skipChar();
            if (ch == '*' && peekNextChar() == '/') {
                skipChar();
                skipChar();
                break;
            }
            if (ch == EOF_CHAR)
                break;
        }
        _token.type = TokenType.comment;
        _token.multiline = true;
        return _token;
    }

    protected ref const(Token) parseError() {
        _token.type = TokenType.error;
        for (;;) {
            dchar ch = skipChar();
            if (ch == ' ' || ch == '\t' || ch == EOL_CHAR || ch == EOF_CHAR)
                break;
        }
        return _token;
    }

    protected ref const(Token) parseOp(TokenType op) {
        _token.type = op;
        skipChar();
        return _token;
    }

    /// get next token
    ref const(Token) nextToken() {
        setTokenStart();
        dchar ch = peekChar();
        if (ch == EOF_CHAR)
            return parseEof();
        if (ch == EOL_CHAR)
            return parseEol();
        if (ch == ' ' || ch == '\t')
            return parseWhiteSpace();
        if (ch == '\"' || ch == '\'' || ch == '`')
            return parseString();
        if (isAlpha(ch))
            return parseIdent();
        if (ch == '0' && peekNextChar() == 'x')
            return parseHex(2);
        if (ch == '#')
            return parseHex(1);
        if (isNum(ch))
            return parseNumber();
        if (ch == '.' && isNum(peekNextChar()))
            return parseFloating(0);
        foreach (prefix; _singleLineCommentPrefixes) {
            if (ch == prefix[0] && (prefix.length == 1 || peekNextChar() == prefix[1]))
                return parseSingleLineComment();
        }
        if (ch == '/' && peekNextChar() == '*')
            return parseMultiLineComment();
        switch (ch) {
            case '.': return parseOp(TokenType.dot);
            case ':': return parseOp(TokenType.colon);
            case ';': return parseOp(TokenType.semicolon);
            case ',': return parseOp(TokenType.comma);
            case '-': return parseOp(TokenType.minus);
            case '+': return parseOp(TokenType.plus);
            case '{': return parseOp(TokenType.curlyOpen);
            case '}': return parseOp(TokenType.curlyClose);
            case '(': return parseOp(TokenType.open);
            case ')': return parseOp(TokenType.close);
            case '[': return parseOp(TokenType.squareOpen);
            case ']': return parseOp(TokenType.squareClose);
            case '/': return parseOp(TokenType.divide);
            default:
                return parseError();
        }
    }

    string getContextSource() {
        // slice the dchar line first, then convert to UTF-8, so that _pos
        // (a character index) is never used as a byte index into a UTF-8 string
        dstring s = cast(dstring)_lineText;
        if (_pos == 0)
            return " near `^^^" ~ toUTF8(s[0 .. min($, 30)]) ~ "`";
        if (_pos >= _len)
            return " near `" ~ toUTF8(s[max(_len - 30, 0) .. $]) ~ "^^^`";
        return " near `" ~ toUTF8(s[max(_pos - 15, 0) .. _pos]) ~ "^^^" ~ toUTF8(s[_pos .. min(_pos + 15, $)]) ~ "`";
    }

    @property string filename() {
        return _filename;
    }
    @property int line() {
        return _token.line;
    }
    @property int pos() {
        return _token.pos;
    }

    void emitError(string msg) {
        throw new ParserException(msg ~ getContextSource(), _filename, _token.line, _token.pos);
    }

    void emitError(string msg, ref const Token token) {
        throw new ParserException(msg, _filename, token.line, token.pos);
    }
}
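
// Minimal usage sketch for Tokenizer (an added example, not original code):
// pull tokens until eof; the "sample.dml" filename is illustrative.
unittest {
    auto t = new Tokenizer(`Button { text: "OK" }`, "sample.dml");
    int idents = 0;
    Token tok = t.nextToken(); // nextToken returns ref const(Token); take a copy
    while (tok.type != TokenType.eof) {
        if (tok.type == TokenType.ident)
            idents++;
        tok = t.nextToken();
    }
    assert(idents == 2); // "Button" and "text"
}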

/// tokenize source into array of tokens (excluding EOF)
public Token[] tokenize(string code, string[] _singleLineCommentPrefixes = ["//"], bool skipSpace = false, bool skipEols = false, bool skipComments = false) {
    Token[] res;
    auto tokenizer = new Tokenizer(code, "", _singleLineCommentPrefixes);
    for (;;) {
        auto token = tokenizer.nextToken();
        if (token.type == TokenType.eof)
            break;
        if (skipSpace && token.type == TokenType.whitespace)
            continue;
        if (skipEols && token.type == TokenType.eol)
            continue;
        if (skipComments && token.type == TokenType.comment)
            continue;
        res ~= token;
    }
    return res;
}
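
// Example for the skip flags (whitespace, EOLs, comments): only the meaningful
// tokens remain. An added illustration, not original code.
unittest {
    auto ts = tokenize("margins: 5 // trailing comment", ["//"], true, true, true);
    assert(ts.length == 3);
    assert(ts[0].type == TokenType.ident && ts[0].text == "margins");
    assert(ts[1].type == TokenType.colon);
    assert(ts[2].type == TokenType.integer && ts[2].intvalue == 5);
}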

/// exclude whitespace tokens at beginning and end of token sequence
Token[] trimSpaceTokens(Token[] tokens, bool trimBeginning = true, bool trimEnd = true) {
    if (trimBeginning)
        while (tokens.length > 0 && tokens[0].type == TokenType.whitespace)
            tokens = tokens[1 .. $];
    if (trimEnd)
        while (tokens.length > 0 && tokens[$ - 1].type == TokenType.whitespace)
            tokens = tokens[0 .. $ - 1];
    return tokens;
}
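
// Sketch of trimSpaceTokens: only leading and trailing whitespace tokens are
// removed, inner whitespace is kept. EOL tokens are skipped in the tokenize()
// call here so the trailing token is whitespace rather than eol. Added example.
unittest {
    auto ts = trimSpaceTokens(tokenize("  a b  ", ["//"], false, true));
    assert(ts.length == 3);
    assert(ts[0].text == "a" && ts[1].type == TokenType.whitespace && ts[2].text == "b");
}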