module dlangui.dml.tokenizer;

import dlangui.core.types;
import dlangui.core.linestream;

import std.conv : to;
import std.utf : toUTF8;
import std.algorithm : equal, min, max;

enum TokenType : ushort {
    /// end of file
    eof,
    /// end of line
    eol,
    /// whitespace
    whitespace,
    /// string literal
    str,
    /// integer literal
    integer,
    /// floating point literal
    floating,
    /// comment
    comment,
    /// identifier
    ident,
    /// error
    error,
    // operators
    /// : operator
    colon,
    /// . operator
    dot,
    /// ; operator
    semicolon,
    /// / operator
    divide,
    /// , operator
    comma,
    /// - operator
    minus,
    /// + operator
    plus,
    /// {
    curlyOpen,
    /// }
    curlyClose,
    /// (
    open,
    /// )
    close,
    /// [
    squareOpen,
    /// ]
    squareClose,
}

struct Token {
    TokenType type;
    ushort line;
    ushort pos;
    bool multiline;
    string text;
    union {
        int intvalue;
        double floatvalue;
    }
    public @property string toString() const {
        if (type == TokenType.integer)
            return to!string(line) ~ ":" ~ to!string(pos) ~ " " ~ to!string(type) ~ " " ~ to!string(intvalue);
        else if (type == TokenType.floating)
            return to!string(line) ~ ":" ~ to!string(pos) ~ " " ~ to!string(type) ~ " " ~ to!string(floatvalue);
        else
            return to!string(line) ~ ":" ~ to!string(pos) ~ " " ~ to!string(type) ~ " \"" ~ text ~ "\"";
    }
    @property bool isMultilineComment() {
        return type == TokenType.comment && multiline;
    }
}

class ParserException : Exception {
    protected string _msg;
    protected string _file;
    protected int _line;
    protected int _pos;

    @property string file() { return _file; }
    @property string msg() { return _msg; }
    @property int line() { return _line; }
    @property int pos() { return _pos; }

    this(string msg, string file, int line, int pos) {
        super(msg ~ " at " ~ file ~ " line " ~ to!string(line) ~ " column " ~ to!string(pos));
        _msg = msg;
        _file = file;
        _line = line;
        _pos = pos;
    }
}
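// A minimal illustrative sketch (not part of the original sources): intvalue
// and floatvalue share storage in the union, so read only the member that
// matches the token's type.
unittest {
    Token t;
    t.type = TokenType.integer;
    t.intvalue = 42;
    assert(t.intvalue == 42);
    t.type = TokenType.floating;
    t.floatvalue = 1.5; // overwrites the same storage intvalue used
    assert(t.floatvalue == 1.5);
}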
/// simple tokenizer for DlangUI ML
class Tokenizer {

    protected string[] _singleLineCommentPrefixes = ["//"];
    protected LineStream _lines;
    protected dchar[] _lineText;
    protected ushort _line;
    protected ushort _pos;
    protected int _len;
    protected dchar _prevChar;
    protected string _filename;
    protected Token _token;

    enum : int {
        EOF_CHAR = 0x001A,
        EOL_CHAR = 0x000A
    }

    this(string source, string filename = "", string[] singleLineCommentPrefixes = ["//"]) {
        _singleLineCommentPrefixes = singleLineCommentPrefixes;
        _filename = filename;
        _lines = LineStream.create(source, filename);
        _lineText = _lines.readLine();
        _len = cast(int)_lineText.length;
        _line = 0;
        _pos = 0;
        _prevChar = 0;
    }

    ~this() {
        destroy(_lines);
        _lines = null;
    }

    protected dchar peekChar() {
        if (_pos < _len)
            return _lineText[_pos];
        else if (_lineText is null)
            return EOF_CHAR;
        return EOL_CHAR;
    }

    protected dchar peekNextChar() {
        if (_pos < _len - 1)
            return _lineText[_pos + 1];
        else if (_lineText is null)
            return EOF_CHAR;
        return EOL_CHAR;
    }

    protected dchar nextChar() {
        if (_pos < _len)
            _prevChar = _lineText[_pos++];
        else if (_lineText is null)
            _prevChar = EOF_CHAR;
        else {
            // end of current line: load the next one and report EOL
            _lineText = _lines.readLine();
            _len = cast(int)_lineText.length;
            _line++;
            _pos = 0;
            _prevChar = EOL_CHAR;
        }
        return _prevChar;
    }

    /// consume current char and return the one following it
    protected dchar skipChar() {
        nextChar();
        return peekChar();
    }

    protected void setTokenStart() {
        _token.pos = _pos;
        _token.line = _line;
        _token.text = null;
        _token.intvalue = 0;
    }

    protected ref const(Token) parseEof() {
        _token.type = TokenType.eof;
        return _token;
    }

    protected ref const(Token) parseEol() {
        _token.type = TokenType.eol;
        nextChar();
        return _token;
    }

    protected ref const(Token) parseWhiteSpace() {
        _token.type = TokenType.whitespace;
        for (;;) {
            dchar ch = skipChar();
            if (ch != ' ' && ch != '\t')
                break;
        }
        return _token;
    }

    static bool isAlpha(dchar ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
    }

    static bool isNum(dchar ch) {
        return (ch >= '0' && ch <= '9');
    }

    static bool isAlphaNum(dchar ch) {
        return isNum(ch) || isAlpha(ch);
    }

    private char[] _stringbuf;
    protected ref const(Token) parseString() {
        _token.type = TokenType.str;
        bool lastBackslash = false;
        _stringbuf.length = 0;
        dchar quoteChar = peekChar(); // opening quote: ", ' or `
        for (;;) {
            dchar ch = skipChar();
            if (ch == quoteChar) {
                if (lastBackslash) {
                    _stringbuf ~= ch;
                    lastBackslash = false;
                } else {
                    skipChar();
                    break;
                }
            } else if (ch == '\\') {
                if (lastBackslash) {
                    _stringbuf ~= ch;
                    lastBackslash = false;
                } else {
                    lastBackslash = true;
                }
            } else if (ch == EOL_CHAR) {
                // unterminated string: stop at end of line
                skipChar();
                break;
            } else if (lastBackslash) {
                if (ch == 'n')
                    ch = '\n';
                else if (ch == 't')
                    ch = '\t';
                _stringbuf ~= ch;
                lastBackslash = false;
            } else {
                _stringbuf ~= ch;
                lastBackslash = false;
            }
        }
        _token.text = _stringbuf.dup;
        return _token;
    }

    protected ref const(Token) parseIdent() {
        _token.type = TokenType.ident;
        _stringbuf.length = 0;
        _stringbuf ~= peekChar();
        for (;;) {
            dchar ch = skipChar();
            if (!isAlphaNum(ch))
                break;
            _stringbuf ~= ch;
        }
        _token.text = _stringbuf.dup;
        return _token;
    }

    protected ref const(Token) parseFloating(int n) {
        _token.type = TokenType.floating;
        dchar ch = peekChar();
        // fractional part
        int div = 1;
        int n2 = 0;
        for (;;) {
            ch = skipChar();
            if (!isNum(ch))
                break;
            n2 = n2 * 10 + (ch - '0');
            div *= 10;
        }
        _token.floatvalue = cast(double)n + (div > 0 ? cast(double)n2 / div : 0.0);
        string suffix;
        if (ch == '%') {
            suffix ~= ch;
            ch = skipChar();
        } else {
            while (ch >= 'a' && ch <= 'z') {
                suffix ~= ch;
                ch = skipChar();
            }
        }
        if (isAlphaNum(ch) || ch == '.')
            return parseError();
        _token.text = suffix;
        return _token;
    }

    protected ref const(Token) parseHex(int prefixLen) {
        dchar ch = 0;
        foreach (i; 0 .. prefixLen)
            ch = skipChar();

        uint n = parseHexDigit(ch);
        if (n == uint.max)
            return parseError();

        for (;;) {
            ch = skipChar();
            uint digit = parseHexDigit(ch);
            if (digit == uint.max)
                break;
            n = (n << 4) + digit;
        }
        string suffix;
        if (ch == '%') {
            suffix ~= ch;
            ch = skipChar();
        } else {
            while (ch >= 'a' && ch <= 'z') {
                suffix ~= ch;
                ch = skipChar();
            }
        }
        if (isAlphaNum(ch) || ch == '.')
            return parseError();
        _token.type = TokenType.integer;
        _token.intvalue = n;
        _token.text = suffix;
        return _token;
    }

    protected ref const(Token) parseNumber() {
        dchar ch = peekChar();
        uint n = ch - '0';
        for (;;) {
            ch = skipChar();
            if (!isNum(ch))
                break;
            n = n * 10 + (ch - '0');
        }
        if (ch == '.')
            return parseFloating(n);
        string suffix;
        if (ch == '%') {
            suffix ~= ch;
            ch = skipChar();
        } else {
            while (ch >= 'a' && ch <= 'z') {
                suffix ~= ch;
                ch = skipChar();
            }
        }
        if (isAlphaNum(ch) || ch == '.')
            return parseError();
        _token.type = TokenType.integer;
        _token.intvalue = n;
        _token.text = suffix;
        return _token;
    }

    protected ref const(Token) parseSingleLineComment() {
        for (;;) {
            dchar ch = skipChar();
            if (ch == EOL_CHAR || ch == EOF_CHAR)
                break;
        }
        _token.type = TokenType.comment;
        _token.multiline = false;
        return _token;
    }

    protected ref const(Token) parseMultiLineComment() {
        skipChar();
        for (;;) {
            dchar ch = skipChar();
            if (ch == '*' && peekNextChar() == '/') {
                skipChar();
                skipChar();
                break;
            }
            if (ch == EOF_CHAR)
                break;
        }
        _token.type = TokenType.comment;
        _token.multiline = true;
        return _token;
    }

    protected ref const(Token) parseError() {
        _token.type = TokenType.error;
        for (;;) {
            dchar ch = skipChar();
            if (ch == ' ' || ch == '\t' || ch == EOL_CHAR || ch == EOF_CHAR)
                break;
        }
        return _token;
    }

    protected ref const(Token) parseOp(TokenType op) {
        _token.type = op;
        skipChar();
        return _token;
    }

    /// get next token
    ref const(Token) nextToken() {
        setTokenStart();
        dchar ch = peekChar();
        if (ch == EOF_CHAR)
            return parseEof();
        if (ch == EOL_CHAR)
            return parseEol();
        if (ch == ' ' || ch == '\t')
            return parseWhiteSpace();
        if (ch == '\"' || ch == '\'' || ch == '`')
            return parseString();
        if (isAlpha(ch))
            return parseIdent();
        if (ch == '0' && peekNextChar() == 'x')
            return parseHex(2);
        if (ch == '#')
            return parseHex(1);
        if (isNum(ch))
            return parseNumber();
        if (ch == '.' && isNum(peekNextChar()))
            return parseFloating(0);
        foreach (prefix; _singleLineCommentPrefixes) {
            if (ch == prefix[0] && (prefix.length == 1 || peekNextChar() == prefix[1]))
                return parseSingleLineComment();
        }
        if (ch == '/' && peekNextChar() == '*')
            return parseMultiLineComment();
        switch (ch) {
            case '.': return parseOp(TokenType.dot);
            case ':': return parseOp(TokenType.colon);
            case ';': return parseOp(TokenType.semicolon);
            case ',': return parseOp(TokenType.comma);
            case '-': return parseOp(TokenType.minus);
            case '+': return parseOp(TokenType.plus);
            case '{': return parseOp(TokenType.curlyOpen);
            case '}': return parseOp(TokenType.curlyClose);
            case '(': return parseOp(TokenType.open);
            case ')': return parseOp(TokenType.close);
            case '[': return parseOp(TokenType.squareOpen);
            case ']': return parseOp(TokenType.squareClose);
            case '/': return parseOp(TokenType.divide);
            default:
                return parseError();
        }
    }

    string getContextSource() {
        string s = toUTF8(cast(dstring)_lineText);
        if (_pos == 0)
            return " near `^^^" ~ s[0 .. min($, 30)] ~ "`";
        if (_pos >= _len)
            return " near `" ~ s[max(_len - 30, 0) .. $] ~ "^^^`";
        return " near `" ~ s[max(_pos - 15, 0) .. _pos] ~ "^^^" ~ s[_pos .. min(_pos + 15, $)] ~ "`";
    }

    @property string filename() {
        return _filename;
    }
    @property int line() {
        return _token.line;
    }
    @property int pos() {
        return _token.pos;
    }

    void emitError(string msg) {
        throw new ParserException(msg ~ getContextSource(), _filename, _token.line, _token.pos);
    }

    void emitError(string msg, ref const Token token) {
        throw new ParserException(msg, _filename, token.line, token.pos);
    }
}
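// A minimal sketch of driving the Tokenizer directly (illustrative only):
// nextToken() returns a reference to the tokenizer's internal token, so copy
// the struct if you need to keep it across calls.
unittest {
    auto t = new Tokenizer("width: 100\n", "sample.dml");
    Token[] copied;
    for (;;) {
        auto tok = t.nextToken(); // ref const(Token): valid until the next call
        if (tok.type == TokenType.eof)
            break;
        copied ~= tok;            // struct copy preserves the value
    }
    assert(copied[0].type == TokenType.ident && copied[0].text == "width");
}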
/// tokenize source into array of tokens (excluding EOF)
public Token[] tokenize(string code, string[] _singleLineCommentPrefixes = ["//"], bool skipSpace = false, bool skipEols = false, bool skipComments = false) {
    Token[] res;
    auto tokenizer = new Tokenizer(code, "", _singleLineCommentPrefixes);
    for (;;) {
        auto token = tokenizer.nextToken();
        if (token.type == TokenType.eof)
            break;
        if (skipSpace && token.type == TokenType.whitespace)
            continue;
        if (skipEols && token.type == TokenType.eol)
            continue;
        if (skipComments && token.type == TokenType.comment)
            continue;
        res ~= token;
    }
    return res;
}
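// A small sanity-check sketch for tokenize() (illustrative only), assuming the
// default "//" comment prefix; the skip flags drop whitespace, EOL, and
// comment tokens from the result.
unittest {
    auto tokens = tokenize("margins: 5 // note", ["//"], true, true, true);
    assert(tokens.length == 3);
    assert(tokens[0].type == TokenType.ident && tokens[0].text == "margins");
    assert(tokens[1].type == TokenType.colon);
    assert(tokens[2].type == TokenType.integer && tokens[2].intvalue == 5);
}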
/// exclude whitespace tokens at beginning and end of token sequence
Token[] trimSpaceTokens(Token[] tokens, bool trimBeginning = true, bool trimEnd = true) {
    if (trimBeginning)
        while (tokens.length > 0 && tokens[0].type == TokenType.whitespace)
            tokens = tokens[1 .. $];
    if (trimEnd)
        while (tokens.length > 0 && tokens[$ - 1].type == TokenType.whitespace)
            tokens = tokens[0 .. $ - 1];
    return tokens;
}
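// Illustrative sketch: trimSpaceTokens only strips leading and trailing
// whitespace tokens, leaving interior ones intact. EOL tokens are skipped
// here via tokenize()'s skipEols flag, since they are not whitespace tokens.
unittest {
    auto tokens = trimSpaceTokens(tokenize(" a b ", ["//"], false, true));
    assert(tokens.length == 3); // ident, whitespace, ident
    assert(tokens[0].type == TokenType.ident);
    assert(tokens[1].type == TokenType.whitespace);
    assert(tokens[$ - 1].type == TokenType.ident);
}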