module dom.cssparser;

/**
    Normalizes raw CSS text before it is handed to the tokenizer.

    Substitutions performed (CSS Syntax, "preprocessing the input stream"):
    * U+000D CARRIAGE RETURN, U+000C FORM FEED and the pair CR LF are each replaced
      by a single U+000A LINE FEED.
    * U+0000 NULL is replaced by U+FFFD REPLACEMENT CHARACTER (UTF-8 bytes EF BF BD).

    Params:
        src = raw UTF-8 CSS source
    Returns: newly allocated, normalized copy of `src`
*/
char[] preProcessCSS(char[] src) {
    char[] res;
    res.reserve(src.length);
    bool last0D = false; // previous input char was CR, so a following LF is already emitted
    foreach (ch; src) {
        if (ch == 0) {
            res ~= "\uFFFD";
        } else if (ch == 0x0D || ch == 0x0C) {
            res ~= '\n';
        } else if (ch == 0x0A) {
            if (!last0D)
                res ~= '\n';
        } else {
            res ~= ch;
        }
        last0D = (ch == 0x0D);
    }
    return res;
}

/// Location and payload of a single `@import` rule found by extractCSSImportRules().
struct CSSImportRule {
    /// start position - byte offset of @import
    size_t startPos;
    /// end position - byte offset of next char after closing ';'
    size_t endPos;
    /// url of CSS to import
    string url;
    /// content of downloaded URL to apply in place of rule
    string content;
}

/// Token kinds produced by CSSTokenizer.
enum CSSTokenType : ubyte {
    eof, // end of file
    delim, // delimiter (may be unknown token or error)
    comment, /* some comment */
    //newline, // any of \n \r\n \r \f
    whitespace, // space, \t, newline
    ident, // identifier
    url, // url()
    badUrl, // url() which is bad
    func, // function(
    str, // string '' or ""
    badStr, // string '' or "" ended with newline character
    hashToken, // #
    prefixMatch, // ^=
    suffixMatch, // $=
    substringMatch, // *=
    includeMatch, // ~=
    dashMatch, // |=
    column, // ||
    parentOpen, // (
    parentClose, // )
    squareOpen, // [
    squareClose, // ]
    curlyOpen, // {
    curlyClose, // }
    comma, // ,
    colon, // :
    semicolon, // ;
    number, // +12345.324e-3
    dimension, // 1.23px -- number with dimension
    cdo, // <!--
    cdc, // -->
    atKeyword, // @someKeyword -- tokenText will contain keyword w/o @ prefix
    unicodeRange, // U+XXX-XXX
}

/// Self-contained token value, as returned by CSSTokenizer.nextToken().
struct CSSToken {
    CSSTokenType type;
    string text;
    string dimensionUnit;
    union {
        bool typeFlagId; // true if identifier is valid ID
        struct {
            long intValue = 0; // for number and dimension
            double doubleValue = 0; // for number and dimension
            bool typeFlagInteger; // for number and dimension - true if number is integer, false if double
        }
        struct {
            uint unicodeRangeStart; // for unicodeRange (initialized to 0 via intValue=0)
            uint unicodeRangeEnd; // for unicodeRange (initialized to 0 via intValue=0)
        }
    }
}

/// Returns the value (0..15) of hexadecimal digit `ch`, or -1 if `ch` is not a hex digit.
int decodeHexDigit(char ch) {
    if (ch >= 'a' && ch <= 'f')
        return (ch - 'a') + 10;
    if (ch >= 'A' && ch <= 'F')
        return (ch - 'A') + 10;
    if (ch >= '0' && ch <= '9')
        return (ch - '0');
    return -1;
}

/// Returns true for space, tab, form feed, carriage return and line feed.
bool isCSSWhiteSpaceChar(char ch) {
    return ch == ' ' || ch == '\t' || ch == 0x0C || ch == 0x0D || ch == 0x0A;
}

/// Returns true if the byte is an ASCII letter, an underscore, or part of a non-ASCII (high-bit) sequence.
bool isCSSNameStart(char ch) {
    return ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch & 0x80) > 0 || ch == '_');
}

/// Returns true for the control characters 0x00..0x08, 0x0B, 0x0E..0x1F and 0x7F.
bool isCSSNonPrintable(char ch) {
    if (ch <= 8)
        return true;
    if (ch == 0x0B || ch == 0x7F)
        return true;
    if (ch >= 0x0E && ch <= 0x1F)
        return true;
    return false;
}

/**
    Checks whether two consecutive code points form a valid escape:
    the first must be U+005C REVERSE SOLIDUS (\) and the second must not be a newline.
    Form feed counts as a newline here so the check agrees with CSSTokenizer's escape parser.
*/
bool isCSSValidEscSequence(char ch, char ch2) {
    if (ch != '\\')
        return false;
    if (ch2 == '\r' || ch2 == '\n' || ch2 == 0x0C)
        return false;
    return true;
}

/**
    Pull-style CSS tokenizer.

    Call start() with the source, then call next() (or nextToken()) repeatedly until
    CSSTokenType.eof is returned. After each call the public fields describe the current token.
*/
struct CSSTokenizer {
    /// CSS source code (utf-8)
    char[] src;
    /// current token type
    CSSTokenType tokenType;
    /// current token start byte offset
    size_t tokenStart;
    /// current token end byte offset
    size_t tokenEnd;
    /// text of the current token (content depends on the token type)
    char[] tokenText;
    /// unit of the current dimension token
    char[] dimensionUnit;
    bool tokenTypeFlagId; // true if identifier is valid ID
    bool tokenTypeInteger; // for number and dimension - true if number is integer, false if double
    long tokenIntValue; // for number and dimension
    double tokenDoubleValue; // for number and dimension
    uint unicodeRangeStart = 0; // for unicodeRange
    uint unicodeRangeEnd = 0; // for unicodeRange

    /// sentinel returned by parseEscape() when there is no valid escape at the given position
    private enum dchar invalidEscape = cast(dchar)0xFFFFFFFF;

    /// Resets the tokenizer and starts tokenizing a private copy of `_src`.
    void start(string _src) {
        src = _src.dup;
        tokenStart = tokenEnd = 0;
        tokenType = CSSTokenType.eof;
        tokenTypeFlagId = false;
        tokenTypeInteger = false;
        tokenIntValue = 0;
        tokenDoubleValue = 0;
        unicodeRangeStart = unicodeRangeEnd = 0;
        // start with empty buffers; only capacity is preallocated
        tokenText = null;
        tokenText.reserve(1000);
        dimensionUnit = null;
        dimensionUnit.reserve(1000);
    }

    /// Returns true when the whole source has been consumed.
    bool eof() {
        return tokenEnd >= src.length;
    }

    /// Returns the byte at offset `i`, or '\0' when `i` is past the end of the source.
    private char peek(size_t i) {
        return i < src.length ? src[i] : '\0';
    }

    private static bool isDigit(char ch) {
        return ch >= '0' && ch <= '9';
    }

    /**
        Skips a run of whitespace starting at tokenEnd and advances tokenEnd past it.
        U+FFFD (the replacement emitted for NUL by preProcessCSS) is skipped as well.
        Sets tokenType to whitespace as a side effect.
        Returns: true if at least one character was skipped
    */
    bool skipWhiteSpace() {
        bool skipped = false;
        tokenType = CSSTokenType.whitespace;
        while (tokenEnd < src.length) {
            char ch = src[tokenEnd];
            if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == 0x0C) {
                tokenEnd++;
            } else if (ch == 0xEF && tokenEnd + 2 < src.length && src[tokenEnd + 1] == 0xBF && src[tokenEnd + 2] == 0xBD) {
                // U+FFFD 1110xxxx 10xxxxxx 10xxxxxx == EF BF BD : skip the whole 3-byte sequence
                tokenEnd += 3;
            } else {
                break;
            }
            skipped = true;
        }
        return skipped;
    }

    /**
        Decodes the escape sequence whose backslash is at offset `p`.
        On success `p` is moved past the escape (including one optional trailing whitespace
        after a hex escape) and the decoded code point is returned.
        Returns invalidEscape, leaving `p` untouched, if the backslash is followed by EOF or a newline.
    */
    private dchar parseEscape(ref size_t p) {
        size_t pos = p + 1;
        if (pos >= src.length)
            return invalidEscape; // out of bounds
        char ch = src[pos];
        if (ch == '\r' || ch == '\n' || ch == 0x0C)
            return invalidEscape; // unexpected newline: invalid esc sequence
        int hex = decodeHexDigit(ch);
        if (hex < 0) {
            // not a hex: one character is escaped
            if (ch & 0x80) {
                // escaped non-ASCII character: decode the whole UTF-8 sequence, not just its lead byte
                import std.utf : decode, UTFException;
                size_t idx = pos;
                try {
                    dchar decoded = decode(src, idx);
                    p = idx;
                    return decoded;
                } catch (UTFException e) {
                    p = pos + 1; // skip the malformed byte
                    return cast(dchar)0xFFFD;
                }
            }
            p = pos + 1;
            return ch;
        }
        pos++;
        uint code = cast(uint)hex;
        // up to 6 hex digits in total
        for (int count = 1; count < 6 && pos < src.length; count++) {
            hex = decodeHexDigit(src[pos]);
            if (hex < 0)
                break;
            code = (code << 4) | cast(uint)hex;
            pos++;
        }
        // a single whitespace terminating the hex escape belongs to the escape
        if (pos < src.length && isCSSWhiteSpaceChar(src[pos]))
            pos++;
        p = pos;
        // zero, surrogates and out-of-range values are replaced by U+FFFD
        if (code == 0 || code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF))
            return cast(dchar)0xFFFD;
        return cast(dchar)code;
    }

    /// Appends code point `ch` to `buf` as UTF-8; an invalid code point is replaced by '?'.
    private static void appendCodePoint(ref char[] buf, dchar ch) {
        if (ch < 0x80) {
            // put as is
            buf ~= cast(char)ch;
            return;
        }
        import std.utf : encode, isValidDchar;
        if (isValidDchar(ch)) {
            char[4] tmp;
            size_t chars = encode(tmp, ch);
            buf ~= tmp[0 .. chars];
        } else {
            buf ~= '?'; // replacement for invalid character
        }
    }

    /// Appends code point `ch` to tokenText as UTF-8.
    private void appendEscapedIdentChar(dchar ch) {
        appendCodePoint(tokenText, ch);
    }

    /// Consumes name code points (letters, digits, '_', '-', non-ASCII, escapes) from offset `p` into `buf`.
    private void consumeNameChars(ref char[] buf, ref size_t p) {
        while (p < src.length) {
            char ch = src[p];
            if (isCSSNameStart(ch) || isDigit(ch) || ch == '-') {
                buf ~= ch;
                p++;
            } else if (ch == '\\') {
                dchar esc = parseEscape(p);
                if (esc == invalidEscape)
                    break; // invalid esc
                appendCodePoint(buf, esc);
            } else {
                break;
            }
        }
    }

    /**
        Consumes an identifier at tokenEnd and appends it to `tokenText`
        (the caller's buffer, which may be the dimensionUnit field).
        Returns: true and advances tokenEnd on success; false, with nothing consumed, otherwise
    */
    bool consumeIdent(ref char[] tokenText) {
        size_t p = tokenEnd;
        if (p >= src.length)
            return false;
        char ch = src[p];
        bool hasHyphen = false;
        if (ch == '-') {
            p++;
            if (p >= src.length)
                return false; // eof
            hasHyphen = true;
            ch = src[p];
        }
        if (isCSSNameStart(ch)) {
            if (hasHyphen)
                tokenText ~= '-';
            tokenText ~= ch;
            p++;
        } else if (ch == '\\') {
            dchar esc = parseEscape(p);
            if (esc == invalidEscape)
                return false; // invalid esc
            if (hasHyphen)
                tokenText ~= '-'; // keep the leading hyphen in front of an escaped first character
            appendCodePoint(tokenText, esc);
        } else {
            return false;
        }
        consumeNameChars(tokenText, p);
        tokenEnd = p;
        return true;
    }

    /**
        Parse identifier.
        Returns true if identifier is parsed. tokenText will contain identifier text.
    */
    bool parseIdent() {
        if (!isIdentStart(tokenEnd))
            return false;
        if (consumeIdent(tokenText)) {
            tokenType = CSSTokenType.ident;
            return true;
        }
        return false;
    }

    /** returns true if an identifier starts at byte offset p */
    bool isIdentStart(size_t p) {
        if (p >= src.length)
            return false;
        char ch = src[p];
        if (isCSSNameStart(ch))
            return true;
        if (ch == '-') {
            // If the second code point is a name-start code point or the second and third
            // code points are a valid escape, return true. Otherwise, return false.
            p++;
            if (p >= src.length)
                return false;
            ch = src[p];
            if (isCSSNameStart(ch))
                return true;
        }
        if (ch == '\\') {
            p++;
            if (p >= src.length)
                return false;
            return isCSSValidEscSequence(ch, src[p]);
        }
        return false;
    }

    /**
        Parse number or dimension at tokenEnd.
        Returns true if a number is parsed: tokenText holds its source text, tokenIntValue /
        tokenDoubleValue / tokenTypeInteger its value, and dimensionUnit the unit (for dimension tokens).
        Returns false, consuming nothing, if there is no number at tokenEnd.
        Note: digit runs too long for ulong overflow silently.
    */
    bool parseNumber() {
        tokenTypeInteger = true;
        tokenIntValue = 0;
        tokenDoubleValue = 0;
        size_t p = tokenEnd;
        int numberSign = 1;
        char ch = peek(p);
        if (ch == '+' || ch == '-') {
            if (ch == '-')
                numberSign = -1;
            ch = peek(++p);
        }
        // digits before point
        ulong intValue = 0;
        int beforePointDigits = 0;
        while (isDigit(ch)) {
            intValue = intValue * 10 + (ch - '0');
            beforePointDigits++;
            ch = peek(++p);
        }
        // the point belongs to the number only when a digit follows it
        ulong afterPointValue = 0;
        int afterPointDigits = 0;
        if (ch == '.' && isDigit(peek(p + 1))) {
            ch = peek(++p);
            while (isDigit(ch)) {
                afterPointValue = afterPointValue * 10 + (ch - '0');
                afterPointDigits++;
                ch = peek(++p);
            }
        }
        if (!beforePointDigits && !afterPointDigits)
            return false; // not a number
        // optional exponent: e or E, optional sign, at least one digit
        ulong exponentValue = 0;
        int exponentSign = 1;
        int exponentDigits = 0;
        if (ch == 'e' || ch == 'E') {
            size_t q = p + 1;
            char c = peek(q);
            int sign = 1;
            if (c == '+' || c == '-') {
                if (c == '-')
                    sign = -1;
                c = peek(++q);
            }
            if (isDigit(c)) {
                exponentSign = sign;
                p = q;
                ch = c;
                while (isDigit(ch)) {
                    exponentValue = exponentValue * 10 + (ch - '0');
                    exponentDigits++;
                    ch = peek(++p);
                }
            }
        }
        // token text is exactly the consumed source slice
        tokenText ~= src[tokenEnd .. p];
        tokenType = CSSTokenType.number;
        tokenEnd = p;
        if (exponentDigits || afterPointDigits) {
            // parsed floating point
            tokenTypeInteger = false;
            tokenDoubleValue = cast(long)intValue;
            if (afterPointDigits) {
                double divider = 1;
                foreach (i; 0 .. afterPointDigits)
                    divider *= 10;
                tokenDoubleValue += afterPointValue / divider;
            }
            if (numberSign < 0)
                tokenDoubleValue = -tokenDoubleValue;
            if (exponentDigits) {
                import std.math : pow;
                double exponent = (cast(long)exponentValue * exponentSign);
                tokenDoubleValue = tokenDoubleValue * pow(10.0, exponent);
            }
            tokenIntValue = cast(long)tokenDoubleValue;
        } else {
            // parsed integer
            tokenIntValue = cast(long)intValue;
            if (numberSign < 0)
                tokenIntValue = -tokenIntValue;
            tokenDoubleValue = tokenIntValue;
        }
        dimensionUnit.length = 0;
        if (isIdentStart(tokenEnd) && consumeIdent(dimensionUnit))
            tokenType = CSSTokenType.dimension;
        return true;
    }

    /**
        Parses a quoted string whose opening quote is at tokenEnd; the unescaped content is appended to tokenText.
        tokenType becomes str, or badStr when an unescaped newline is hit (the newline itself is not consumed).
        Always returns true.
    */
    bool parseString(char quotationChar) {
        tokenType = CSSTokenType.str;
        // skip first delimiter ' or "
        size_t p = tokenEnd + 1;
        for (;;) {
            if (p >= src.length) {
                // unexpected end of file
                tokenEnd = src.length;
                return true;
            }
            char ch = src[p];
            if (ch == '\r' || ch == '\n' || ch == 0x0C) {
                // leave the newline for the next token; the opening quote is always consumed
                tokenType = CSSTokenType.badStr;
                tokenEnd = p;
                return true;
            }
            if (ch == quotationChar) {
                // end of string
                tokenEnd = p + 1;
                return true;
            }
            if (ch == '\\') {
                if (p + 1 >= src.length) {
                    // backslash right before end of file is consumed and ignored
                    tokenEnd = src.length;
                    return true;
                }
                char ch2 = src[p + 1];
                if (ch2 == '\r' || ch2 == '\n' || ch2 == 0x0C) {
                    // \ NEWLINE is a line continuation: skip both (and the LF of a CR LF pair)
                    p += 2;
                    if (ch2 == '\r' && p < src.length && src[p] == '\n')
                        p++;
                    continue;
                }
                dchar esc = parseEscape(p);
                if (esc == invalidEscape) {
                    esc = '?'; // replace invalid code point
                    p++;
                }
                // encode to UTF-8
                appendEscapedIdentChar(esc);
                continue;
            }
            // normal character
            tokenText ~= ch;
            p++;
        }
    }

    /// Emits the single (possibly multi-byte) character at tokenStart as a delim token.
    CSSTokenType emitDelimToken() {
        import std.utf : stride, UTFException;
        size_t len = 1;
        try {
            len = stride(src, tokenStart);
        } catch (UTFException e) {
            len = 1; // malformed lead byte: emit it alone
        }
        tokenEnd = tokenStart + len;
        if (tokenEnd > src.length)
            tokenEnd = src.length; // truncated sequence at end of input
        tokenText.length = 0;
        tokenText ~= src[tokenStart .. tokenEnd];
        tokenType = CSSTokenType.delim;
        return tokenType;
    }

    /// Parses a hash token (# followed by name code points); falls back to a delim token for a lone '#'.
    CSSTokenType parseHashToken() {
        tokenTypeFlagId = false;
        size_t nameStart = tokenEnd + 1;
        // the name is a valid ID only when it would also start an identifier
        bool startsIdent = isIdentStart(nameStart);
        size_t p = nameStart;
        consumeNameChars(tokenText, p);
        if (p > nameStart) {
            tokenEnd = p;
            tokenType = CSSTokenType.hashToken;
            tokenTypeFlagId = startsIdent;
            return tokenType;
        }
        // no name after '#'
        return emitDelimToken();
    }

    /// current chars are /*
    CSSTokenType parseComment() {
        size_t p = tokenEnd + 2; // skip /*
        while (p < src.length) {
            if (src[p] == '*' && peek(p + 1) == '/') {
                p += 2;
                break;
            }
            p++;
        }
        tokenEnd = p;
        tokenType = CSSTokenType.comment;
        return tokenType;
    }

    /// current chars are U+ or u+ followed by hex digit or ?
    CSSTokenType parseUnicodeRangeToken() {
        unicodeRangeStart = 0;
        unicodeRangeEnd = 0;
        size_t p = tokenEnd + 2; // skip U+
        // now we have hex digit or ?
        int hexCount = 0;
        uint hexNumber = 0;
        int questionCount = 0;
        // consume hex digits
        while (p < src.length && hexCount < 6) {
            int digit = decodeHexDigit(src[p]);
            if (digit < 0)
                break;
            hexCount++;
            hexNumber = (hexNumber << 4) | digit;
            p++;
        }
        // consume question marks
        while (p < src.length && questionCount + hexCount < 6 && src[p] == '?') {
            questionCount++;
            p++;
        }
        if (questionCount) {
            // each ? is a wildcard nibble: 0 for range start, F for range end
            int shift = 4 * questionCount;
            unicodeRangeStart = hexNumber << shift;
            unicodeRangeEnd = unicodeRangeStart + ((1 << shift) - 1);
        } else {
            unicodeRangeStart = hexNumber;
            int digit = decodeHexDigit(peek(p + 1));
            if (peek(p) == '-' && digit >= 0) {
                p += 2; // skip - and first digit
                hexCount = 1;
                hexNumber = digit;
                while (p < src.length && hexCount < 6) {
                    digit = decodeHexDigit(src[p]);
                    if (digit < 0)
                        break;
                    hexCount++;
                    hexNumber = (hexNumber << 4) | digit;
                    p++;
                }
                unicodeRangeEnd = hexNumber;
            } else {
                unicodeRangeEnd = unicodeRangeStart;
            }
        }
        tokenEnd = p;
        tokenType = CSSTokenType.unicodeRange;
        return tokenType;
    }

    /// emit single char token like () {} [] : ;
    CSSTokenType emitSingleCharToken(CSSTokenType type) {
        tokenType = type;
        tokenEnd = tokenStart + 1;
        tokenText ~= src[tokenStart];
        return type;
    }

    /// emit double char token like $= *=
    CSSTokenType emitDoubleCharToken(CSSTokenType type) {
        tokenType = type;
        tokenEnd = tokenStart + 2;
        tokenText ~= src[tokenStart .. tokenStart + 2];
        return type;
    }

    /// Skips the remnants of a malformed url( ... up to and including the closing ')' (or to EOF); sets badUrl.
    void consumeBadUrl() {
        while (tokenEnd < src.length) {
            char ch = src[tokenEnd];
            if (ch == ')') {
                tokenEnd++;
                break;
            }
            if (isCSSValidEscSequence(ch, peek(tokenEnd + 1))) {
                // an escaped ')' must not terminate the URL
                size_t before = tokenEnd;
                parseEscape(tokenEnd);
                if (tokenEnd != before)
                    continue; // escape consumed; do not skip the char that follows it
            }
            tokenEnd++;
        }
        tokenType = CSSTokenType.badUrl;
    }

    /// Current position is after url( ; parses the rest into a url or badUrl token, URL text goes to tokenText.
    void parseUrlToken() {
        tokenText.length = 0;
        skipWhiteSpace();
        if (tokenEnd >= src.length) {
            // url( at end of input: empty URL
            tokenType = CSSTokenType.url;
            return;
        }
        char ch = src[tokenEnd];
        if (ch == '\'' || ch == '\"') {
            parseString(ch);
            if (tokenType == CSSTokenType.str) {
                skipWhiteSpace();
                if (tokenEnd >= src.length) {
                    tokenType = CSSTokenType.url;
                    return;
                }
                if (src[tokenEnd] == ')') {
                    // valid URL token
                    tokenEnd++;
                    tokenType = CSSTokenType.url;
                    return;
                }
            }
            // bad string, or garbage between the string and ')'
            consumeBadUrl();
            return;
        }
        // not quoted
        for (;;) {
            bool hadSpace = skipWhiteSpace();
            if (tokenEnd >= src.length) {
                tokenType = CSSTokenType.url;
                return;
            }
            ch = src[tokenEnd];
            if (ch == ')') {
                tokenEnd++;
                tokenType = CSSTokenType.url;
                return;
            }
            // whitespace is only allowed right before ')'
            if (hadSpace || ch == '(' || ch == '\'' || ch == '\"' || isCSSNonPrintable(ch)) {
                consumeBadUrl();
                return;
            }
            if (ch == '\\') {
                size_t before = tokenEnd;
                dchar esc = invalidEscape;
                if (isCSSValidEscSequence(ch, peek(tokenEnd + 1)))
                    esc = parseEscape(tokenEnd);
                if (esc == invalidEscape || tokenEnd == before) {
                    consumeBadUrl();
                    return;
                }
                appendEscapedIdentChar(esc);
                continue;
            }
            tokenText ~= ch;
            tokenEnd++;
        }
    }

    /**
        Advances to the next token and returns its type.
        tokenStart/tokenEnd delimit the token in src; tokenText and the value fields are
        filled according to the token type.
    */
    CSSTokenType next() {
        // move beginning of token
        tokenStart = tokenEnd;
        tokenText.length = 0;
        // check for whitespace
        if (skipWhiteSpace())
            return tokenType; // whitespace or newline token
        // check for eof
        if (tokenEnd >= src.length) {
            tokenType = CSSTokenType.eof;
            return tokenType;
        }
        char ch = src[tokenEnd];
        char nextCh = peek(tokenEnd + 1);
        switch (ch) {
            case '\"':
            case '\'':
                parseString(ch);
                return tokenType;
            case '#':
                return parseHashToken();
            case '$':
                return nextCh == '=' ? emitDoubleCharToken(CSSTokenType.suffixMatch) : emitDelimToken();
            case '^':
                return nextCh == '=' ? emitDoubleCharToken(CSSTokenType.prefixMatch) : emitDelimToken();
            case '(':
                return emitSingleCharToken(CSSTokenType.parentOpen);
            case ')':
                return emitSingleCharToken(CSSTokenType.parentClose);
            case '[':
                return emitSingleCharToken(CSSTokenType.squareOpen);
            case ']':
                return emitSingleCharToken(CSSTokenType.squareClose);
            case '{':
                return emitSingleCharToken(CSSTokenType.curlyOpen);
            case '}':
                return emitSingleCharToken(CSSTokenType.curlyClose);
            case ',':
                return emitSingleCharToken(CSSTokenType.comma);
            case ':':
                return emitSingleCharToken(CSSTokenType.colon);
            case ';':
                return emitSingleCharToken(CSSTokenType.semicolon);
            case '*':
                return nextCh == '=' ? emitDoubleCharToken(CSSTokenType.substringMatch) : emitDelimToken();
            case '~':
                return nextCh == '=' ? emitDoubleCharToken(CSSTokenType.includeMatch) : emitDelimToken();
            case '|':
                if (nextCh == '=')
                    return emitDoubleCharToken(CSSTokenType.dashMatch);
                if (nextCh == '|')
                    return emitDoubleCharToken(CSSTokenType.column);
                return emitDelimToken();
            case '/':
                return nextCh == '*' ? parseComment() : emitDelimToken();
            default:
                break;
        }
        char nextCh2 = peek(tokenEnd + 2);
        if (ch == 'u' || ch == 'U') {
            if (nextCh == '+' && (decodeHexDigit(nextCh2) >= 0 || nextCh2 == '?'))
                return parseUnicodeRangeToken();
        }
        if (parseNumber())
            return tokenType;
        if (parseIdent()) {
            if (peek(tokenEnd) == '(') {
                tokenEnd++;
                import std.uni : icmp;
                if (tokenText.length == 3 && icmp(tokenText, "url") == 0) {
                    // parse URL function
                    parseUrlToken();
                } else {
                    tokenType = CSSTokenType.func;
                }
            }
            return tokenType;
        }
        if (ch == '-') {
            if (nextCh == '-' && nextCh2 == '>') {
                tokenEnd = tokenStart + 3;
                tokenType = CSSTokenType.cdc;
                tokenText ~= src[tokenStart .. tokenEnd];
                return tokenType;
            }
            return emitDelimToken();
        }
        if (ch == '<') {
            if (nextCh == '!' && nextCh2 == '-' && peek(tokenEnd + 3) == '-') {
                tokenEnd = tokenStart + 4;
                tokenType = CSSTokenType.cdo;
                tokenText ~= src[tokenStart .. tokenEnd];
                return tokenType;
            }
            return emitDelimToken();
        }
        if (ch == '@') {
            if (isIdentStart(tokenEnd + 1)) {
                tokenEnd++;
                parseIdent();
                tokenType = CSSTokenType.atKeyword;
                return tokenType;
            }
            return emitDelimToken();
        }
        return emitDelimToken();
    }

    /// same as next() but returns filled CSSToken struct
    CSSToken nextToken() {
        CSSToken res;
        res.type = next();
        immutable bool hasText = res.type == CSSTokenType.str || res.type == CSSTokenType.ident
            || res.type == CSSTokenType.atKeyword || res.type == CSSTokenType.url
            || res.type == CSSTokenType.func || res.type == CSSTokenType.hashToken
            || res.type == CSSTokenType.delim;
        if (hasText && tokenText.length)
            res.text = tokenText.idup;
        if (res.type == CSSTokenType.dimension && dimensionUnit.length)
            res.dimensionUnit = dimensionUnit.idup;
        if (res.type == CSSTokenType.dimension || res.type == CSSTokenType.number) {
            res.doubleValue = tokenDoubleValue;
            res.intValue = tokenIntValue;
            res.typeFlagInteger = tokenTypeInteger;
        } else if (res.type == CSSTokenType.hashToken) {
            res.typeFlagId = tokenTypeFlagId;
        } else if (res.type == CSSTokenType.unicodeRange) {
            res.unicodeRangeStart = unicodeRangeStart;
            res.unicodeRangeEnd = unicodeRangeEnd;
        }
        return res;
    }
}

unittest {
    CSSTokenizer tokenizer;
    tokenizer.start("ident-1{ }\n#id\n'blabla' \"bla bla 2\" -ident2*=12345 -.234e+5 "
                    ~ "1.23px/* some comment */U+123?!"
                    ~"url( 'text.css' )url(bad url)functionName()url( bla )"
                    ~"'\\30 \\31'");
    assert(tokenizer.next() == CSSTokenType.ident);
    assert(tokenizer.tokenText == "ident-1");
    assert(tokenizer.next() == CSSTokenType.curlyOpen);
    assert(tokenizer.next() == CSSTokenType.whitespace);
    assert(tokenizer.next() == CSSTokenType.curlyClose);
    assert(tokenizer.next() == CSSTokenType.whitespace); //newline
    assert(tokenizer.next() == CSSTokenType.hashToken);
    assert(tokenizer.tokenText == "id");
    assert(tokenizer.tokenTypeFlagId == true);
    assert(tokenizer.next() == CSSTokenType.whitespace); //newline
    assert(tokenizer.next() == CSSTokenType.str);
    assert(tokenizer.tokenText == "blabla");
    assert(tokenizer.next() == CSSTokenType.whitespace);
    assert(tokenizer.next() == CSSTokenType.str);
    assert(tokenizer.tokenText == "bla bla 2");
    assert(tokenizer.next() == CSSTokenType.whitespace);
    assert(tokenizer.next() == CSSTokenType.ident);
    assert(tokenizer.tokenText == "-ident2");
    assert(tokenizer.next() == CSSTokenType.substringMatch);
    assert(tokenizer.next() == CSSTokenType.number);
    assert(tokenizer.tokenText == "12345");
    assert(tokenizer.tokenIntValue == 12345);
    assert(tokenizer.next() == CSSTokenType.whitespace);
    assert(tokenizer.next() == CSSTokenType.number);
    assert(tokenizer.tokenText == "-.234e+5");
    assert(tokenizer.tokenIntValue == -23400);
    assert(tokenizer.tokenDoubleValue == -.234e+5);
    assert(tokenizer.next() == CSSTokenType.whitespace);
    // next line
    assert(tokenizer.next() == CSSTokenType.dimension);
    assert(tokenizer.tokenText == "1.23");
    assert(tokenizer.tokenIntValue == 1);
    assert(tokenizer.tokenDoubleValue == 1.23);
    assert(tokenizer.dimensionUnit == "px");
    assert(tokenizer.next() == CSSTokenType.comment);
    assert(tokenizer.next() == CSSTokenType.unicodeRange);
    assert(tokenizer.unicodeRangeStart == 0x1230 && tokenizer.unicodeRangeEnd == 0x123F);
    assert(tokenizer.next() == CSSTokenType.delim);
    assert(tokenizer.tokenText == "!");
    // next line
    assert(tokenizer.next() == CSSTokenType.url);
    assert(tokenizer.tokenText == "text.css");
    assert(tokenizer.next() == CSSTokenType.badUrl);
    assert(tokenizer.next() == CSSTokenType.func);
    assert(tokenizer.tokenText == "functionName");
    assert(tokenizer.next() == CSSTokenType.parentClose);
    assert(tokenizer.next() == CSSTokenType.url);
    assert(tokenizer.tokenText == "bla");
    // next line
    assert(tokenizer.next() == CSSTokenType.str);
    assert(tokenizer.tokenText == "01"); //'\30 \31'
    assert(tokenizer.next() == CSSTokenType.eof);
}


/**
    Tokenizes css source, returns array of tokens (last token is EOF).
    Source must be preprocessed utf-8 string.
*/
static CSSToken[] tokenizeCSS(string src) {
    CSSTokenizer tokenizer;
    tokenizer.start(src);
    CSSToken[] res;
    res.assumeSafeAppend();
    for (;;) {
        res ~= tokenizer.nextToken();
        if (res[$ - 1].type == CSSTokenType.eof)
            break;
    }
    return res;
}

unittest {
    string src = "pre {123em}";
    auto res = tokenizeCSS(src);
    assert(res.length == 6);
    assert(res[0].type == CSSTokenType.ident);
    assert(res[0].text == "pre");
    assert(res[1].type == CSSTokenType.whitespace);
    assert(res[2].type == CSSTokenType.curlyOpen);
    assert(res[3].type == CSSTokenType.dimension);
    assert(res[3].typeFlagInteger == true);
    assert(res[3].intValue == 123);
    assert(res[3].dimensionUnit == "em");
    assert(res[4].type == CSSTokenType.curlyClose);
    assert(res[$ - 1].type == CSSTokenType.eof);
}

// regression tests for tokenizer edge cases
unittest {
    // hash token may start with a digit (e.g. colors); it is then not a valid ID
    auto hash = tokenizeCSS("#0f0");
    assert(hash.length == 2);
    assert(hash[0].type == CSSTokenType.hashToken && hash[0].text == "0f0" && !hash[0].typeFlagId);
    // consecutive newlines form one whitespace token; trailing whitespace is reported
    assert(tokenizeCSS("a\n\nb").length == 4);
    assert(tokenizeCSS("a ")[1].type == CSSTokenType.whitespace);
    // a string broken by a newline must terminate and leave the newline unconsumed
    CSSTokenizer tokenizer;
    tokenizer.start("'\nx");
    assert(tokenizer.next() == CSSTokenType.badStr);
    assert(tokenizer.next() == CSSTokenType.whitespace);
    assert(tokenizer.next() == CSSTokenType.ident);
    assert(tokenizer.next() == CSSTokenType.eof);
    // backslash-newline inside a string is a line continuation
    tokenizer.start("'a\\\nb'");
    assert(tokenizer.next() == CSSTokenType.str);
    assert(tokenizer.tokenText == "ab");
    // escapes inside unquoted url and inside dimension units
    tokenizer.start("url(a\\62 c)");
    assert(tokenizer.next() == CSSTokenType.url);
    assert(tokenizer.tokenText == "abc");
    tokenizer.start("1\\70 x");
    assert(tokenizer.next() == CSSTokenType.dimension);
    assert(tokenizer.tokenText == "1" && tokenizer.dimensionUnit == "px");
    // trailing point is not part of the number
    auto num = tokenizeCSS("1.");
    assert(num.length == 3 && num[0].typeFlagInteger && num[1].type == CSSTokenType.delim);
    assert(!tokenizeCSS("1.5")[0].typeFlagInteger);
}

// easy way to extract and apply imports w/o full document parsing
/**
    Extract CSS @import rules from source.

    Only rules at the beginning of the file are considered: scanning stops at the first
    token that cannot be part of the leading @charset / @import section.
    Returned rules have startPos, endPos and url filled; content is left empty.
*/
CSSImportRule[] extractCSSImportRules(string src) {
    enum ParserState {
        start, // before rule begin, switch to this state after ;
        afterImport, // after @import
        afterCharset, // after @charset
        afterCharsetName, // after charset name string
        afterImportUrl, // after URL of @import
    }
    ParserState state = ParserState.start;
    CSSImportRule[] res;
    CSSTokenizer tokenizer;
    tokenizer.start(src);
    string url;
    size_t startPos = 0;
    for (;;) {
        CSSTokenType type = tokenizer.next();
        if (type == CSSTokenType.eof)
            break;
        if (type == CSSTokenType.whitespace || type == CSSTokenType.comment)
            continue; // skip whitespaces and comments
        if (type == CSSTokenType.atKeyword) {
            if (tokenizer.tokenText == "charset") {
                state = ParserState.afterCharset;
                continue;
            }
            if (tokenizer.tokenText != "import")
                break;
            // import rule
            state = ParserState.afterImport;
            startPos = tokenizer.tokenStart;
            continue;
        }
        if (type == CSSTokenType.str || type == CSSTokenType.url) {
            if (state == ParserState.afterImport) {
                url = tokenizer.tokenText.idup;
                state = ParserState.afterImportUrl;
                continue;
            }
            if (state == ParserState.afterCharset) {
                state = ParserState.afterCharsetName;
                continue;
            }
            break;
        }
        if (type == CSSTokenType.curlyOpen)
            break;
        if (type == CSSTokenType.ident && state == ParserState.start)
            break; // valid @imports may be only at the beginning of file
        if (type == CSSTokenType.semicolon) {
            if (state == ParserState.afterImportUrl) {
                // add URL
                CSSImportRule rule;
                rule.startPos = startPos;
                rule.endPos = tokenizer.tokenEnd;
                rule.url = url;
                res ~= rule;
            }
            state = ParserState.start;
            continue;
        }
    }
    return res;
}

/**
    Replace source code import rules obtained by extractCSSImportRules() with imported content.

    Rules are expected in source order. A rule whose range overlaps the previously
    applied rule or lies outside of `src` is skipped.
*/
string applyCSSImportRules(string src, CSSImportRule[] rules) {
    if (!rules.length)
        return src; // no rules
    char[] res;
    res.reserve(src.length);
    size_t start = 0;
    foreach (ref rule; rules) {
        if (rule.startPos < start || rule.endPos < rule.startPos || rule.endPos > src.length)
            continue; // malformed range: leave the source text untouched
        res ~= src[start .. rule.startPos];
        res ~= rule.content;
        start = rule.endPos;
    }
    if (start < src.length)
        res ~= src[start .. $];
    return cast(string)res;
}


unittest {
    string src = q{
        @charset "utf-8";
        /* comment must be ignored */
        @import "file1.css"; /* string */
        @import url(file2.css); /* url */
        pre {}
        @import "ignore_me.css";
        p {}
    };
    auto res = extractCSSImportRules(src);
    assert(res.length == 2);
    assert(res[0].url == "file1.css");
    assert(res[1].url == "file2.css");
    res[0].content = "[file1_content]";
    res[1].content = "[file2_content]";
    string s = applyCSSImportRules(src, res);
    assert (s.length != src.length);
}

/// Kinds of syntax tree nodes produced by CSSParser.
enum ASTNodeType {
    simpleBlock,
    componentValue,
    preservedToken,
    func,
    atRule,
    qualifiedRule,
}

/// Base class of all syntax tree nodes.
class ASTNode {
    ASTNodeType type;
}

/// Base class of component values: preserved tokens, simple blocks and functions.
class ComponentValueNode : ASTNode {
    this() {
        type = ASTNodeType.componentValue;
    }
}

/// Block delimited by {}, [] or (); blockType holds the opening token type.
class SimpleBlockNode : ComponentValueNode {
    CSSTokenType blockType = CSSTokenType.curlyOpen;
    ComponentValueNode[] componentValues;
    this() {
        type = ASTNodeType.simpleBlock;
    }
}

/// Function call: name( componentValues )
class FunctionNode : ComponentValueNode {
    /// function name (text of the func token, without the opening parenthesis)
    string name;
    ComponentValueNode[] componentValues;
    this(string name) {
        this.name = name;
        type = ASTNodeType.func;
    }
}

/// Any token kept in the tree as is.
class PreservedTokenNode : ComponentValueNode {
    CSSToken token;
    this(ref CSSToken token) {
        this.token = token;
        type = ASTNodeType.preservedToken;
    }
}

/// Qualified rule: prelude component values followed by an optional {} block.
class QualifiedRuleNode : ASTNode {
    ComponentValueNode[] componentValues;
    SimpleBlockNode block;
    this() {
        type = ASTNodeType.qualifiedRule;
    }
}

/// At-rule: @name, prelude component values, then either ';' or a {} block.
class ATRuleNode : QualifiedRuleNode {
    string name;
    this() {
        type = ASTNodeType.atRule;
    }
}


/**
    Builds syntax tree nodes from a token list produced by tokenizeCSS().
    The parser keeps a cursor (pos) into the list, which always ends with an eof token.
*/
class CSSParser {
    CSSToken[] tokens;
    int pos = 0;
    this(CSSToken[] _tokens) {
        tokens = _tokens;
        // guarantee the terminating eof token, so the cursor always has a token to point at
        if (!tokens.length || tokens[$ - 1].type != CSSTokenType.eof) {
            CSSToken eofToken;
            eofToken.type = CSSTokenType.eof;
            tokens = tokens ~ eofToken;
        }
    }
    /// peek current token
    @property ref CSSToken currentToken() {
        return tokens[pos];
    }
    /// peek next token
    @property ref CSSToken nextToken() {
        return tokens[pos + 1 < $ ? pos + 1 : pos];
    }
    /// move to next token; returns false (and stays in place) when already at the last token
    bool next() {
        if (pos + 1 < tokens.length) {
            pos++;
            return true;
        }
        return false;
    }
    /// move to nearest token which is neither whitespace nor comment; return current token type (does not move if current token is significant)
    CSSTokenType skipWhiteSpace() {
        while (currentToken.type == CSSTokenType.whitespace || currentToken.type == CSSTokenType.comment) {
            if (!next())
                break;
        }
        return currentToken.type;
    }
    /// skip current token, then move to nearest non-whitespace token; return new token type
    @property CSSTokenType nextNonWhiteSpace() {
        next();
        return skipWhiteSpace();
    }
    /// Parses a {} [] or () block at the cursor; returns null if the current token does not open a block.
    SimpleBlockNode parseSimpleBlock() {
        auto type = skipWhiteSpace();
        CSSTokenType closeType;
        if (type == CSSTokenType.curlyOpen) {
            closeType = CSSTokenType.curlyClose;
        } else if (type == CSSTokenType.squareOpen) {
            closeType = CSSTokenType.squareClose;
        } else if (type == CSSTokenType.parentOpen) {
            closeType = CSSTokenType.parentClose;
        } else {
            // not a simple block
            return null;
        }
        SimpleBlockNode res = new SimpleBlockNode();
        res.blockType = type;
        nextNonWhiteSpace();
        res.componentValues = parseComponentValueList(closeType);
        // an unclosed block simply ends at eof
        if (skipWhiteSpace() == closeType)
            nextNonWhiteSpace();
        return res;
    }
    /// Parses a function call at the cursor; returns null if the current token is not a func token.
    FunctionNode parseFunctionBlock() {
        auto type = skipWhiteSpace();
        if (type != CSSTokenType.func)
            return null;
        FunctionNode res = new FunctionNode(currentToken.text);
        nextNonWhiteSpace();
        res.componentValues = parseComponentValueList(CSSTokenType.parentClose);
        if (skipWhiteSpace() == CSSTokenType.parentClose)
            nextNonWhiteSpace();
        return res;
    }
    /// Collects component values until eof or one of the end tokens is reached; the end token is not consumed.
    ComponentValueNode[] parseComponentValueList(CSSTokenType endToken1 = CSSTokenType.eof, CSSTokenType endToken2 = CSSTokenType.eof) {
        ComponentValueNode[] res;
        for (;;) {
            auto type = skipWhiteSpace();
            if (type == CSSTokenType.eof)
                return res;
            if (type == endToken1 || type == endToken2)
                return res;
            if (type == CSSTokenType.squareOpen || type == CSSTokenType.parentOpen || type == CSSTokenType.curlyOpen) {
                res ~= parseSimpleBlock();
            } else if (type == CSSTokenType.func) {
                res ~= parseFunctionBlock();
            } else {
                res ~= new PreservedTokenNode(currentToken);
                next();
            }
        }
    }
    /// Parses an at-rule at the cursor; returns null if the current token is not an atKeyword.
    ATRuleNode parseATRule() {
        auto type = skipWhiteSpace();
        if (type != CSSTokenType.atKeyword)
            return null;
        ATRuleNode res = new ATRuleNode();
        res.name = currentToken.text;
        nextNonWhiteSpace();
        res.componentValues = parseComponentValueList(CSSTokenType.semicolon, CSSTokenType.curlyOpen);
        type = skipWhiteSpace();
        if (type == CSSTokenType.semicolon) {
            next();
        } else if (type == CSSTokenType.curlyOpen) {
            res.block = parseSimpleBlock();
        }
        // otherwise eof: the rule ends without terminator
        return res;
    }

    /// Parses a qualified rule at the cursor; returns null at eof.
    QualifiedRuleNode parseQualifiedRule() {
        auto type = skipWhiteSpace();
        if (type == CSSTokenType.eof)
            return null;
        QualifiedRuleNode res = new QualifiedRuleNode();
        res.componentValues = parseComponentValueList(CSSTokenType.curlyOpen);
        type = skipWhiteSpace();
        if (type == CSSTokenType.curlyOpen) {
            res.block = parseSimpleBlock();
        }
        return res;
    }
}

unittest {
    ATRuleNode atRule = new CSSParser(tokenizeCSS("@atRuleName;")).parseATRule();
    assert(atRule !is null);
    assert(atRule.name == "atRuleName");
    assert(atRule.block is null);

    atRule = new CSSParser(tokenizeCSS("@atRuleName2 { }")).parseATRule();
    assert(atRule !is null);
    assert(atRule.name == "atRuleName2");
    assert(atRule.block !is null);
    assert(atRule.block.blockType == CSSTokenType.curlyOpen);

    atRule = new CSSParser(tokenizeCSS("@atRuleName3 url('bla') { 123 }")).parseATRule();
    assert(atRule !is null);
    assert(atRule.name == "atRuleName3");
    assert(atRule.componentValues.length == 1);
    assert(atRule.componentValues[0].type == ASTNodeType.preservedToken);
    assert(atRule.block !is null);
    assert(atRule.block.blockType == CSSTokenType.curlyOpen);
    assert(atRule.block.componentValues.length == 1);


    atRule = new CSSParser(tokenizeCSS("@atRuleName4 \"value\" { funcName(123) }")).parseATRule();
    assert(atRule !is null);
    assert(atRule.name == "atRuleName4");
    assert(atRule.componentValues.length == 1);
    assert(atRule.componentValues[0].type == ASTNodeType.preservedToken);
    assert(atRule.block !is null);
    assert(atRule.block.blockType == CSSTokenType.curlyOpen);
    assert(atRule.block.componentValues.length == 1);
    assert(atRule.block.componentValues[0].type == ASTNodeType.func);
}

unittest {
    QualifiedRuleNode qualifiedRule = new CSSParser(tokenizeCSS(" pre { display: none } ")).parseQualifiedRule();
    assert(qualifiedRule !is null);
    assert(qualifiedRule.componentValues.length == 1);
    assert(qualifiedRule.block !is null);
    assert(qualifiedRule.block.componentValues.length == 3);
}