1 module dom.cssparser;
2
/**
Normalize raw CSS text before it is handed to the tokenizer.

Substitutions performed on the input bytes:
* CR, FF and CR LF pairs each become a single LF (0x0A);
* every NUL byte becomes U+FFFD REPLACEMENT CHARACTER, written as the UTF-8 bytes EF BF BD.

Params:
    src = CSS source bytes (utf-8)
Returns: newly built array with the substitutions applied
*/
char[] preProcessCSS(char[] src) {
    char[] output;
    output.assumeSafeAppend();
    // true when the byte handled in the previous iteration was CR, so that the LF of a CR LF pair is dropped
    bool previousWasCR = false;
    foreach (c; src) {
        switch (c) {
            case 0:
                // U+FFFD encoded as UTF-8: EF BF BD
                output ~= "\uFFFD";
                break;
            case 0x0C:
            case 0x0D:
                output ~= '\n';
                break;
            case 0x0A:
                if (!previousWasCR)
                    output ~= '\n';
                break;
            default:
                output ~= c;
                break;
        }
        previousWasCR = (c == 0x0D);
    }
    return output;
}
31
/// One `@import` rule located in a CSS source, plus the text that should replace it.
struct CSSImportRule {
    /// byte offset in the source where the `@import` keyword begins
    size_t startPos;
    /// byte offset just past the terminating ';' of the rule
    size_t endPos;
    /// location of the stylesheet named by the rule
    string url;
    /// text of the imported stylesheet, substituted for the rule when imports are applied
    string content;
}
42
/// Kinds of tokens produced by the CSS tokenizer (stored in one byte).
enum CSSTokenType : ubyte {
    eof,            /// end of input
    delim,          /// single delimiter character (also used for unknown or erroneous input)
    comment,        /// `/* ... */`
    //newline,      // any of \n \r\n \r \f (currently folded into whitespace)
    whitespace,     /// run of spaces, tabs and newlines
    ident,          /// identifier
    url,            /// `url(...)`
    badUrl,         /// malformed `url(...)`
    func,           /// function name followed by `(`
    str,            /// quoted string, '...' or "..."
    badStr,         /// quoted string cut short by a newline
    hashToken,      /// `#name`
    prefixMatch,    /// `^=`
    suffixMatch,    /// `$=`
    substringMatch, /// `*=`
    includeMatch,   /// `~=`
    dashMatch,      /// `|=`
    column,         /// `||`
    parentOpen,     /// `(`
    parentClose,    /// `)`
    squareOpen,     /// `[`
    squareClose,    /// `]`
    curlyOpen,      /// `{`
    curlyClose,     /// `}`
    comma,          /// `,`
    colon,          /// `:`
    semicolon,      /// `;`
    number,         /// numeric literal such as +12345.324e-3
    dimension,      /// number immediately followed by a unit, e.g. 1.23px
    cdo,            /// `<!--`
    cdc,            /// `-->`
    atKeyword,      /// `@keyword`; the token text holds the keyword without the '@'
    unicodeRange,   /// `U+XXX-XXX`
}
78
/// A single token with its payload; which payload fields are meaningful depends on `type`.
struct CSSToken {
    /// kind of token
    CSSTokenType type;
    /// textual payload of the token
    string text;
    /// unit name that follows the number of a dimension token
    string dimensionUnit;
    // The three payload groups below overlap in memory: only the group matching `type` should be read.
    union {
        /// numeric payload (number and dimension tokens)
        struct {
            long intValue = 0;          /// value as an integer
            double doubleValue = 0;     /// value as a floating point number
            bool typeFlagInteger;       /// true when the literal was an integer, false when it was a double
        }
        /// code point range payload (unicodeRange tokens); zeroed through intValue = 0
        struct {
            uint unicodeRangeStart;     /// first code point of the range
            uint unicodeRangeEnd;       /// last code point of the range
        }
        /// true if the identifier is a valid ID
        bool typeFlagId;
    }
}
96
/// Numeric value (0..15) of a hexadecimal digit in either letter case, or -1 when `ch` is not a hex digit.
int decodeHexDigit(char ch) {
    switch (ch) {
        case '0': .. case '9':
            return ch - '0';
        case 'a': .. case 'f':
            return 10 + (ch - 'a');
        case 'A': .. case 'F':
            return 10 + (ch - 'A');
        default:
            return -1;
    }
}
106
/// True for space, tab, form feed, carriage return and line feed.
bool isCSSWhiteSpaceChar(char ch) {
    switch (ch) {
        case ' ':
        case '\t':
        case 0x0A:
        case 0x0C:
        case 0x0D:
            return true;
        default:
            return false;
    }
}
110
/// True when `ch` may begin a name: an ASCII letter, an underscore, or any byte with the high bit set (non-ASCII).
bool isCSSNameStart(char ch) {
    if (ch == '_')
        return true;
    if ((ch & 0x80) > 0)
        return true;
    immutable bool lower = ch >= 'a' && ch <= 'z';
    immutable bool upper = ch >= 'A' && ch <= 'Z';
    return lower || upper;
}
115
/// True for the control bytes 0x00-0x08, 0x0B, 0x0E-0x1F and 0x7F.
bool isCSSNonPrintable(char ch) {
    immutable bool lowControls = ch <= 8;                  // char is unsigned, so no lower bound is needed
    immutable bool highControls = ch >= 0x0E && ch <= 0x1F;
    return lowControls || highControls || ch == 0x0B || ch == 0x7F;
}
/**
Check whether two consecutive code points begin a valid escape.

Params:
    ch = first code point; must be U+005C REVERSE SOLIDUS (\)
    ch2 = code point following the backslash
Returns: true when `ch` is a backslash and `ch2` is not a newline (LF, CR or FF)
*/
bool isCSSValidEscSequence(char ch, char ch2) {
    // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
    if (ch != '\\')
        return false;
    // Form feed counts as a newline as well: the escape parser in this file rejects
    // a backslash followed by FF, so it must not be reported as a valid escape here.
    if (ch2 == '\r' || ch2 == '\n' || ch2 == 0x0C)
        return false;
    return true;
}
134
135 struct CSSTokenizer {
136 /// CSS source code (utf-8)
137 char[] src;
138 /// current token type
139 CSSTokenType tokenType;
140 /// current token start byte offset
141 size_t tokenStart;
142 /// current token end byte offset
143 size_t tokenEnd;
144 char[] tokenText;
145 char[] dimensionUnit;
146 bool tokenTypeFlagId; // true if identifier is valid ID
147 bool tokenTypeInteger; // for number and dimension - true if number is integer, false if double
148 long tokenIntValue; // for number and dimension
149 double tokenDoubleValue; // for number and dimension
150 uint unicodeRangeStart = 0; // for unicodeRange
151 uint unicodeRangeEnd = 0; // for unicodeRange
/**
Begin tokenizing a new source.
Takes a private mutable copy of `_src`, rewinds both token offsets to 0 and
resizes the two text buffers to 1000 elements (they are emptied again when a token is read).
*/
void start(string _src) {
    tokenStart = 0;
    tokenEnd = 0;
    src = _src.dup;
    tokenText.length = 1000;
    tokenText.assumeSafeAppend();
    dimensionUnit.length = 1000;
    dimensionUnit.assumeSafeAppend();
}
/// True once the end offset of the current token has reached the end of the source.
bool eof() {
    return !(tokenEnd < src.length);
}
/**
Skip a run of whitespace starting at tokenEnd and advance tokenEnd past it.

Space, tab, CR, LF and FF are skipped; the UTF-8 sequence EF BF BD (U+FFFD) is
skipped as a whole as well. tokenType is always set to whitespace (there is no
separate newline token type).

Returns: true if at least one character was skipped, including when the run
extends to the end of the source.
*/
bool skipWhiteSpace() {
    bool skipped = false;
    tokenType = CSSTokenType.whitespace;
    while (tokenEnd < src.length) {
        char ch = src.ptr[tokenEnd];
        if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == 0x0C) {
            tokenEnd++;
        } else if (ch == 0xEF && tokenEnd + 2 < src.length && src.ptr[tokenEnd + 1] == 0xBF && src.ptr[tokenEnd + 2] == 0xBD) {
            // U+FFFD 1110xxxx 10xxxxxx 10xxxxxx == EF BF BD
            // step over all three bytes so that tokenEnd never stops inside the sequence
            tokenEnd += 3;
        } else {
            break;
        }
        skipped = true;
    }
    return skipped;
}
191
/**
Decode the escape sequence whose backslash is at offset `p`.

A hex escape consists of 1 to 6 hex digits optionally followed by one whitespace
character, which is consumed too; any other escaped character stands for itself.

Params:
    p = offset of the backslash; on success it is moved just past the escape,
        on failure it is left unchanged
Returns: the decoded code point, or cast(dchar)0xFFFFFFFF when the backslash is
the last byte of the source or is followed by a newline (CR, LF, FF)
*/
private dchar parseEscape(ref size_t p) {
    enum dchar invalidEscape = cast(dchar)0xFFFFFFFF;
    size_t pos = p + 1;
    if (pos >= src.length)
        return invalidEscape; // out of bounds
    char ch = src.ptr[pos];
    pos++;
    if (ch == '\r' || ch == '\n' || ch == 0x0C)
        return invalidEscape; // unexpected newline: invalid esc sequence
    int hex = decodeHexDigit(ch);
    if (hex < 0) {
        // not a hex: one character is escaped
        p = pos;
        return ch;
    }
    dchar res = cast(dchar)hex;
    for (int count = 1; count < 6 && pos < src.length; count++) {
        hex = decodeHexDigit(src.ptr[pos]);
        if (hex < 0)
            break;
        res = cast(dchar)((res << 4) | hex);
        pos++;
    }
    // Look at the character that actually follows the digits (not at the last digit read),
    // so the optional whitespace is consumed even after a full run of six digits.
    if (pos < src.length && isCSSWhiteSpaceChar(src.ptr[pos]))
        pos++;
    p = pos;
    return res;
}
/// Append code point `ch` to tokenText as UTF-8; a code point that is not a valid dchar is replaced by '?'.
private void appendEscapedIdentChar(dchar ch) {
    import std.utf : encode, isValidDchar;
    if (ch < 0x80) {
        // ASCII: stored as a single byte
        tokenText ~= cast(char)ch;
        return;
    }
    if (!isValidDchar(ch)) {
        tokenText ~= '?'; // replacement for invalid character
        return;
    }
    char[4] encoded;
    immutable size_t count = encode(encoded, ch);
    tokenText ~= encoded[0 .. count];
}
240
/**
Consume an identifier starting at tokenEnd and append its text to the given buffer.

Params:
    tokenText = buffer receiving the identifier text (escapes are decoded and
                stored as UTF-8; an invalid code point becomes '?')
Returns: true if an identifier was consumed (tokenEnd is then moved past it);
false otherwise, in which case neither tokenEnd nor the buffer is changed.
*/
bool consumeIdent(ref char[] tokenText) {
    import std.utf : encode, isValidDchar;
    // Appends to the buffer passed by the caller. The struct member of the same name
    // must not be used here: the caller may be collecting a dimension unit instead.
    void appendCodePoint(dchar c) {
        if (c < 0x80) {
            tokenText ~= cast(char)c;
        } else if (isValidDchar(c)) {
            char[4] buf;
            immutable size_t n = encode(buf, c);
            tokenText ~= buf[0 .. n];
        } else {
            tokenText ~= '?'; // replacement for invalid character
        }
    }
    size_t p = tokenEnd;
    if (p >= src.length)
        return false; // nothing to read
    char ch = src.ptr[p];
    bool hasHyphen = false;
    if (ch == '-') {
        p++;
        if (p >= src.length)
            return false; // eof
        hasHyphen = true;
        ch = src.ptr[p];
    }
    if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_' || ch >= 0x80) {
        if (hasHyphen)
            tokenText ~= '-';
        tokenText ~= ch;
        p++;
    } else if (ch == '\\') {
        dchar esc = parseEscape(p);
        if (esc == 0xFFFFFFFF)
            return false; // invalid esc
        // keep the leading hyphen when the first name character is written as an escape
        if (hasHyphen)
            tokenText ~= '-';
        appendCodePoint(esc);
    } else {
        return false;
    }
    for (;;) {
        if (p >= src.length)
            break;
        ch = src.ptr[p];
        if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || ch == '_' || ch == '-' || ch >= 0x80) {
            tokenText ~= ch;
            p++;
        } else if (ch == '\\') {
            dchar esc = parseEscape(p);
            if (esc == 0xFFFFFFFF)
                break; // invalid esc ends the identifier
            appendCodePoint(esc);
        } else {
            break;
        }
    }
    tokenEnd = p;
    return true;
}
287
/**
Try to read an identifier at tokenEnd.
On success the identifier text is appended to tokenText, tokenType becomes ident and true is returned.
*/
bool parseIdent() {
    if (isIdentStart(tokenEnd) && consumeIdent(tokenText)) {
        tokenType = CSSTokenType.ident;
        return true;
    }
    return false;
}
301
/**
Check whether an identifier can start at byte offset `p`.
Accepted forms: a name-start character, '-' followed by a name-start character,
or a valid escape (optionally preceded by one '-').
*/
bool isIdentStart(size_t p) {
    if (p >= src.length)
        return false;
    char lead = src.ptr[p];
    if (isCSSNameStart(lead))
        return true;
    if (lead == '-') {
        // look at the character after the hyphen: it must be a name-start character or begin an escape
        p++;
        if (p >= src.length)
            return false;
        lead = src.ptr[p];
        if (isCSSNameStart(lead))
            return true;
    }
    if (lead != '\\')
        return false;
    p++;
    if (p >= src.length)
        return false;
    return isCSSValidEscSequence(lead, src.ptr[p]);
}
327
/**
Parse a number, optionally followed by a unit, starting at tokenEnd.

Accepted form: optional sign, digits, optional '.' followed by digits, optional
exponent ('e' or 'E', optional sign, digits). A '.' or an exponent marker that
is not followed by a digit is left in the input for the next token.

On success tokenText holds the literal, tokenIntValue / tokenDoubleValue hold
the value, tokenTypeInteger tells whether the literal had neither fraction nor
exponent, and tokenType is number, or dimension when a unit follows (the unit
is stored in dimensionUnit).

Returns: true if a number was parsed; false otherwise (tokenEnd is unchanged and tokenText is emptied).
*/
bool parseNumber() {
    tokenTypeInteger = true;
    tokenIntValue = 0;
    tokenDoubleValue = 0;
    if (tokenEnd >= src.length)
        return false;
    // character at offset i, or NUL past the end of the source
    char at(size_t i) {
        return i < src.length ? src.ptr[i] : '\0';
    }
    static bool isDigit(char c) {
        return c >= '0' && c <= '9';
    }
    size_t p = tokenEnd;
    int numberSign = 1;
    if (at(p) == '+' || at(p) == '-') {
        if (at(p) == '-')
            numberSign = -1;
        p++;
    }
    // digits before point
    ulong intValue = 0;
    int beforePointDigits = 0;
    while (isDigit(at(p))) {
        intValue = intValue * 10 + (at(p) - '0');
        beforePointDigits++;
        p++;
    }
    // fraction: the point belongs to the number only when a digit follows it
    ulong afterPointValue = 0;
    int afterPointDigits = 0;
    if (at(p) == '.' && isDigit(at(p + 1))) {
        p++;
        while (isDigit(at(p))) {
            afterPointValue = afterPointValue * 10 + (at(p) - '0');
            afterPointDigits++;
            p++;
        }
    }
    if (!beforePointDigits && !afterPointDigits) {
        tokenText.length = 0;
        return false; // not a number
    }
    // exponent: consumed only when at least one digit follows the optional sign
    int exponentSign = 1;
    ulong exponentValue = 0;
    int exponentDigits = 0;
    if (at(p) == 'e' || at(p) == 'E') {
        size_t q = p + 1;
        int sign = 1;
        if (at(q) == '+' || at(q) == '-') {
            if (at(q) == '-')
                sign = -1;
            q++;
        }
        if (isDigit(at(q))) {
            exponentSign = sign;
            p = q;
            while (isDigit(at(p))) {
                exponentValue = exponentValue * 10 + (at(p) - '0');
                exponentDigits++;
                p++;
            }
        }
    }
    // the literal is exactly the consumed source span
    tokenText ~= src[tokenEnd .. p];
    tokenType = CSSTokenType.number;
    tokenEnd = p;
    if (exponentDigits || afterPointDigits) {
        // parsed floating point
        tokenTypeInteger = false;
        tokenDoubleValue = cast(long)intValue;
        if (afterPointDigits) {
            long divider = 1;
            for (int i = 0; i < afterPointDigits; i++)
                divider *= 10;
            tokenDoubleValue += afterPointValue / cast(double)divider;
        }
        if (numberSign < 0)
            tokenDoubleValue = -tokenDoubleValue;
        if (exponentDigits) {
            import std.math : pow;
            double exponent = (cast(long)exponentValue * exponentSign);
            tokenDoubleValue = tokenDoubleValue * pow(10, exponent);
        }
        tokenIntValue = cast(long)tokenDoubleValue;
    } else {
        // parsed integer
        tokenIntValue = cast(long)intValue;
        if (numberSign < 0)
            tokenIntValue = -tokenIntValue;
        tokenDoubleValue = tokenIntValue;
    }
    dimensionUnit.length = 0;
    // a unit turns the number into a dimension, but only if the unit is really consumed
    if (isIdentStart(tokenEnd) && consumeIdent(dimensionUnit))
        tokenType = CSSTokenType.dimension;
    return true;
}
455
/**
Parse a quoted string whose opening quote is at tokenEnd.

The decoded content is appended to tokenText. tokenType becomes str, or badStr
when an unescaped newline (CR, LF, FF) is met; in that case tokenEnd is left on
the newline so it is tokenized next. A backslash followed by a newline is a
line continuation and adds nothing to the text. Reaching the end of the source
ends the string.

Params:
    quotationChar = the quote character that closes the string (' or ")
Returns: always true
*/
bool parseString(char quotationChar) {
    tokenType = CSSTokenType.str;
    // skip first delimiter ' or "
    size_t p = tokenEnd + 1;
    while (p < src.length) {
        char ch = src.ptr[p];
        if (ch == '\r' || ch == '\n' || ch == 0x0C) {
            // Stop in front of the newline. Stepping back before it could leave tokenEnd
            // where it started, and the tokenizer would never advance.
            tokenType = CSSTokenType.badStr;
            tokenEnd = p;
            return true;
        }
        if (ch == quotationChar) {
            // end of string
            tokenEnd = p + 1;
            return true;
        }
        if (ch != '\\') {
            // normal character
            tokenText ~= ch;
            p++;
            continue;
        }
        if (p + 1 >= src.length) {
            // backslash is the last byte of the source: consume it, the string ends here
            p++;
            break;
        }
        char escaped = src.ptr[p + 1];
        if (escaped == '\r' || escaped == '\n' || escaped == 0x0C) {
            // \ NEWLINE: skip both the backslash and the newline (CR LF counts as one newline)
            p += 2;
            if (escaped == '\r' && p < src.length && src.ptr[p] == '\n')
                p++;
            continue;
        }
        dchar esc = parseEscape(p);
        if (esc == 0xFFFFFFFF) {
            esc = '?'; // replace invalid code point
            p++;
        }
        // encode to UTF-8
        appendEscapedIdentChar(esc);
    }
    // unexpected end of file
    tokenEnd = p;
    return true;
}
/**
Emit the character at tokenStart as a delim token.
The token covers one UTF-8 sequence as measured by std.utf.stride, or a single
byte when stride throws UTFException; the covered bytes are appended to tokenText.
*/
CSSTokenType emitDelimToken() {
    import std.utf : stride, UTFException;
    size_t width;
    try {
        width = stride(src[tokenStart .. $]);
    } catch (UTFException e) {
        width = 1;
    }
    tokenEnd = tokenStart + width;
    tokenText ~= src[tokenStart .. tokenEnd];
    tokenType = CSSTokenType.delim;
    return CSSTokenType.delim;
}
/**
Parse the token starting with '#' at tokenStart.

When '#' is followed by at least one name character (letter, digit, '_', '-',
non-ASCII byte or a valid escape) a hashToken is produced and tokenText holds
the name without the '#'. tokenTypeFlagId is true only when that name would
also be a valid identifier, so `#fff` and `#123456` are both hash tokens but
only the former is flagged as an ID. Otherwise '#' is emitted as a delim token.
*/
CSSTokenType parseHashToken() {
    tokenTypeFlagId = false;
    immutable size_t nameStart = tokenStart + 1; // first byte after '#'
    // decide the ID flag before anything is consumed
    immutable bool startsIdent = isIdentStart(nameStart);
    size_t p = nameStart;
    while (p < src.length) {
        char ch = src.ptr[p];
        if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || ch == '_' || ch == '-' || ch >= 0x80) {
            tokenText ~= ch;
            p++;
        } else if (ch == '\\') {
            dchar esc = parseEscape(p);
            if (esc == 0xFFFFFFFF)
                break; // invalid esc ends the name
            appendEscapedIdentChar(esc);
        } else {
            break;
        }
    }
    if (p == nameStart) {
        // no name after '#'
        return emitDelimToken();
    }
    tokenEnd = p;
    tokenType = CSSTokenType.hashToken;
    tokenTypeFlagId = startsIdent;
    return tokenType;
}
/**
Consume a comment; the characters at tokenEnd are known to be "/*".
tokenEnd is moved past the closing "*" "/" pair, or to the end of the source when the comment is not closed.
*/
CSSTokenType parseComment() {
    size_t p = tokenEnd + 2; // skip /*
    for (; p < src.length; p++) {
        immutable bool closes = src.ptr[p] == '*' && p + 1 < src.length && src.ptr[p + 1] == '/';
        if (closes) {
            p += 2;
            break;
        }
    }
    tokenType = CSSTokenType.comment;
    tokenEnd = p;
    return CSSTokenType.comment;
}
/**
Parse a unicode range; the characters at tokenEnd are "U+" or "u+" followed by a hex digit or '?'.

Forms handled: up to six hex digits, optionally padded with '?' wildcards to at
most six positions in total, or (without wildcards) a second hex number after '-'.
The result is stored in unicodeRangeStart / unicodeRangeEnd.
*/
CSSTokenType parseUnicodeRangeToken() {
    unicodeRangeStart = 0;
    unicodeRangeEnd = 0;
    size_t p = tokenEnd + 2; // skip U+
    // accumulate hex digits at p into value until six digits are held or a non-hex character is met
    void readHexDigits(ref uint value, ref int digits) {
        while (digits < 6 && p < src.length) {
            immutable int d = decodeHexDigit(src.ptr[p]);
            if (d < 0)
                break;
            value = (value << 4) | d;
            digits++;
            p++;
        }
    }
    uint first = 0;
    int firstDigits = 0;
    readHexDigits(first, firstDigits);
    // wildcards fill the remaining positions
    int wildcards = 0;
    while (p < src.length && wildcards + firstDigits < 6 && src.ptr[p] == '?') {
        wildcards++;
        p++;
    }
    if (wildcards) {
        // each '?' stands for one hex position: 0 at the start of the range, F at its end
        immutable int shift = 4 * wildcards;
        unicodeRangeStart = first << shift;
        unicodeRangeEnd = unicodeRangeStart + ((1 << shift) - 1);
    } else {
        unicodeRangeStart = first;
        char ch = p < src.length ? src.ptr[p] : 0;
        char ch2 = p + 1 < src.length ? src.ptr[p + 1] : 0;
        immutable int lead = decodeHexDigit(ch2);
        if (ch == '-' && lead >= 0) {
            p += 2; // skip - and first digit
            uint last = lead;
            int lastDigits = 1;
            readHexDigits(last, lastDigits);
            unicodeRangeEnd = last;
        } else {
            // single code point
            unicodeRangeEnd = unicodeRangeStart;
        }
    }
    tokenEnd = p;
    tokenType = CSSTokenType.unicodeRange;
    return tokenType;
}
/// Emit the one-character token at tokenStart (such as ( ) { } [ ] : ;) with the given type; the character is appended to tokenText.
CSSTokenType emitSingleCharToken(CSSTokenType type) {
    tokenText ~= src[tokenStart];
    tokenEnd = tokenStart + 1;
    tokenType = type;
    return type;
}
/// Emit the two-character token at tokenStart (such as $= or *=) with the given type; both characters are appended to tokenText.
CSSTokenType emitDoubleCharToken(CSSTokenType type) {
    immutable size_t end = tokenStart + 2;
    tokenText ~= src[tokenStart .. end];
    tokenEnd = end;
    tokenType = type;
    return type;
}
/**
Skip the remnants of a malformed url(...) and set tokenType to badUrl.
Consumption stops after the first unescaped ')' or at the end of the source
(a NUL byte also stops it, without being consumed). Escapes are stepped over as
a unit so that an escaped ')' does not end the token.
*/
void consumeBadUrl() {
    while (tokenEnd < src.length) {
        char ch = src.ptr[tokenEnd];
        if (ch == 0)
            break;
        if (ch == ')') {
            tokenEnd++;
            break;
        }
        if (ch == '\\') {
            char ch2 = tokenEnd + 1 < src.length ? src.ptr[tokenEnd + 1] : 0;
            if (isCSSValidEscSequence(ch, ch2)) {
                immutable size_t before = tokenEnd;
                parseEscape(tokenEnd);
                // parseEscape already moved past the whole escape; advancing again
                // would swallow the next byte, which may be the closing ')'
                if (tokenEnd != before)
                    continue;
            }
        }
        tokenEnd++;
    }
    tokenType = CSSTokenType.badUrl;
}
/**
Parse the body of a url token; tokenEnd is just after "url(".

Both quoted (url('a.css')) and unquoted (url(a.css)) forms are handled. On
success tokenType is url and tokenText holds the decoded address; a malformed
body is skipped up to its closing ')' and tokenType is badUrl. Reaching the end
of the source closes the url.
*/
void parseUrlToken() {
    tokenText.length = 0;
    skipWhiteSpace();
    if (tokenEnd >= src.length) {
        // "url(" at the very end of the source: empty url (skipWhiteSpace left tokenType as whitespace)
        tokenType = CSSTokenType.url;
        return;
    }
    char ch = src.ptr[tokenEnd];
    if (ch == '\'' || ch == '\"') {
        parseString(ch);
        // a string broken by a newline makes the whole url bad
        if (tokenType != CSSTokenType.badStr) {
            skipWhiteSpace();
            ch = tokenEnd < src.length ? src.ptr[tokenEnd] : 0;
            if (ch == ')' || ch == 0) {
                // valid URL token
                if (ch == ')')
                    tokenEnd++;
                tokenType = CSSTokenType.url;
                return;
            }
        }
        // bad url
        consumeBadUrl();
        return;
    }
    // not quoted
    for (;;) {
        if (skipWhiteSpace()) {
            // whitespace is only allowed right before the closing ')'
            ch = tokenEnd < src.length ? src.ptr[tokenEnd] : 0;
            if (ch == ')' || ch == 0) {
                if (ch == ')')
                    tokenEnd++;
                tokenType = CSSTokenType.url;
                return;
            }
            consumeBadUrl();
            return;
        }
        ch = tokenEnd < src.length ? src.ptr[tokenEnd] : 0;
        if (ch == ')' || ch == 0) {
            if (ch == ')')
                tokenEnd++;
            tokenType = CSSTokenType.url;
            return;
        }
        if (ch == '(' || ch == '\'' || ch == '\"' || isCSSNonPrintable(ch)) {
            consumeBadUrl();
            return;
        }
        if (ch == '\\') {
            char ch2 = tokenEnd + 1 < src.length ? src.ptr[tokenEnd + 1] : 0;
            dchar esc = cast(dchar)0xFFFFFFFF;
            if (isCSSValidEscSequence(ch, ch2))
                esc = parseEscape(tokenEnd);
            if (esc == 0xFFFFFFFF) {
                consumeBadUrl();
                return;
            }
            // store the decoded character; parseEscape has already advanced tokenEnd past the escape
            appendEscapedIdentChar(esc);
            continue;
        }
        tokenText ~= ch;
        tokenEnd++;
    }
}
/**
Read the next token.

tokenStart is moved to the end of the previous token and tokenText is emptied;
the token's end offset, type and payload fields are then filled in.

Returns: the type of the token read (also stored in tokenType), eof at the end of the source.
*/
CSSTokenType next() {
    // move beginning of token
    tokenStart = tokenEnd;
    tokenText.length = 0;
    // check for whitespace
    if (skipWhiteSpace())
        return tokenType; // whitespace token
    // check for eof
    if (tokenEnd >= src.length) {
        // keep the tokenType field in step with the returned value
        tokenType = CSSTokenType.eof;
        return tokenType;
    }
    char ch = src.ptr[tokenEnd];
    char nextCh = tokenEnd + 1 < src.length ? src.ptr[tokenEnd + 1] : 0;
    // tokens fully determined by their first one or two characters
    switch (ch) {
        case '\"':
        case '\'':
            parseString(ch);
            return tokenType;
        case '#':
            return parseHashToken();
        case '$':
            if (nextCh == '=')
                return emitDoubleCharToken(CSSTokenType.suffixMatch);
            return emitDelimToken();
        case '^':
            if (nextCh == '=')
                return emitDoubleCharToken(CSSTokenType.prefixMatch);
            return emitDelimToken();
        case '(':
            return emitSingleCharToken(CSSTokenType.parentOpen);
        case ')':
            return emitSingleCharToken(CSSTokenType.parentClose);
        case '[':
            return emitSingleCharToken(CSSTokenType.squareOpen);
        case ']':
            return emitSingleCharToken(CSSTokenType.squareClose);
        case '{':
            return emitSingleCharToken(CSSTokenType.curlyOpen);
        case '}':
            return emitSingleCharToken(CSSTokenType.curlyClose);
        case ',':
            return emitSingleCharToken(CSSTokenType.comma);
        case ':':
            return emitSingleCharToken(CSSTokenType.colon);
        case ';':
            return emitSingleCharToken(CSSTokenType.semicolon);
        case '*':
            if (nextCh == '=')
                return emitDoubleCharToken(CSSTokenType.substringMatch);
            return emitDelimToken();
        case '~':
            if (nextCh == '=')
                return emitDoubleCharToken(CSSTokenType.includeMatch);
            return emitDelimToken();
        case '|':
            if (nextCh == '=')
                return emitDoubleCharToken(CSSTokenType.dashMatch);
            if (nextCh == '|')
                return emitDoubleCharToken(CSSTokenType.column);
            return emitDelimToken();
        case '/':
            if (nextCh == '*')
                return parseComment();
            return emitDelimToken();
        default:
            break;
    }
    char nextCh2 = tokenEnd + 2 < src.length ? src.ptr[tokenEnd + 2] : 0;
    // the unicode range check must come before the identifier check, since 'u' also starts an identifier
    if ((ch == 'u' || ch == 'U') && nextCh == '+' && (decodeHexDigit(nextCh2) >= 0 || nextCh2 == '?'))
        return parseUnicodeRangeToken();
    if (parseNumber())
        return tokenType;
    // make sure a rejected number attempt leaves nothing behind in the token text
    tokenText.length = 0;
    if (parseIdent()) {
        ch = tokenEnd < src.length ? src.ptr[tokenEnd] : 0;
        if (ch == '(') {
            // identifier directly followed by '(' is a function, or a url token for url(
            tokenEnd++;
            import std.uni : icmp;
            if (tokenText.length == 3 && icmp(tokenText, "url") == 0) {
                // parse URL function
                parseUrlToken();
            } else {
                tokenType = CSSTokenType.func;
            }
        }
        return tokenType;
    }
    if (ch == '-') {
        if (nextCh == '-' && nextCh2 == '>') {
            tokenEnd = tokenStart + 3;
            tokenType = CSSTokenType.cdc;
            tokenText ~= src[tokenStart .. tokenEnd];
            return tokenType;
        }
        return emitDelimToken();
    }
    if (ch == '<') {
        char nextCh3 = tokenEnd + 3 < src.length ? src.ptr[tokenEnd + 3] : 0;
        if (nextCh == '!' && nextCh2 == '-' && nextCh3 == '-') {
            tokenEnd = tokenStart + 4;
            tokenType = CSSTokenType.cdo;
            tokenText ~= src[tokenStart .. tokenEnd];
            return tokenType;
        }
        return emitDelimToken();
    }
    if (ch == '@') {
        if (isIdentStart(tokenEnd + 1)) {
            // the keyword text is stored without the '@'
            tokenEnd++;
            parseIdent();
            tokenType = CSSTokenType.atKeyword;
            return tokenType;
        }
        return emitDelimToken();
    }
    return emitDelimToken();
}
/**
Same as next(), but returns the token as a filled CSSToken value.

The text is copied for str, ident, atKeyword, url, func, hashToken and delim
tokens; numbers and dimensions carry their numeric values (dimensions also
their unit), hash tokens carry the valid-ID flag, unicode ranges carry both
range bounds.
*/
CSSToken nextToken() {
    CSSToken res;
    res.type = next();
    switch (res.type) {
        case CSSTokenType.str:
        case CSSTokenType.ident:
        case CSSTokenType.atKeyword:
        case CSSTokenType.url:
        case CSSTokenType.func:
        case CSSTokenType.hashToken:
        case CSSTokenType.delim:
            if (tokenText.length)
                res.text = tokenText.idup;
            break;
        default:
            break;
    }
    if (res.type == CSSTokenType.dimension && dimensionUnit.length)
        res.dimensionUnit = dimensionUnit.idup;
    if (res.type == CSSTokenType.dimension || res.type == CSSTokenType.number) {
        res.doubleValue = tokenDoubleValue;
        res.intValue = tokenIntValue;
        res.typeFlagInteger = tokenTypeInteger;
    } else if (res.type == CSSTokenType.hashToken) {
        // the ID flag is computed while parsing a hash token, so that is the token it belongs to
        res.typeFlagId = tokenTypeFlagId;
    } else if (res.type == CSSTokenType.unicodeRange) {
        res.unicodeRangeStart = unicodeRangeStart;
        res.unicodeRangeEnd = unicodeRangeEnd;
    }
    return res;
}
852 }
853
// Tokenizer walk-through: one input exercising most token kinds, checked token by token.
unittest {
    CSSTokenizer tokenizer;
    tokenizer.start("ident-1{ }\n#id\n'blabla' \"bla bla 2\" -ident2*=12345 -.234e+5 "
        ~ "1.23px/* some comment */U+123?!"
        ~"url( 'text.css' )url(bad url)functionName()url( bla )"
        ~"'\\30 \\31'");

    // read one token and check its type
    void expect(CSSTokenType type) {
        assert(tokenizer.next() == type);
    }
    // read one token and check both its type and its text
    void expectText(CSSTokenType type, string text) {
        assert(tokenizer.next() == type);
        assert(tokenizer.tokenText == text);
    }

    expectText(CSSTokenType.ident, "ident-1");
    expect(CSSTokenType.curlyOpen);
    expect(CSSTokenType.whitespace);
    expect(CSSTokenType.curlyClose);
    expect(CSSTokenType.whitespace); //newline
    expectText(CSSTokenType.hashToken, "id");
    assert(tokenizer.tokenTypeFlagId == true);
    expect(CSSTokenType.whitespace); //newline
    expectText(CSSTokenType.str, "blabla");
    expect(CSSTokenType.whitespace);
    expectText(CSSTokenType.str, "bla bla 2");
    expect(CSSTokenType.whitespace);
    expectText(CSSTokenType.ident, "-ident2");
    expect(CSSTokenType.substringMatch);
    expectText(CSSTokenType.number, "12345");
    assert(tokenizer.tokenIntValue == 12345);
    expect(CSSTokenType.whitespace);
    expectText(CSSTokenType.number, "-.234e+5");
    assert(tokenizer.tokenIntValue == -23400);
    assert(tokenizer.tokenDoubleValue == -.234e+5);
    expect(CSSTokenType.whitespace);

    // second part of the input
    expectText(CSSTokenType.dimension, "1.23");
    assert(tokenizer.tokenIntValue == 1);
    assert(tokenizer.tokenDoubleValue == 1.23);
    assert(tokenizer.dimensionUnit == "px");
    expect(CSSTokenType.comment);
    expect(CSSTokenType.unicodeRange);
    assert(tokenizer.unicodeRangeStart == 0x1230 && tokenizer.unicodeRangeEnd == 0x123F);
    expectText(CSSTokenType.delim, "!");

    // third part of the input
    expectText(CSSTokenType.url, "text.css");
    expect(CSSTokenType.badUrl);
    expectText(CSSTokenType.func, "functionName");
    expect(CSSTokenType.parentClose);
    expectText(CSSTokenType.url, "bla");

    // fourth part of the input
    expectText(CSSTokenType.str, "01"); //'\30 \31'
    expect(CSSTokenType.eof);
}
913
914
/**
Split CSS source into tokens.

Params:
    src = preprocessed utf-8 CSS source
Returns: all tokens in source order; the last element is always the EOF token
*/
static CSSToken[] tokenizeCSS(string src) {
    CSSTokenizer lexer;
    lexer.start(src);
    CSSToken[] tokens;
    tokens.assumeSafeAppend();
    CSSTokenType lastType;
    do {
        tokens ~= lexer.nextToken();
        lastType = tokens[$ - 1].type;
    } while (lastType != CSSTokenType.eof);
    return tokens;
}
931
// tokenizeCSS on a tiny rule: ident, whitespace, '{', dimension, '}', eof.
unittest {
    auto tokens = tokenizeCSS("pre {123em}");
    immutable CSSTokenType[] expectedTypes = [
        CSSTokenType.ident,
        CSSTokenType.whitespace,
        CSSTokenType.curlyOpen,
        CSSTokenType.dimension,
        CSSTokenType.curlyClose,
        CSSTokenType.eof,
    ];
    assert(tokens.length == 6);
    foreach (i, expected; expectedTypes)
        assert(tokens[i].type == expected);
    assert(tokens[0].text == "pre");
    assert(tokens[3].typeFlagInteger == true);
    assert(tokens[3].intValue == 123);
    assert(tokens[3].dimensionUnit == "em");
}
947
// easy way to extract and apply imports w/o full document parsing
/**
Extract CSS @import rules from source.

Only the leading part of the stylesheet is scanned: scanning stops at the first
'{', at the first identifier met outside a rule, or at any at-rule other than
@charset and @import. At-rule names are matched without regard to letter case.

Params:
    src = CSS source
Returns: one entry per `@import <string or url> ... ;` rule, with startPos,
endPos and url filled in (content is left empty)
*/
CSSImportRule[] extractCSSImportRules(string src) {
    import std.uni : icmp;
    enum ParserState {
        start, // before rule begin, switch to this state after ;
        afterImport, // after @import
        afterCharset, // after @charset
        afterCharsetName, // after the string following @charset
        afterImportUrl, // after the string or url following @import
    }
    ParserState state = ParserState.start;
    CSSImportRule[] res;
    CSSTokenizer tokenizer;
    tokenizer.start(src);
    string url;
    size_t startPos = 0;
    for (;;) {
        CSSTokenType type = tokenizer.next();
        if (type == CSSTokenType.eof)
            break;
        if (type == CSSTokenType.whitespace || type == CSSTokenType.comment)
            continue; // skip whitespaces and comments
        if (type == CSSTokenType.atKeyword) {
            // at-rule names are case-insensitive in CSS (@IMPORT is the same rule as @import)
            if (icmp(tokenizer.tokenText, "charset") == 0) {
                state = ParserState.afterCharset;
                continue;
            }
            if (icmp(tokenizer.tokenText, "import") != 0)
                break;
            // import rule
            state = ParserState.afterImport;
            startPos = tokenizer.tokenStart;
            continue;
        }
        if (type == CSSTokenType.str || type == CSSTokenType.url) {
            if (state == ParserState.afterImport) {
                url = tokenizer.tokenText.idup;
                state = ParserState.afterImportUrl;
                continue;
            }
            if (state == ParserState.afterCharset) {
                state = ParserState.afterCharsetName;
                continue;
            }
            break;
        }
        if (type == CSSTokenType.curlyOpen)
            break;
        if (type == CSSTokenType.ident && state == ParserState.start)
            break; // valid @imports may be only at the beginning of file
        if (type == CSSTokenType.semicolon) {
            if (state == ParserState.afterImportUrl) {
                // complete rule: record it
                CSSImportRule rule;
                rule.startPos = startPos;
                rule.endPos = tokenizer.tokenEnd;
                rule.url = url;
                res ~= rule;
            }
            state = ParserState.start;
            continue;
        }
    }
    return res;
}
1018
/**
Replace each import rule in the source with the rule's content.

Params:
    src = CSS source the rules were extracted from
    rules = rules with startPos / endPos offsets into src and content filled in,
            processed in array order
Returns: src itself when there are no rules, otherwise a new string in which
every [startPos, endPos) span is replaced by the corresponding content
*/
string applyCSSImportRules(string src, CSSImportRule[] rules) {
    if (rules.length == 0)
        return src; // no rules
    char[] output;
    output.assumeSafeAppend;
    size_t copiedUpTo = 0;
    foreach (rule; rules) {
        // text between the previous rule and this one, then the imported content
        output ~= src[copiedUpTo .. rule.startPos];
        output ~= rule.content;
        copiedUpTo = rule.endPos;
    }
    // remainder of the source after the last rule
    if (copiedUpTo < src.length)
        output ~= src[copiedUpTo .. $];
    return cast(string)output;
}
1037
1038
// Import extraction and substitution: only the two imports before the first rule set are found.
unittest {
    string src = q{
@charset "utf-8";
/* comment must be ignored */
@import "file1.css"; /* string */
@import url(file2.css); /* url */
pre {}
@import "ignore_me.css";
p {}
};
    auto rules = extractCSSImportRules(src);
    assert(rules.length == 2);
    immutable string[2] expectedUrls = ["file1.css", "file2.css"];
    immutable string[2] contents = ["[file1_content]", "[file2_content]"];
    foreach (i; 0 .. 2)
        assert(rules[i].url == expectedUrls[i]);
    foreach (i; 0 .. 2)
        rules[i].content = contents[i];
    string replaced = applyCSSImportRules(src, rules);
    assert (replaced.length != src.length);
}
1058
/// Kinds of nodes in the parsed CSS tree.
enum ASTNodeType {
    simpleBlock,    /// block delimited by a pair of brackets
    componentValue, /// generic component value
    preservedToken, /// token kept as is
    func,           /// function
    atRule,         /// at-rule
    qualifiedRule,  /// qualified rule
}
1067
/// Base class of all tree nodes.
class ASTNode {
    /// kind of this node; assigned by the constructors of derived classes
    ASTNodeType type;
}
1071
/// Component value; base class of blocks, functions and preserved tokens.
class ComponentValueNode : ASTNode {
    /// Creates a node tagged as componentValue.
    this() {
        this.type = ASTNodeType.componentValue;
    }
}
1077
/// Bracketed block together with the component values it contains.
class SimpleBlockNode : ComponentValueNode {
    /// type of the token that opens the block; curlyOpen unless changed
    CSSTokenType blockType = CSSTokenType.curlyOpen;
    /// content of the block
    ComponentValueNode[] componentValues;
    /// Creates a node tagged as simpleBlock.
    this() {
        this.type = ASTNodeType.simpleBlock;
    }
}
1085
/// Function together with the component values passed to it.
class FunctionNode : ComponentValueNode {
    /// function name, as given to the constructor
    string name;
    /// arguments of the function
    ComponentValueNode[] componentValues;
    /**
    Creates a node tagged as func.
    Params:
        name = function name; stored in the `name` field so it is not lost
    */
    this(string name) {
        this.name = name;
        type = ASTNodeType.func;
    }
}
1092
/// Leaf node holding a copy of a single token.
class PreservedTokenNode : ComponentValueNode {
    /// the token kept by this node
    CSSToken token;
    /// Creates a node tagged as preservedToken holding a copy of `token`.
    this(ref CSSToken token) {
        type = ASTNodeType.preservedToken;
        this.token = token;
    }
}
1100
/// Qualified rule: a list of component values and a block.
class QualifiedRuleNode : ASTNode {
    /// component values of the rule
    ComponentValueNode[] componentValues;
    /// block of the rule
    SimpleBlockNode block;
    /// Creates a node tagged as qualifiedRule.
    this() {
        this.type = ASTNodeType.qualifiedRule;
    }
}
1108
/// At-rule: a qualified rule that additionally has a name.
class ATRuleNode : QualifiedRuleNode {
    /// name of the at-rule
    string name;
    /// Creates a node tagged as atRule (replacing the tag set by the base class constructor).
    this() {
        this.type = ASTNodeType.atRule;
    }
}
1115
1116
1117 class CSSParser {
1118 CSSToken[] tokens;
1119 int pos = 0;
/// Creates a parser reading from the given token array (the array is referenced, not copied).
this(CSSToken[] _tokens) {
    this.tokens = _tokens;
}
/// Token at the current position, by reference; the position is not changed.
@property ref CSSToken currentToken() {
    immutable size_t index = pos;
    return tokens[index];
}
/// Token following the current one, by reference; when there is none, the current token is returned instead.
@property ref CSSToken nextToken() {
    immutable size_t following = pos + 1;
    if (following < tokens.length)
        return tokens[following];
    return tokens[pos];
}
/**
Move to the next token.
The position never moves past the last token, so the current token stays readable.
Returns: true if the position was advanced, false if it already was on the last token (or there are no tokens).
*/
bool next() {
    if (pos + 1 < tokens.length) {
        pos++;
        return true;
    }
    return false;
}
/**
Move forward while the current token is whitespace, a comment or a delimiter.
Does not move if the current token is none of these; also stops when next() cannot advance any further.
Returns: type of the token the position ends on
*/
CSSTokenType skipWhiteSpace() {
    // NOTE(review): delim tokens are skipped together with whitespace and comments, as before — confirm this is intended
    for (;;) {
        immutable CSSTokenType t = currentToken.type;
        if (t != CSSTokenType.whitespace && t != CSSTokenType.comment && t != CSSTokenType.delim)
            return t;
        // stop instead of spinning when there is nothing left to move to
        if (!next())
            return t;
    }
}
/// Step over the current token, then over any tokens skipWhiteSpace() skips; returns the type of the token reached.
@property CSSTokenType nextNonWhiteSpace() {
    next();
    immutable CSSTokenType reached = skipWhiteSpace();
    return reached;
}
1150 SimpleBlockNode parseSimpleBlock() {
1151 auto type = skipWhiteSpace();
1152 CSSTokenType closeType;
1153 if (type == CSSTokenType.curlyOpen) {
1154 closeType = CSSTokenType.curlyClose;
1155 } else if (type == CSSTokenType.squareOpen) {
1156 closeType = CSSTokenType.squareClose;
1157 } else if (type == CSSTokenType.parentOpen) {
1158 closeType = CSSTokenType.parentClose;
1159 } else {
1160 // not a simple block
1161 return null;
1162 }
1163 SimpleBlockNode res = new SimpleBlockNode();
1164 res.blockType = type;
1165 auto t = nextNonWhiteSpace();
1166 res.componentValues = parseComponentValueList(closeType);
1167 t = skipWhiteSpace();
1168 if (t == closeType)
1169 nextNonWhiteSpace();
1170 return res;
1171 }
1172 FunctionNode parseFunctionBlock() {
1173 auto type = skipWhiteSpace();
1174 if (type != CSSTokenType.func)
1175 return null;
1176 FunctionNode res = new FunctionNode(currentToken.text);
1177 auto t = nextNonWhiteSpace();
1178 res.componentValues = parseComponentValueList(CSSTokenType.parentClose);
1179 t = skipWhiteSpace();
1180 if (t == CSSTokenType.parentClose)
1181 nextNonWhiteSpace();
1182 return res;
1183 }
1184 ComponentValueNode[] parseComponentValueList(CSSTokenType endToken1 = CSSTokenType.eof, CSSTokenType endToken2 = CSSTokenType.eof) {
1185 ComponentValueNode[] res;
1186 for (;;) {
1187 auto type = skipWhiteSpace();
1188 if (type == CSSTokenType.eof)
1189 return res;
1190 if (type == endToken1 || type == endToken2)
1191 return res;
1192 if (type == CSSTokenType.squareOpen || type == CSSTokenType.parentOpen || type == CSSTokenType.curlyOpen) {
1193 res ~= parseSimpleBlock();
1194 } else if (type == CSSTokenType.func) {
1195 res ~= parseFunctionBlock();
1196 } else {
1197 res ~= new PreservedTokenNode(currentToken);
1198 next();
1199 }
1200 }
1201 }
1202 ATRuleNode parseATRule() {
1203 auto type = skipWhiteSpace();
1204 if (type != CSSTokenType.atKeyword)
1205 return null;
1206 ATRuleNode res = new ATRuleNode();
1207 res.name = currentToken.text;
1208 type = nextNonWhiteSpace();
1209 res.componentValues = parseComponentValueList(CSSTokenType.semicolon, CSSTokenType.curlyOpen);
1210 type = skipWhiteSpace();
1211 if (type == CSSTokenType.semicolon) {
1212 next();
1213 return res;
1214 }
1215 if (type == CSSTokenType.curlyOpen) {
1216 res.block = parseSimpleBlock();
1217 return res;
1218 }
1219 if (type == CSSTokenType.eof)
1220 return res;
1221 return res;
1222 }
1223
1224 QualifiedRuleNode parseQualifiedRule() {
1225 auto type = skipWhiteSpace();
1226 if (type == CSSTokenType.eof)
1227 return null;
1228 QualifiedRuleNode res = new QualifiedRuleNode();
1229 res.componentValues = parseComponentValueList(CSSTokenType.curlyOpen);
1230 type = skipWhiteSpace();
1231 if (type == CSSTokenType.curlyOpen) {
1232 res.block = parseSimpleBlock();
1233 }
1234 return res;
1235 }
1236 }
1237
// CSSParser.parseATRule tests
unittest {
    // at-rule terminated by ';' has a name and no block
    {
        auto parser = new CSSParser(tokenizeCSS("@atRuleName;"));
        auto rule = parser.parseATRule();
        assert(rule !is null);
        assert(rule.name == "atRuleName");
        assert(rule.block is null);
    }

    // at-rule followed by an empty { } block
    {
        auto parser = new CSSParser(tokenizeCSS("@atRuleName2 { }"));
        auto rule = parser.parseATRule();
        assert(rule !is null);
        assert(rule.name == "atRuleName2");
        assert(rule.block !is null);
        assert(rule.block.blockType == CSSTokenType.curlyOpen);
    }

    // one preserved token in the prelude, one value inside the block
    {
        auto parser = new CSSParser(tokenizeCSS("@atRuleName3 url('bla') { 123 }"));
        auto rule = parser.parseATRule();
        assert(rule !is null);
        assert(rule.name == "atRuleName3");
        assert(rule.componentValues.length == 1);
        assert(rule.componentValues[0].type == ASTNodeType.preservedToken);
        assert(rule.block !is null);
        assert(rule.block.blockType == CSSTokenType.curlyOpen);
        assert(rule.block.componentValues.length == 1);
    }

    // string in the prelude, function call inside the block
    {
        auto parser = new CSSParser(tokenizeCSS("@atRuleName4 \"value\" { funcName(123) }"));
        auto rule = parser.parseATRule();
        assert(rule !is null);
        assert(rule.name == "atRuleName4");
        assert(rule.componentValues.length == 1);
        assert(rule.componentValues[0].type == ASTNodeType.preservedToken);
        assert(rule.block !is null);
        assert(rule.block.blockType == CSSTokenType.curlyOpen);
        assert(rule.block.componentValues.length == 1);
        assert(rule.block.componentValues[0].type == ASTNodeType.func);
    }
}
1270
// CSSParser.parseQualifiedRule test
unittest {
    // one prelude value (pre) and three values inside the block (display, ':', none)
    auto parser = new CSSParser(tokenizeCSS(" pre { display: none } "));
    auto rule = parser.parseQualifiedRule();
    assert(rule !is null);
    assert(rule.componentValues.length == 1);
    assert(rule.block !is null);
    assert(rule.block.componentValues.length == 3);
}