1 module dom.cssparser;
2 
/**
Normalizes a raw CSS byte stream before it is handed to the tokenizer.

Substitutions performed (byte-wise, on UTF-8 input):
    * U+000D CARRIAGE RETURN, U+000C FORM FEED, and the pair CR LF each become a single U+000A LINE FEED.
    * U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER (UTF-8 bytes EF BF BD).

Params:
    src = CSS source bytes; not modified
Returns: a newly built array holding the normalized text
*/
char[] preProcessCSS(char[] src) {
    char[] output;
    output.assumeSafeAppend();
    // true when the byte handled in the previous iteration was CR, so the LF of a CR LF pair can be dropped
    bool prevWasCR = false;
    foreach (c; src) {
        switch (c) {
            case 0:
                // U+FFFD encoded as UTF-8: 1110xxxx 10xxxxxx 10xxxxxx == EF BF BD
                output ~= 0xEF;
                output ~= 0xBF;
                output ~= 0xBD;
                break;
            case 0x0D:
            case 0x0C:
                output ~= 0x0A;
                break;
            case 0x0A:
                // the preceding CR has already produced the line feed
                if (!prevWasCR)
                    output ~= 0x0A;
                break;
            default:
                output ~= c;
                break;
        }
        prevWasCR = (c == 0x0D);
    }
    return output;
}
31 
/**
Describes one `@import` rule located in a CSS source text.
*/
struct CSSImportRule {
    /** Byte offset at which the `@import` rule starts. */
    size_t startPos;
    /** Byte offset of the first char after the closing ';' of the rule. */
    size_t endPos;
    /** URL of the CSS to import. */
    string url;
    /** Content of the downloaded URL, to be applied in place of the rule. */
    string content;
}
42 
/**
Kinds of tokens produced by the CSS tokenizer.

Note: there is no separate newline token; newlines (\n \r\n \r \f) are reported as whitespace.
*/
enum CSSTokenType : ubyte {
    /// end of file
    eof,
    /// delimiter (may be unknown token or error)
    delim,
    /// comment: /* some comment */
    comment,
    /// space, \t, newline
    whitespace,
    /// identifier
    ident,
    /// url()
    url,
    /// url() which is bad
    badUrl,
    /// function(
    func,
    /// string '' or ""
    str,
    /// string '' or "" ended with newline character
    badStr,
    /// #
    hashToken,
    /// ^=
    prefixMatch,
    /// $=
    suffixMatch,
    /// *=
    substringMatch,
    /// ~=
    includeMatch,
    /// |=
    dashMatch,
    /// ||
    column,
    /// (
    parentOpen,
    /// )
    parentClose,
    /// [
    squareOpen,
    /// ]
    squareClose,
    /// {
    curlyOpen,
    /// }
    curlyClose,
    /// ,
    comma,
    /// :
    colon,
    /// ;
    semicolon,
    /// number, e.g. +12345.324e-3
    number,
    /// number with dimension, e.g. 1.23px
    dimension,
    /// <!--
    cdo,
    /// -->
    cdc,
    /// @someKeyword -- tokenText will contain keyword w/o @ prefix
    atKeyword,
    /// U+XXX-XXX
    unicodeRange,
}
78 
/**
Self-contained snapshot of a single token.

The anonymous union holds a payload whose meaning depends on `type`;
its members share storage, so only the group matching `type` is meaningful.
*/
struct CSSToken {
    /** Kind of token. */
    CSSTokenType type;
    /** Token text. */
    string text;
    /** Unit of a dimension token. */
    string dimensionUnit;
    union {
        // flag: true if identifier is valid ID
        bool typeFlagId;
        // payload for number and dimension tokens
        struct {
            // integer value
            long intValue = 0;
            // floating point value
            double doubleValue = 0;
            // true if number is integer, false if double
            bool typeFlagInteger;
        }
        // payload for unicodeRange tokens (initialized to 0 via intValue=0)
        struct {
            // first code point of the range
            uint unicodeRangeStart;
            // last code point of the range
            uint unicodeRangeEnd;
        }
    }
}
96 
/**
Converts one ASCII hexadecimal digit to its numeric value.

Params:
    ch = character to decode
Returns: 0..15 for '0'-'9', 'a'-'f', 'A'-'F'; -1 for any other character
*/
int decodeHexDigit(char ch) {
    switch (ch) {
        case '0': .. case '9':
            return ch - '0';
        case 'a': .. case 'f':
            return ch - 'a' + 10;
        case 'A': .. case 'F':
            return ch - 'A' + 10;
        default:
            return -1;
    }
}
106 
/// Returns true if ch is space, tab, form feed, carriage return or line feed.
bool isCSSWhiteSpaceChar(char ch) {
    switch (ch) {
        case ' ':
        case '\t':
        case 0x0C:
        case 0x0D:
        case 0x0A:
            return true;
        default:
            return false;
    }
}
110 
/// Returns true if ch is an ASCII letter, an underscore, or a non-ASCII byte (high bit set).
bool isCSSNameStart(char ch) {
    if (ch == '_')
        return true;
    if (ch >= 0x80)
        return true; // any byte of a multi-byte UTF-8 sequence
    if (ch >= 'a' && ch <= 'z')
        return true;
    return ch >= 'A' && ch <= 'Z';
}
115 
/// Returns true for the control characters 0x00-0x08, 0x0B, 0x0E-0x1F and 0x7F.
bool isCSSNonPrintable(char ch) {
    switch (ch) {
        case 0: .. case 8:
        case 0x0B:
        case 0x0E: .. case 0x1F:
        case 0x7F:
            return true;
        default:
            return false;
    }
}
/**
Checks whether two code points form a valid escape.

Params:
    ch  = first code point; must be U+005C REVERSE SOLIDUS (\)
    ch2 = code point following the backslash
Returns: true if ch is a backslash and ch2 is not a newline.
    Carriage return, line feed and form feed all count as newlines here,
    because the tokenizer may be run on input that has not been preprocessed.
*/
bool isCSSValidEscSequence(char ch, char ch2) {
    // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
    if (ch != '\\')
        return false;
    // A backslash followed by a newline is not an escape.
    if (ch2 == '\r' || ch2 == '\n' || ch2 == 0x0C)
        return false;
    return true;
}
134 
/**
Pull-style CSS tokenizer.

Usage: call start() with the source text, then call next() (or nextToken())
repeatedly until it returns CSSTokenType.eof. After each call the public
fields describe the current token.
*/
struct CSSTokenizer {
    /// CSS source code (utf-8)
    char[] src;
    /// current token type
    CSSTokenType tokenType;
    /// current token start byte offset
    size_t tokenStart;
    /// current token end byte offset
    size_t tokenEnd;
    /// text of the current token (escapes decoded); cleared by each next() call
    char[] tokenText;
    /// unit of the current dimension token
    char[] dimensionUnit;
    bool tokenTypeFlagId; // for hash token - true if the name is a valid ID
    bool tokenTypeInteger; // for number and dimension - true if number is integer, false if double
    long tokenIntValue; // for number and dimension
    double tokenDoubleValue; // for number and dimension
    uint unicodeRangeStart = 0; // for unicodeRange
    uint unicodeRangeEnd = 0; // for unicodeRange

    /**
      Start tokenizing a new source text.
      The text is copied; positions are reset to the beginning and the text buffers are emptied.
    */
    void start(string _src) {
        src = _src.dup;
        tokenStart = tokenEnd = 0;
        // pre-allocate the buffers but keep them empty, so no stale content is visible before the first token
        tokenText.length = 0;
        tokenText.reserve(1000);
        dimensionUnit.length = 0;
        dimensionUnit.reserve(1000);
    }

    /// Returns true if the end of the source has been reached.
    bool eof() {
        return tokenEnd >= src.length;
    }

    /**
      Skip whitespace; return true if at least one whitespace char is skipped; move tokenEnd position.
      tokenType is always set to whitespace (newlines are not reported separately).
      A U+FFFD REPLACEMENT CHARACTER (bytes EF BF BD) is skipped together with whitespace.
    */
    bool skipWhiteSpace() {
        bool skipped = false;
        tokenType = CSSTokenType.whitespace;
        for (;;) {
            if (tokenEnd >= src.length) {
                // report what has been skipped so far, even when input ends with whitespace
                return skipped;
            }
            char ch = src.ptr[tokenEnd];
            if (ch == '\r' || ch == '\n' || ch == 0x0C || ch == ' ' || ch == '\t') {
                tokenEnd++;
                skipped = true;
            } else if (ch == 0xEF && tokenEnd + 2 < src.length && src.ptr[tokenEnd + 1] == 0xBF && src.ptr[tokenEnd + 2] == 0xBD) {
                // U+FFFD 1110xxxx 10xxxxxx 10xxxxxx == EF BF BD: skip the whole 3-byte sequence
                tokenEnd += 3;
                skipped = true;
            } else {
                return skipped;
            }
        }
    }

    /**
      Decode the escape sequence whose backslash is at position p.

      On success p is moved past the escape (including one whitespace char terminating
      a hex escape) and the decoded code point is returned.
      On failure (end of input or newline after the backslash) p is left unchanged and
      0xFFFFFFFF is returned.
    */
    private dchar parseEscape(ref size_t p) {
        size_t pos = p + 1;
        if (pos >= src.length)
            return cast(dchar)0xFFFFFFFF; // out of bounds
        char ch = src.ptr[pos];
        if (ch == '\r' || ch == '\n' || ch == 0x0C)
            return cast(dchar)0xFFFFFFFF; // unexpected newline: invalid esc sequence
        int hex = decodeHexDigit(ch);
        if (hex >= 0) {
            // hex escape: 1 to 6 hex digits
            pos++;
            dchar res = hex;
            int count = 1;
            while (count < 6 && pos < src.length) {
                hex = decodeHexDigit(src.ptr[pos]);
                if (hex < 0)
                    break;
                res = (res << 4) | hex;
                pos++;
                count++;
            }
            // a single whitespace char after the digits belongs to the escape;
            // look at the char actually following the digits, not at the last one read
            if (pos < src.length && isCSSWhiteSpaceChar(src.ptr[pos]))
                pos++;
            p = pos;
            return res;
        }
        if (ch >= 0x80) {
            // escaped non-ASCII character: decode the whole UTF-8 sequence instead of a single byte
            import std.utf : decode, UTFException;
            size_t idx = pos;
            try {
                dchar decoded = decode(src, idx);
                p = idx;
                return decoded;
            } catch (UTFException e) {
                // malformed sequence: fall back to escaping just this byte
            }
        }
        // not a hex: one character is escaped
        p = pos + 1;
        return ch;
    }

    /// Append code point ch to buf as UTF-8; an invalid code point is replaced with '?'.
    private static void appendCodePoint(ref char[] buf, dchar ch) {
        if (ch < 0x80) {
            // put as is
            buf ~= cast(char)ch;
        } else {
            // UTF-8 encode
            import std.utf : encode, isValidDchar;
            char[4] encoded;
            size_t chars = isValidDchar(ch) ? encode(encoded, ch) : 0;
            if (chars)
                buf ~= encoded[0 .. chars];
            else
                buf ~= '?'; // replacement for invalid character
        }
    }

    /// Append a decoded escape to the current token text.
    private void appendEscapedIdentChar(dchar ch) {
        appendCodePoint(tokenText, ch);
    }

    /**
      Consume identifier at current position (tokenEnd), append it to the passed buffer.

      Params:
          tokenText = buffer to append to (shadows the member of the same name on purpose,
                      so the same code can fill dimensionUnit)
      Returns: true and moves tokenEnd past the identifier on success;
               false (tokenEnd unchanged) if there is no identifier here.
    */
    bool consumeIdent(ref char[] tokenText) {
        size_t p = tokenEnd;
        if (p >= src.length)
            return false; // eof
        char ch = src.ptr[p];
        bool hasHyphen = false;
        if (ch == '-') {
            p++;
            if (p >= src.length)
                return false; // eof
            hasHyphen = true;
            ch = src.ptr[p];
        }
        if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_' || ch >= 0x80) {
            if (hasHyphen)
                tokenText ~= '-';
            tokenText ~= ch;
            p++;
        } else if (ch == '\\') {
            dchar esc = parseEscape(p);
            if (esc == 0xFFFFFFFF)
                return false; // invalid esc
            // keep the leading hyphen also when the name starts with an escape
            if (hasHyphen)
                tokenText ~= '-';
            // encode to UTF-8 into the buffer we were given
            appendCodePoint(tokenText, esc);
        } else {
            return false;
        }
        for (;;) {
            if (p >= src.length)
                break;
            ch = src.ptr[p];
            if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')  || (ch >= '0' && ch <= '9') || ch == '_'  || ch == '-' || ch >= 0x80) {
                tokenText ~= ch;
                p++;
            } else if (ch == '\\') {
                dchar esc = parseEscape(p);
                if (esc == 0xFFFFFFFF)
                    break; // invalid esc
                // encode to UTF-8 into the buffer we were given
                appendCodePoint(tokenText, esc);
            } else {
                break;
            }
        }
        tokenEnd = p;
        return true;
    }

    /**
      Parse identifier.
      Returns true if identifier is parsed. tokenText will contain identifier text.
    */
    bool parseIdent() {
        if (!isIdentStart(tokenEnd))
            return false;
        if (consumeIdent(tokenText)) {
            tokenType = CSSTokenType.ident;
            return true;
        }
        return false;
    }

    /** returns true if an identifier starts at byte offset p */
    bool isIdentStart(size_t p) {
        if (p >= src.length)
            return false;
        char ch = src.ptr[p];
        if (isCSSNameStart(ch))
            return true;
        if (ch == '-') {
            //If the second code point is a name-start code point or the second and third code points are a valid escape, return true. Otherwise, return false.
            p++;
            if (p >= src.length)
                return false;
            ch = src.ptr[p];
            if (isCSSNameStart(ch))
                return true;
        }
        if (ch == '\\') {
            p++;
            if (p >= src.length)
                return false;
            char ch2 = src.ptr[p];
            return isCSSValidEscSequence(ch, ch2);
        }
        return false;
    }

    /**
      Parse number or dimension at current position.

      Returns true if a number is parsed: tokenType is number or dimension, tokenText
      contains the numeric text, tokenIntValue / tokenDoubleValue / tokenTypeInteger
      describe the value and dimensionUnit holds the unit of a dimension.
      Returns false if there is no number here; tokenEnd and tokenText are left as they were.
    */
    bool parseNumber() {
        tokenTypeInteger = true;
        tokenIntValue = 0;
        tokenDoubleValue = 0;
        size_t p = tokenEnd;
        if (p >= src.length)
            return false; // eof
        // remember the text length to undo partial appends when this is not a number
        immutable size_t textStart = tokenText.length;
        char ch = src.ptr[p];
        int numberSign = 1;
        int exponentSign = 1;
        ulong intValue = 0;
        ulong afterPointValue = 0;
        ulong exponentValue = 0;
        int beforePointDigits = 0;
        int afterPointDigits = 0;
        int exponentDigits = 0;
        if (ch == '+' || ch == '-') {
            if (ch == '-')
                numberSign = -1;
            tokenText ~= ch;
            p++;
            ch = p < src.length ? src.ptr[p] : 0;
        }
        // append digits before point
        while (ch >= '0' && ch <= '9') {
            tokenText ~= ch;
            intValue = intValue * 10 + (ch - '0');
            beforePointDigits++;
            p++;
            if (p >= src.length) {
                ch = 0;
                break;
            }
            ch = src.ptr[p];
        }
        // the point belongs to the number only when a digit follows it
        if (ch == '.') {
            char afterPoint = p + 1 < src.length ? src.ptr[p + 1] : 0;
            if (afterPoint >= '0' && afterPoint <= '9') {
                tokenText ~= ch;
                p++;
                ch = afterPoint;
                // append digits after point
                while (ch >= '0' && ch <= '9') {
                    tokenText ~= ch;
                    afterPointValue = afterPointValue * 10 + (ch - '0');
                    afterPointDigits++;
                    p++;
                    if (p >= src.length) {
                        ch = 0;
                        break;
                    }
                    ch = src.ptr[p];
                }
            }
        }
        if (!beforePointDigits && !afterPointDigits) {
            // not a number: drop the sign that may have been appended
            tokenText.length = textStart;
            return false;
        }
        if (ch == 'e' || ch == 'E') {
            char nextCh = p + 1 < src.length ? src.ptr[p + 1] : 0;
            char nextCh2 = p + 2 < src.length ? src.ptr[p + 2] : 0;
            int skip = 1;
            if (nextCh == '+' || nextCh == '-') {
                if (nextCh == '-')
                    exponentSign = -1;
                skip = 2;
                nextCh = nextCh2;
            }
            // exponent is consumed only when at least one digit follows e / e+ / e-
            if (nextCh >= '0' && nextCh <= '9') {
                tokenText ~= src.ptr[p .. p + skip];
                p += skip;
                ch = nextCh;
                // append exponent digits
                while (ch >= '0' && ch <= '9') {
                    tokenText ~= ch;
                    exponentValue = exponentValue * 10 + (ch - '0');
                    exponentDigits++;
                    p++;
                    if (p >= src.length) {
                        ch = 0;
                        break;
                    }
                    ch = src.ptr[p];
                }
            }
        }
        tokenType = CSSTokenType.number;
        tokenEnd = p;
        if (exponentDigits || afterPointDigits) {
            // parsed floating point
            tokenTypeInteger = false;
            tokenDoubleValue = cast(long)intValue;
            if (afterPointDigits) {
                long divider = 1;
                for (int i = 0; i < afterPointDigits; i++)
                    divider *= 10;
                tokenDoubleValue += afterPointValue / cast(double)divider;
            }
            if (numberSign < 0)
                tokenDoubleValue = -tokenDoubleValue;
            if (exponentDigits) {
                import std.math : pow;
                double exponent = (cast(long)exponentValue * exponentSign);
                tokenDoubleValue = tokenDoubleValue * pow(10, exponent);
            }
            tokenIntValue = cast(long)tokenDoubleValue;
        } else {
            // parsed integer
            tokenIntValue = cast(long)intValue;
            if (numberSign < 0)
                tokenIntValue = -tokenIntValue;
            tokenDoubleValue = tokenIntValue;
        }
        dimensionUnit.length = 0;
        if (isIdentStart(tokenEnd)) {
            tokenType = CSSTokenType.dimension;
            consumeIdent(dimensionUnit);
        }
        return true;
    }

    /**
      Parse quoted string; tokenEnd must point to the opening quotation char.

      tokenType is set to str, or to badStr if an unescaped newline is found
      (the newline itself is not consumed). String content with decoded escapes
      is appended to tokenText. Always returns true.
    */
    bool parseString(char quotationChar) {
        tokenType = CSSTokenType.str;
        // skip first delimiter ' or "
        size_t p = tokenEnd + 1;
        for (;;) {
            if (p >= src.length) {
                // unexpected end of file
                tokenEnd = p;
                return true;
            }
            char ch = src.ptr[p];
            if (ch == '\r' || ch == '\n' || ch == 0x0C) {
                // unescaped newline: stop right before it, so the newline starts the next token
                tokenType = CSSTokenType.badStr;
                tokenEnd = p;
                return true;
            } else if (ch == quotationChar) {
                // end of string
                tokenEnd = p + 1;
                return true;
            } else if (ch == '\\') {
                if (p + 1 >= src.length) {
                    // backslash at end of file: consume it, it adds nothing to the string
                    tokenEnd = p + 1;
                    return true;
                }
                char escaped = src.ptr[p + 1];
                if (escaped == '\r' || escaped == '\n' || escaped == 0x0C) {
                    // \ NEWLINE is a line continuation: skip both the backslash and the newline
                    p += 2;
                    if (escaped == '\r' && p < src.length && src.ptr[p] == '\n')
                        p++; // CR LF counts as one newline
                } else {
                    dchar esc = parseEscape(p);
                    if (esc == 0xFFFFFFFF) {
                        esc = '?'; // replace invalid code point
                        p++;
                    }
                    // encode to UTF-8
                    appendEscapedIdentChar(esc);
                }
            } else {
                // normal character
                tokenText ~= ch;
                p++;
            }
        }
    }

    /// Emit the (possibly multi-byte) character at tokenStart as a delim token.
    CSSTokenType emitDelimToken() {
        import std.utf : stride, UTFException;
        size_t len = 1;
        try {
            len = stride(src[tokenStart .. $]);
        } catch (UTFException e) {
            len = 1; // invalid lead byte: emit it alone
        }
        tokenEnd = tokenStart + len;
        // a truncated sequence at the end of input must not run past the source
        if (tokenEnd > src.length)
            tokenEnd = src.length;
        tokenText ~= src[tokenStart .. tokenEnd];
        tokenType = CSSTokenType.delim;
        return tokenType;
    }

    /**
      Parse hash token; tokenEnd must point to '#'.

      The name after '#' may start with digits or hyphens (e.g. #000).
      tokenTypeFlagId is true when the name would also be a valid identifier.
      If no name follows, '#' is emitted as a delim token.
    */
    CSSTokenType parseHashToken() {
        size_t p = tokenEnd + 1;
        immutable bool isId = isIdentStart(p);
        // name chars which cannot start an identifier: leading digits and hyphens
        while (p < src.length) {
            char ch = src.ptr[p];
            if (!((ch >= '0' && ch <= '9') || ch == '-'))
                break;
            tokenText ~= ch;
            p++;
        }
        bool consumed = p > tokenEnd + 1;
        tokenEnd = p;
        // the rest of the name, if any, is consumed like an identifier
        if (isIdentStart(tokenEnd) && consumeIdent(tokenText))
            consumed = true;
        if (!consumed) {
            // no name after '#'
            tokenTypeFlagId = false;
            return emitDelimToken();
        }
        tokenTypeFlagId = isId;
        tokenType = CSSTokenType.hashToken;
        return tokenType;
    }

    /// Parse comment; current chars are /* . An unterminated comment extends to the end of input.
    CSSTokenType parseComment() {
        size_t p = tokenEnd + 2; // skip /*
        while (p < src.length) {
            char ch = src.ptr[p];
            char ch2 = p + 1 < src.length ? src.ptr[p + 1] : 0;
            if (ch == '*' && ch2 == '/') {
                p += 2;
                break;
            }
            p++;
        }
        tokenEnd = p;
        tokenType = CSSTokenType.comment;
        return tokenType;
    }

    /**
      Parse unicode range; current chars are U+ or u+ followed by hex digit or ?
      Sets unicodeRangeStart and unicodeRangeEnd.
    */
    CSSTokenType parseUnicodeRangeToken() {
        unicodeRangeStart = 0;
        unicodeRangeEnd = 0;
        size_t p = tokenEnd + 2; // skip U+
        // now we have hex digit or ?
        int hexCount = 0;
        uint hexNumber = 0;
        int questionCount = 0;
        // consume hex digits
        while (p < src.length) {
            char ch = src.ptr[p];
            int digit = decodeHexDigit(ch);
            if (digit < 0)
                break;
            hexCount++;
            hexNumber = (hexNumber << 4) | digit;
            p++;
            if (hexCount >= 6)
                break;
        }
        // consume question marks (digits and question marks together: at most 6)
        while (p < src.length && questionCount + hexCount < 6) {
            char ch = src.ptr[p];
            if (ch != '?')
                break;
            questionCount++;
            p++;
        }
        if (questionCount) {
            // each ? stands for any hex digit: range from ...0 to ...F
            int shift = 4 * questionCount;
            unicodeRangeStart = hexNumber << shift;
            unicodeRangeEnd = unicodeRangeStart + ((1 << shift) - 1);
        } else {
            unicodeRangeStart = hexNumber;
            char ch = p < src.length ? src.ptr[p] : 0;
            char ch2 = p + 1 < src.length ? src.ptr[p + 1] : 0;
            int digit = decodeHexDigit(ch2);
            if (ch == '-' && digit >= 0) {
                p += 2; // skip - and first digit
                hexCount = 1;
                hexNumber = digit;
                while (p < src.length) {
                    ch = src.ptr[p];
                    digit = decodeHexDigit(ch);
                    if (digit < 0)
                        break;
                    hexCount++;
                    hexNumber = (hexNumber << 4) | digit;
                    p++;
                    if (hexCount >= 6)
                        break;
                }
                unicodeRangeEnd = hexNumber;
            } else {
                unicodeRangeEnd = unicodeRangeStart;
            }
        }
        tokenEnd = p;
        tokenType = CSSTokenType.unicodeRange;
        return tokenType;
    }

    /// emit single char token like () {} [] : ;
    CSSTokenType emitSingleCharToken(CSSTokenType type) {
        tokenType = type;
        tokenEnd = tokenStart + 1;
        tokenText ~= src[tokenStart];
        return type;
    }

    /// emit double char token like $= *=
    CSSTokenType emitDoubleCharToken(CSSTokenType type) {
        tokenType = type;
        tokenEnd = tokenStart + 2;
        tokenText ~= src[tokenStart .. tokenStart + 2];
        return type;
    }

    /**
      Skip the remnants of a bad url: everything up to and including the closing ')'
      or up to the end of input. Escapes are skipped as a unit, so an escaped ')'
      does not end the url. Sets tokenType to badUrl.
    */
    void consumeBadUrl() {
        for (;;) {
            if (tokenEnd >= src.length)
                break;
            char ch = src.ptr[tokenEnd];
            if (ch == ')') {
                tokenEnd++;
                break;
            }
            if (ch == '\\') {
                size_t p = tokenEnd;
                if (parseEscape(p) != 0xFFFFFFFF) {
                    // parseEscape already moved past the whole escape
                    tokenEnd = p;
                    continue;
                }
            }
            tokenEnd++;
        }
        tokenType = CSSTokenType.badUrl;
    }

    /**
      Parse the rest of a url token; current position is after url(
      Sets tokenType to url or badUrl; tokenText receives the url text.
    */
    void parseUrlToken() {
        tokenText.length = 0;
        skipWhiteSpace();
        if (tokenEnd >= src.length) {
            // end of input right after url( : empty url
            tokenType = CSSTokenType.url;
            return;
        }
        char ch = src.ptr[tokenEnd];
        if (ch == '\'' || ch == '\"') {
            parseString(ch);
            // a string broken by a newline makes the whole url bad
            if (tokenType != CSSTokenType.badStr) {
                skipWhiteSpace();
                if (tokenEnd >= src.length) {
                    tokenType = CSSTokenType.url;
                    return;
                }
                if (src.ptr[tokenEnd] == ')') {
                    // valid URL token
                    tokenEnd++;
                    tokenType = CSSTokenType.url;
                    return;
                }
            }
            // bad url
            consumeBadUrl();
            return;
        }
        // not quoted
        for (;;) {
            bool hadSpace = skipWhiteSpace();
            if (tokenEnd >= src.length) {
                tokenType = CSSTokenType.url;
                return;
            }
            ch = src.ptr[tokenEnd];
            if (ch == ')') {
                tokenEnd++;
                tokenType = CSSTokenType.url;
                return;
            }
            if (hadSpace) {
                // whitespace inside an unquoted url may only be followed by ')'
                consumeBadUrl();
                return;
            }
            if (ch == '(' || ch == '\'' || ch == '\"' || isCSSNonPrintable(ch)) {
                consumeBadUrl();
                return;
            }
            if (ch == '\\') {
                size_t p = tokenEnd;
                dchar esc = parseEscape(p);
                if (esc == 0xFFFFFFFF) {
                    consumeBadUrl();
                    return;
                }
                // append the decoded code point, not the backslash
                tokenEnd = p;
                appendEscapedIdentChar(esc);
                continue;
            }
            tokenText ~= ch;
            tokenEnd++;
        }
    }

    /**
      Advance to the next token.
      Returns the token type (also stored in tokenType); tokenStart .. tokenEnd is
      the byte range of the token in src.
    */
    CSSTokenType next() {
        // move beginning of token
        tokenStart = tokenEnd;
        tokenText.length = 0;
        // check for whitespace
        if (skipWhiteSpace())
            return tokenType; // whitespace or newline token
        // check for eof
        if (tokenEnd >= src.length) {
            tokenType = CSSTokenType.eof;
            return tokenType;
        }
        char ch = src.ptr[tokenEnd];
        char nextCh = tokenEnd + 1 < src.length ? src.ptr[tokenEnd + 1] : 0;
        if (ch == '\"' || ch == '\'') {
            parseString(ch);
            return tokenType;
        }
        if (ch == '#') {
            return parseHashToken();
        }
        if (ch == '$') {
            if (nextCh == '=') {
                return emitDoubleCharToken(CSSTokenType.suffixMatch);
            } else {
                return emitDelimToken();
            }
        }
        if (ch == '^') {
            if (nextCh == '=') {
                return emitDoubleCharToken(CSSTokenType.prefixMatch);
            } else {
                return emitDelimToken();
            }
        }
        if (ch == '(')
            return emitSingleCharToken(CSSTokenType.parentOpen);
        if (ch == ')')
            return emitSingleCharToken(CSSTokenType.parentClose);
        if (ch == '[')
            return emitSingleCharToken(CSSTokenType.squareOpen);
        if (ch == ']')
            return emitSingleCharToken(CSSTokenType.squareClose);
        if (ch == '{')
            return emitSingleCharToken(CSSTokenType.curlyOpen);
        if (ch == '}')
            return emitSingleCharToken(CSSTokenType.curlyClose);
        if (ch == ',')
            return emitSingleCharToken(CSSTokenType.comma);
        if (ch == ':')
            return emitSingleCharToken(CSSTokenType.colon);
        if (ch == ';')
            return emitSingleCharToken(CSSTokenType.semicolon);
        if (ch == '*') {
            if (nextCh == '=') {
                return emitDoubleCharToken(CSSTokenType.substringMatch);
            } else {
                return emitDelimToken();
            }
        }
        if (ch == '~') {
            if (nextCh == '=') {
                return emitDoubleCharToken(CSSTokenType.includeMatch);
            } else {
                return emitDelimToken();
            }
        }
        if (ch == '|') {
            if (nextCh == '=') {
                return emitDoubleCharToken(CSSTokenType.dashMatch);
            } else if (nextCh == '|') {
                return emitDoubleCharToken(CSSTokenType.column);
            } else {
                return emitDelimToken();
            }
        }
        if (ch == '/') {
            if (nextCh == '*') {
                return parseComment();
            } else {
                return emitDelimToken();
            }
        }
        char nextCh2 = tokenEnd + 2 < src.length ? src.ptr[tokenEnd + 2] : 0;
        if (ch == 'u' || ch == 'U') {
            if (nextCh == '+' && (decodeHexDigit(nextCh2) >= 0 || nextCh2 == '?')) {
                return parseUnicodeRangeToken();
            }
        }
        // number must be tried before identifier and before the '-' / '+' delimiters
        if (parseNumber())
            return tokenType;
        if (parseIdent()) {
            ch = tokenEnd < src.length ? src.ptr[tokenEnd] : 0;
            if (ch == '(') {
                // identifier immediately followed by ( is a function or url(
                tokenEnd++;
                import std.uni : icmp;
                if (tokenText.length == 3 && icmp(tokenText, "url") == 0) {
                    // parse URL function
                    parseUrlToken();
                } else {
                    tokenType = CSSTokenType.func;
                }
            }
            return tokenType;
        }
        if (ch == '-') {
            if (nextCh == '-' && nextCh2 == '>') {
                tokenEnd = tokenStart + 3;
                tokenType = CSSTokenType.cdc;
                tokenText ~= src[tokenStart .. tokenEnd];
                return tokenType;
            }
            return emitDelimToken();
        }
        if (ch == '<') {
            char nextCh3 = tokenEnd + 3 < src.length ? src.ptr[tokenEnd + 3] : 0;
            if (nextCh == '!' && nextCh2 == '-' && nextCh3 == '-') {
                tokenEnd = tokenStart + 4;
                tokenType = CSSTokenType.cdo;
                tokenText ~= src[tokenStart .. tokenEnd];
                return tokenType;
            }
            return emitDelimToken();
        }
        if (ch == '@') {
            if (isIdentStart(tokenEnd + 1)) {
                tokenEnd++;
                parseIdent();
                tokenType = CSSTokenType.atKeyword;
                return tokenType;
            }
            return emitDelimToken();
        }
        return emitDelimToken();
    }

    /// same as next() but returns filled CSSToken struct
    CSSToken nextToken() {
        CSSToken res;
        res.type = next();
        if (res.type == CSSTokenType.str || res.type == CSSTokenType.ident || res.type == CSSTokenType.atKeyword
                || res.type == CSSTokenType.url || res.type == CSSTokenType.func || res.type == CSSTokenType.hashToken) {
            if (tokenText.length)
                res.text = tokenText.idup;
        }
        if (res.type == CSSTokenType.dimension && dimensionUnit.length)
            res.dimensionUnit = dimensionUnit.idup;
        if (res.type == CSSTokenType.dimension || res.type == CSSTokenType.number) {
            res.doubleValue = tokenDoubleValue;
            res.intValue = tokenIntValue;
            res.typeFlagInteger = tokenTypeInteger;
        } else if (res.type == CSSTokenType.hashToken) {
            // the ID flag is computed for hash tokens only
            res.typeFlagId = tokenTypeFlagId;
        } else if (res.type == CSSTokenType.unicodeRange) {
            res.unicodeRangeStart = unicodeRangeStart;
            res.unicodeRangeEnd = unicodeRangeEnd;
        }
        return res;
    }
}
853 
// Tokenizer smoke test: feeds one input containing most token kinds and checks
// the type returned by each next() call together with the tokenizer state
// (tokenText, numeric values, dimension unit, unicode range) after it.
unittest {
    CSSTokenizer tokenizer;
    tokenizer.start("ident-1{ }\n#id\n'blabla' \"bla bla 2\" -ident2*=12345 -.234e+5 "
                    ~ "1.23px/* some comment */U+123?!"
                    ~"url(   'text.css'  )url(bad url)functionName()url( bla )"
                    ~"'\\30 \\31'");
    assert(tokenizer.next() == CSSTokenType.ident);
    assert(tokenizer.tokenText == "ident-1");
    assert(tokenizer.next() == CSSTokenType.curlyOpen);
    assert(tokenizer.next() == CSSTokenType.whitespace);
    assert(tokenizer.next() == CSSTokenType.curlyClose);
    assert(tokenizer.next() == CSSTokenType.whitespace); //newline
    // hash token: text is the name without '#', and the id flag is set
    assert(tokenizer.next() == CSSTokenType.hashToken);
    assert(tokenizer.tokenText == "id");
    assert(tokenizer.tokenTypeFlagId == true);
    assert(tokenizer.next() == CSSTokenType.whitespace); //newline
    // strings: text excludes the surrounding quotes
    assert(tokenizer.next() == CSSTokenType.str);
    assert(tokenizer.tokenText == "blabla");
    assert(tokenizer.next() == CSSTokenType.whitespace);
    assert(tokenizer.next() == CSSTokenType.str);
    assert(tokenizer.tokenText == "bla bla 2");
    assert(tokenizer.next() == CSSTokenType.whitespace);
    // identifier may start with '-'
    assert(tokenizer.next() == CSSTokenType.ident);
    assert(tokenizer.tokenText == "-ident2");
    assert(tokenizer.next() == CSSTokenType.substringMatch);
    assert(tokenizer.next() == CSSTokenType.number);
    assert(tokenizer.tokenText == "12345");
    assert(tokenizer.tokenIntValue == 12345);
    assert(tokenizer.next() == CSSTokenType.whitespace);
    // signed number with fraction and exponent
    assert(tokenizer.next() == CSSTokenType.number);
    assert(tokenizer.tokenText == "-.234e+5");
    assert(tokenizer.tokenIntValue == -23400);
    assert(tokenizer.tokenDoubleValue == -.234e+5);
    assert(tokenizer.next() == CSSTokenType.whitespace);
    // next line
    // dimension: text holds the numeric part, unit is stored separately
    assert(tokenizer.next() == CSSTokenType.dimension);
    assert(tokenizer.tokenText == "1.23");
    assert(tokenizer.tokenIntValue == 1);
    assert(tokenizer.tokenDoubleValue == 1.23);
    assert(tokenizer.dimensionUnit == "px");
    assert(tokenizer.next() == CSSTokenType.comment);
    // U+123? : the '?' wildcard expands to the range 0x1230..0x123F
    assert(tokenizer.next() == CSSTokenType.unicodeRange);
    assert(tokenizer.unicodeRangeStart == 0x1230 && tokenizer.unicodeRangeEnd == 0x123F);
    assert(tokenizer.next() == CSSTokenType.delim);
    assert(tokenizer.tokenText == "!");
    // next line
    // url with quoted argument: text is the unquoted, trimmed url
    assert(tokenizer.next() == CSSTokenType.url);
    assert(tokenizer.tokenText == "text.css");
    // unquoted url containing a space is reported as a bad url
    assert(tokenizer.next() == CSSTokenType.badUrl);
    // function token: text is the name without '('
    assert(tokenizer.next() == CSSTokenType.func);
    assert(tokenizer.tokenText == "functionName");
    assert(tokenizer.next() == CSSTokenType.parentClose);
    assert(tokenizer.next() == CSSTokenType.url);
    assert(tokenizer.tokenText == "bla");
    // next line
    // hex escapes inside a string are decoded
    assert(tokenizer.next() == CSSTokenType.str);
    assert(tokenizer.tokenText == "01"); //'\30 \31'
    assert(tokenizer.next() == CSSTokenType.eof);
}
913 
914 
915 /**
916 Tokenizes css source, returns array of tokens (last token is EOF).
917 Source must be preprocessed utf-8 string.
918 */
919 static CSSToken[] tokenizeCSS(string src) {
920     CSSTokenizer tokenizer;
921     tokenizer.start(src);
922     CSSToken[] res;
923     res.assumeSafeAppend();
924     for(;;) {
925         res ~= tokenizer.nextToken();
926         if (res[$ - 1].type == CSSTokenType.eof)
927             break;
928     }
929     return res;
930 }
931 
// tokenizeCSS() test: checks token order and that the token payload
// (text, integer value, integer flag, dimension unit) is copied into CSSToken.
unittest {
    string src = "pre {123em}";
    auto res = tokenizeCSS(src);
    // ident, whitespace, '{', dimension, '}', eof
    assert(res.length == 6);
    assert(res[0].type == CSSTokenType.ident);
    assert(res[0].text == "pre");
    assert(res[1].type == CSSTokenType.whitespace);
    assert(res[2].type == CSSTokenType.curlyOpen);
    assert(res[3].type == CSSTokenType.dimension);
    assert(res[3].typeFlagInteger == true);
    assert(res[3].intValue == 123);
    assert(res[3].dimensionUnit == "em");
    assert(res[4].type == CSSTokenType.curlyClose);
    // token list is always terminated by EOF
    assert(res[$ - 1].type == CSSTokenType.eof);
}
947 
// easy way to extract and apply imports w/o full document parsing
/**
    Extract CSS @import rules from source.

    Scans tokens from the beginning of the source and collects every
    `@import <string or url> ... ;` rule. Whitespace and comments are ignored,
    `@charset` rules are skipped. Scanning stops at the first '{', at the first
    at-rule other than @import / @charset, at an identifier outside of a rule, or
    at a string / url in an unexpected place, since valid @import rules may appear
    only at the beginning of a stylesheet.

    The @import keyword is matched ASCII case-insensitively (e.g. `@IMPORT`).

    Params:
        src = CSS source text

    Returns: found rules in source order, with startPos, endPos and url filled in;
             content is left empty for the caller to fill.
*/
CSSImportRule[] extractCSSImportRules(string src) {
    import std.uni : icmp;

    enum ParserState {
        start, // before rule begin, switch to this state after ;
        afterImport, // after @import keyword
        afterCharset, // after @charset keyword
        afterCharsetName, // after encoding name string of @charset
        afterImportUrl, // after string or url() of @import
    }
    ParserState state = ParserState.start;
    CSSImportRule[] res;
    CSSTokenizer tokenizer;
    tokenizer.start(src);
    string url;
    size_t startPos = 0;
    for (;;) {
        CSSTokenType type = tokenizer.next();
        if (type == CSSTokenType.eof)
            break;
        if (type == CSSTokenType.whitespace || type == CSSTokenType.comment)
            continue; // skip whitespaces and comments
        if (type == CSSTokenType.atKeyword) {
            if (tokenizer.tokenText == "charset") {
                state = ParserState.afterCharset;
                continue;
            }
            // at-keywords keep their original case in the tokenizer, so compare ignoring case
            if (icmp(tokenizer.tokenText, "import") != 0)
                break; // any other at-rule ends the imports section
            // import rule
            state = ParserState.afterImport;
            startPos = tokenizer.tokenStart;
            continue;
        }
        if (type == CSSTokenType.str || type == CSSTokenType.url) {
            if (state == ParserState.afterImport) {
                // copy: tokenText is overwritten by the following next() call
                url = tokenizer.tokenText.dup;
                state = ParserState.afterImportUrl;
                continue;
            }
            if (state == ParserState.afterCharset) {
                state = ParserState.afterCharsetName;
                continue;
            }
            break; // string or url in unexpected place
        }
        if (type == CSSTokenType.curlyOpen)
            break;
        if (type == CSSTokenType.ident && state == ParserState.start)
            break; // valid @imports may be only at the beginning of file
        if (type == CSSTokenType.semicolon) {
            if (state == ParserState.afterImportUrl) {
                // complete @import rule: remember its extent and url
                CSSImportRule rule;
                rule.startPos = startPos;
                rule.endPos = tokenizer.tokenEnd;
                rule.url = url;
                res ~= rule;
            }
            state = ParserState.start;
            continue;
        }
        // any other token (e.g. media query after import url) is ignored
    }
    return res;
}
1018 
1019 /**
1020   Replace source code import rules obtained by extractImportRules() with imported content.
1021 */
1022 string applyCSSImportRules(string src, CSSImportRule[] rules) {
1023     if (!rules.length)
1024         return src; // no rules
1025     char[] res;
1026     res.assumeSafeAppend;
1027     size_t start = 0;
1028     for (int i = 0; i < rules.length; i++) {
1029         res ~= src[start .. rules[i].startPos];
1030         res ~= rules[i].content;
1031         start = rules[i].endPos;
1032     }
1033     if (start < src.length)
1034         res ~= src[start .. $];
1035     return cast(string)res;
1036 }
1037 
1038 
// Import extraction test: @charset and comments are skipped, both string and
// url() forms of @import are found, and the @import placed after the first
// style rule (pre {}) is not reported.
unittest {
    string src = q{
        @charset "utf-8";
        /* comment must be ignored */
        @import "file1.css"; /* string */
        @import url(file2.css); /* url */
        pre {}
        @import "ignore_me.css";
        p {}
    };
    auto res = extractCSSImportRules(src);
    assert(res.length == 2);
    assert(res[0].url == "file1.css");
    assert(res[1].url == "file2.css");
    // emulate downloaded content and substitute it into the source
    res[0].content = "[file1_content]";
    res[1].content = "[file2_content]";
    string s = applyCSSImportRules(src, res);
    // only checks that substitution changed the source length
    assert (s.length != src.length);
}
1058 
/// Kind of AST node; stored in ASTNode.type and assigned by node class constructors.
enum ASTNodeType {
    /// assigned by SimpleBlockNode
    simpleBlock,
    /// assigned by ComponentValueNode
    componentValue,
    /// assigned by PreservedTokenNode
    preservedToken,
    /// assigned by FunctionNode
    func,
    /// assigned by ATRuleNode
    atRule,
    /// assigned by QualifiedRuleNode
    qualifiedRule,
}
1067 
/// Base class of all AST nodes.
class ASTNode {
    /// kind of this node; subclasses assign it in their constructors
    ASTNodeType type;
}
1071 
/// Base class for component values: simple blocks, functions and preserved tokens.
class ComponentValueNode : ASTNode {
    /// Creates node tagged as ASTNodeType.componentValue (subclasses override the tag).
    this() {
        this.type = ASTNodeType.componentValue;
    }
}
1077 
/// Block enclosed in a pair of brackets, together with its content.
class SimpleBlockNode : ComponentValueNode {
    /// type of the token which opens the block (curlyOpen by default)
    CSSTokenType blockType = CSSTokenType.curlyOpen;
    /// component values found inside of the block
    ComponentValueNode[] componentValues;
    /// Creates empty block node tagged as ASTNodeType.simpleBlock.
    this() {
        this.type = ASTNodeType.simpleBlock;
    }
}
1085 
/// Function call: function name and component values between '(' and ')'.
class FunctionNode : ComponentValueNode {
    /// function name as passed to the constructor
    string name;
    /// component values which are arguments of function
    ComponentValueNode[] componentValues;
    /**
    Creates function node tagged as ASTNodeType.func.

    Params:
        name = function name; stored in the name field
    */
    this(string name) {
        // keep the name: it was accepted but dropped before, leaving the node anonymous
        this.name = name;
        type = ASTNodeType.func;
    }
}
1092 
/// Component value which is a single token stored as is.
class PreservedTokenNode : ComponentValueNode {
    /// copy of the token wrapped by this node
    CSSToken token;
    /**
    Creates node tagged as ASTNodeType.preservedToken.

    Params:
        token = token to store; it is copied into the node
    */
    this(ref CSSToken token) {
        this.type = ASTNodeType.preservedToken;
        this.token = token;
    }
}
1100 
/// Qualified rule: list of component values (prelude) followed by optional block.
class QualifiedRuleNode : ASTNode {
    /// component values before the block
    ComponentValueNode[] componentValues;
    /// block of the rule; null until assigned by parser
    SimpleBlockNode block;
    /// Creates empty rule node tagged as ASTNodeType.qualifiedRule.
    this() {
        this.type = ASTNodeType.qualifiedRule;
    }
}
1108 
/// At-rule: qualified rule which additionally has a name.
class ATRuleNode : QualifiedRuleNode {
    /// name of the rule; filled by parser from the at-keyword token text
    string name;
    /// Creates empty rule node tagged as ASTNodeType.atRule.
    this() {
        this.type = ASTNodeType.atRule;
    }
}
1115 
1116 
/**
Parser which builds AST nodes from an array of tokens (see tokenizeCSS()).

The parser keeps an index of the current token. The token array always ends with
an EOF token (the constructor appends one if it is missing), and the current
position never moves past the last token, so currentToken is always valid.
*/
class CSSParser {
    /// tokens to parse; last token is EOF
    CSSToken[] tokens;
    /// index of current token in tokens
    int pos = 0;
    /**
    Creates parser for token array.

    Params:
        _tokens = tokens to parse; if the array is empty or does not end with
                  an EOF token, an EOF token is appended to the parser's own view of it
    */
    this(CSSToken[] _tokens) {
        tokens = _tokens;
        // guarantee EOF sentinel so that currentToken never indexes out of range
        if (!tokens.length || tokens[$ - 1].type != CSSTokenType.eof) {
            CSSToken eofToken;
            eofToken.type = CSSTokenType.eof;
            tokens ~= eofToken;
        }
    }
    /// peek current token
    @property ref CSSToken currentToken() {
        return tokens[pos];
    }
    /// peek next token (returns current token if it is the last one)
    @property ref CSSToken nextToken() {
        return tokens[pos + 1 < $ ? pos + 1 : pos];
    }
    /// move to next token; returns false and stays in place if current token is the last one
    bool next() {
        // never step past the last token: pos == tokens.length would make currentToken fail
        if (pos + 1 < tokens.length) {
            pos++;
            return true;
        }
        return false;
    }
    /// move to nearest non-whitespace token; return current token type (does not move if current token is not whitespace)
    CSSTokenType skipWhiteSpace() {
        // NOTE(review): delim tokens are skipped here together with whitespace and comments,
        // so delimiters never reach the AST - confirm this is intended
        for (;;) {
            auto type = currentToken.type;
            bool skippable = type == CSSTokenType.whitespace
                || type == CSSTokenType.comment
                || type == CSSTokenType.delim;
            // stop on a meaningful token, or when there is nowhere to move
            if (!skippable || !next())
                return currentToken.type;
        }
    }
    /// skip current token, then move to nearest non-whitespace token; return new token type
    @property CSSTokenType nextNonWhiteSpace() {
        next();
        return skipWhiteSpace();
    }
    /**
    Parse simple block starting at current position (after skipping whitespace).

    Block is opened by '{', '[' or '(' and lasts until the matching closing token or EOF.
    After the closing token, the following whitespace is skipped as well.

    Returns: block node, or null if current token does not open a block
             (position is then left at that token).
    */
    SimpleBlockNode parseSimpleBlock() {
        auto type = skipWhiteSpace();
        CSSTokenType closeType;
        if (type == CSSTokenType.curlyOpen) {
            closeType = CSSTokenType.curlyClose;
        } else if (type == CSSTokenType.squareOpen) {
            closeType = CSSTokenType.squareClose;
        } else if (type == CSSTokenType.parentOpen) {
            closeType = CSSTokenType.parentClose;
        } else {
            // not a simple block
            return null;
        }
        SimpleBlockNode res = new SimpleBlockNode();
        res.blockType = type;
        // skip opening token
        nextNonWhiteSpace();
        res.componentValues = parseComponentValueList(closeType);
        // skip closing token if block is not terminated by EOF
        if (skipWhiteSpace() == closeType)
            nextNonWhiteSpace();
        return res;
    }
    /**
    Parse function starting at current position (after skipping whitespace).

    Function lasts from function token until ')' or EOF.
    After the closing ')', the following whitespace is skipped as well.

    Returns: function node, or null if current token is not a function token.
    */
    FunctionNode parseFunctionBlock() {
        auto type = skipWhiteSpace();
        if (type != CSSTokenType.func)
            return null;
        FunctionNode res = new FunctionNode(currentToken.text);
        // skip function token
        nextNonWhiteSpace();
        res.componentValues = parseComponentValueList(CSSTokenType.parentClose);
        // skip closing ')' if function is not terminated by EOF
        if (skipWhiteSpace() == CSSTokenType.parentClose)
            nextNonWhiteSpace();
        return res;
    }
    /**
    Parse list of component values until one of end tokens or EOF.

    Opening brackets start nested simple blocks, function tokens start nested
    functions, any other token is stored as preserved token.
    End token itself is not consumed.

    Params:
        endToken1 = token type which terminates the list
        endToken2 = alternative token type which terminates the list

    Returns: parsed component values (may be empty).
    */
    ComponentValueNode[] parseComponentValueList(CSSTokenType endToken1 = CSSTokenType.eof, CSSTokenType endToken2 = CSSTokenType.eof) {
        ComponentValueNode[] res;
        for (;;) {
            auto type = skipWhiteSpace();
            if (type == CSSTokenType.eof)
                return res;
            if (type == endToken1 || type == endToken2)
                return res;
            if (type == CSSTokenType.squareOpen || type == CSSTokenType.parentOpen || type == CSSTokenType.curlyOpen) {
                res ~= parseSimpleBlock();
            } else if (type == CSSTokenType.func) {
                res ~= parseFunctionBlock();
            } else {
                res ~= new PreservedTokenNode(currentToken);
                // cannot advance only if tokens lost its EOF terminator; stop instead of looping forever
                if (!next())
                    return res;
            }
        }
    }
    /**
    Parse at-rule starting at current position (after skipping whitespace).

    Rule consists of at-keyword, list of component values, and is terminated
    either by ';' (consumed) or by a {} block (parsed into block field) or by EOF.

    Returns: rule node, or null if current token is not an at-keyword.
    */
    ATRuleNode parseATRule() {
        auto type = skipWhiteSpace();
        if (type != CSSTokenType.atKeyword)
            return null;
        ATRuleNode res = new ATRuleNode();
        res.name = currentToken.text;
        // skip at-keyword
        nextNonWhiteSpace();
        res.componentValues = parseComponentValueList(CSSTokenType.semicolon, CSSTokenType.curlyOpen);
        type = skipWhiteSpace();
        if (type == CSSTokenType.semicolon) {
            next();
        } else if (type == CSSTokenType.curlyOpen) {
            res.block = parseSimpleBlock();
        }
        // otherwise EOF: return what was parsed so far
        return res;
    }

    /**
    Parse qualified rule starting at current position (after skipping whitespace).

    Rule consists of list of component values followed by {} block;
    block stays null if EOF is reached before '{'.

    Returns: rule node, or null if there are no more tokens except EOF.
    */
    QualifiedRuleNode parseQualifiedRule() {
        auto type = skipWhiteSpace();
        if (type == CSSTokenType.eof)
            return null;
        QualifiedRuleNode res = new QualifiedRuleNode();
        res.componentValues = parseComponentValueList(CSSTokenType.curlyOpen);
        type = skipWhiteSpace();
        if (type == CSSTokenType.curlyOpen) {
            res.block = parseSimpleBlock();
        }
        return res;
    }
}
1237 
// CSSParser.parseATRule() tests: rule terminated by ';', rule with empty block,
// rule with prelude and block, rule with function inside of block.
unittest {
    // at-rule terminated by semicolon has no block
    ATRuleNode atRule = new CSSParser(tokenizeCSS("@atRuleName;")).parseATRule();
    assert(atRule !is null);
    assert(atRule.name == "atRuleName");
    assert(atRule.block is null);

    // at-rule with empty {} block
    atRule = new CSSParser(tokenizeCSS("@atRuleName2 { }")).parseATRule();
    assert(atRule !is null);
    assert(atRule.name == "atRuleName2");
    assert(atRule.block !is null);
    assert(atRule.block.blockType == CSSTokenType.curlyOpen);

    // url('bla') becomes a single preserved token before the block
    atRule = new CSSParser(tokenizeCSS("@atRuleName3 url('bla') { 123 }")).parseATRule();
    assert(atRule !is null);
    assert(atRule.name == "atRuleName3");
    assert(atRule.componentValues.length == 1);
    assert(atRule.componentValues[0].type == ASTNodeType.preservedToken);
    assert(atRule.block !is null);
    assert(atRule.block.blockType == CSSTokenType.curlyOpen);
    assert(atRule.block.componentValues.length == 1);


    // function inside of block is parsed into a func node
    atRule = new CSSParser(tokenizeCSS("@atRuleName4 \"value\" { funcName(123) }")).parseATRule();
    assert(atRule !is null);
    assert(atRule.name == "atRuleName4");
    assert(atRule.componentValues.length == 1);
    assert(atRule.componentValues[0].type == ASTNodeType.preservedToken);
    assert(atRule.block !is null);
    assert(atRule.block.blockType == CSSTokenType.curlyOpen);
    assert(atRule.block.componentValues.length == 1);
    assert(atRule.block.componentValues[0].type == ASTNodeType.func);
}
1270 
// CSSParser.parseQualifiedRule() test: surrounding whitespace is ignored,
// prelude has one value (pre), block has three values (display, ':', none).
unittest {
    QualifiedRuleNode qualifiedRule = new CSSParser(tokenizeCSS(" pre { display: none } ")).parseQualifiedRule();
    assert(qualifiedRule !is null);
    assert(qualifiedRule.componentValues.length == 1);
    assert(qualifiedRule.block !is null);
    assert(qualifiedRule.block.componentValues.length == 3);
}