1 module dom.encoding;
2 
/**
   Look for a CSS `@charset "name"` directive in the first 1024 bytes of src.

   Params:
       src = raw bytes of the style sheet

   Returns:
       A freshly allocated copy of the encoding name found between the quotes,
       provided it is non-empty and made only of 7-bit ASCII bytes.
       null when src is shorter than 17 bytes, when no directive (or no
       closing quote) is found in the scanned prefix, or when the name
       contains a byte >= 0x80.
*/
string findCharsetDirective(ubyte[] src) {
    import std.string : indexOf;
    import std.algorithm : min;

    enum marker = `@charset "`;

    // Inputs below this size are not scanned at all.
    if (src.length < 17)
        return null;

    // View the scanned prefix as characters without claiming immutability.
    auto head = cast(const(char)[])src[0 .. min(1024, src.length)];

    auto encPos = head.indexOf(marker);
    if (encPos < 0)
        return null;

    // The name starts right after the marker, wherever the marker was found.
    auto name = head[encPos + marker.length .. $];
    auto endPos = name.indexOf('"');
    if (endPos <= 0)
        return null; // no closing quote, or empty name
    name = name[0 .. endPos];

    // OR all bytes together: the high bit survives iff any byte is non-ASCII.
    ubyte bits = 0;
    foreach (char ch; name)
        bits |= cast(ubyte)ch;
    if (bits & 0x80)
        return null; // not a plain ASCII encoding name

    // Copy so the result does not alias the caller's mutable buffer.
    return name.idup;
}
29 
/**
   Convert CSS source bytes to UTF-8 text.

   The encoding is chosen in this order of precedence:
   streamEncoding, an `@charset` directive reported by findCharsetDirective,
   environmentEncoding, and finally a UTF-8 byte order mark at the start of
   src. Encoding names are compared case-insensitively. Only UTF-8 is
   currently supported.

   Params:
       src = raw bytes of the style sheet
       streamEncoding = encoding name supplied with the byte stream
                        (e.g. by HTTP), or null/empty if unknown
       environmentEncoding = encoding name of the base document,
                             or null/empty if unknown

   Returns:
       A newly allocated copy of src, with a leading UTF-8 BOM removed, when
       the content is treated as UTF-8; null when the encoding is unknown or
       not supported. Note that an empty UTF-8 input also yields an empty
       (null) array.
*/
char[] bytesToUtf8(ubyte[] src, string streamEncoding = null, string environmentEncoding = null) {
    import std.uni : icmp;

    string encoding = null;
    if (streamEncoding.length) {
        encoding = streamEncoding;
    } else {
        string directive = findCharsetDirective(src);
        if (directive.length) {
            encoding = directive;
            // The directive itself was readable as single-byte ASCII, so for
            // any utf-* name (including utf-16be / utf-16le) use utf-8.
            if (directive.length >= 4 && icmp(directive[0 .. 4], "utf-") == 0)
                encoding = "utf-8";
        }
    }
    if (!encoding.length && environmentEncoding.length)
        encoding = environmentEncoding;

    // UTF-8 byte order mark: EF BB BF.
    immutable bool hasUtf8Bom = src.length >= 3
        && src[0] == 0xEF && src[1] == 0xBB && src[2] == 0xBF;
    if (!encoding.length && hasUtf8Bom)
        encoding = "utf-8";
    // TODO: support UTF-16 / UTF-32 BOMs

    immutable bool isUtf8 = encoding.length != 0 && icmp(encoding, "utf-8") == 0;
    if (isUtf8) {
        // No decoding needed; just drop the BOM so it does not leak into the text.
        if (hasUtf8Bom)
            src = src[3 .. $];
        return cast(char[])src.dup;
    }

    // TODO: support more encodings
    // unknown or unsupported encoding
    return null;
}