1 module dom.encoding;
2
/**
Look for a CSS `@charset "name"` directive near the start of a byte stream.

Only the first 1024 bytes are scanned, and inputs shorter than 17 bytes are
rejected up front (17 is the length of `@charset "utf-8";`).

Params:
    src = raw bytes of the style sheet

Returns: a freshly allocated copy of the encoding name found between the
    quotes, or null when there is no directive, the name is empty, the
    closing quote is missing, or the name contains non-ASCII bytes.
*/
string findCharsetDirective(ubyte[] src) {
    import std.string : indexOf;
    import std.algorithm : min;

    enum prefix = `@charset "`;

    if (src.length < 17)
        return null; // too short to hold a complete directive

    // Reinterpret the leading bytes as text; only ASCII matching is done on it.
    auto head = cast(const(char)[])src[0 .. min(1024, src.length)];

    auto encPos = head.indexOf(prefix);
    if (encPos < 0)
        return null; // not found

    // Skip past the directive prefix, starting from where it was actually found.
    auto rest = head[cast(size_t)encPos + prefix.length .. $];

    auto endPos = rest.indexOf('"');
    if (endPos <= 0)
        return null; // missing closing quote or empty name

    auto name = rest[0 .. endPos];

    // Encoding names must consist of code points 0..127 only.
    foreach (ch; name) {
        if (ch & 0x80)
            return null;
    }

    // found valid @charset directive
    return name.idup;
}
29
/**
Convert CSS code bytes to utf-8.

The encoding is chosen from the first source that provides one:
streamEncoding, then an `@charset` directive found in src, then
environmentEncoding, then a UTF-8 byte order mark at the start of src.

Params:
    src = source byte stream
    streamEncoding = encoding name supplied by the transport (e.g. an HTTP
        header), or null if unknown
    environmentEncoding = fallback encoding name (e.g. the encoding of the
        referring document), or null if unknown

Returns: a newly allocated UTF-8 copy of the data with a leading UTF-8 BOM
    removed, or null when the encoding is unknown or not supported.
*/
char[] bytesToUtf8(ubyte[] src, string streamEncoding = null, string environmentEncoding = null) {
    import std.uni : icmp;

    // A UTF-8 BOM is three bytes, so a BOM-only input (length 3) must match too.
    immutable bool hasUtf8Bom = src.length >= 3
        && src[0] == 0xEF && src[1] == 0xBB && src[2] == 0xBF;

    string encoding = null;
    if (streamEncoding) {
        encoding = streamEncoding;
    } else {
        string charsetDirectiveEncoding = findCharsetDirective(src);
        if (charsetDirectiveEncoding) {
            encoding = charsetDirectiveEncoding;
            // The directive itself was readable as ASCII, so a declared
            // utf-16be / utf-16le cannot be literal: treat any "utf-*" as utf-8.
            // The length guard avoids a range error on short names.
            if (charsetDirectiveEncoding.length >= 4
                    && icmp(charsetDirectiveEncoding[0 .. 4], "utf-") == 0) {
                encoding = "utf-8";
            }
        }
    }
    if (!encoding && environmentEncoding)
        encoding = environmentEncoding;
    if (!encoding && hasUtf8Bom) {
        // No declared encoding: fall back to the byte order mark.
        // TODO: support other UTF BOMs (UTF-16, UTF-32)
        encoding = "utf-8";
    }

    // Encoding names are compared case-insensitively, whatever their origin.
    if (encoding && icmp(encoding, "utf-8") == 0) {
        // no decoding needed; drop the BOM so it does not leak into the text
        if (hasUtf8Bom)
            src = src[3 .. $];
        return cast(char[])src.dup;
    }

    // TODO: support more encodings
    // unknown encoding
    return null;
}