module dom.encoding;

/**
    Look for a CSS `@charset "name"` directive near the start of a byte stream.

    Only the first 1024 bytes of `src` are examined. The directive may be
    preceded by other bytes (for example a UTF-8 byte order mark).

    Params:
        src = raw bytes of the style sheet

    Returns:
        A freshly allocated copy of the encoding name exactly as written
        between the quotes, or `null` when no directive is found, the name is
        empty, the closing quote is missing, or the name contains a byte
        outside the 0..127 range.
*/
string findCharsetDirective(ubyte[] src) {
    import std.algorithm : min;
    import std.string : indexOf;

    enum prefix = `@charset "`;

    // Need room for the prefix, at least one name character and the closing quote.
    if (src.length < prefix.length + 2)
        return null;

    // View the bytes as code units; indexOf searches code units without decoding.
    auto head = cast(const(char)[]) src[0 .. min(1024, src.length)];

    immutable encPos = head.indexOf(prefix);
    if (encPos < 0)
        return null;

    // Skip past the prefix *at the position where it was found*.
    auto name = head[encPos + prefix.length .. $];

    immutable endPos = name.indexOf('"');
    if (endPos <= 0)
        return null; // missing closing quote or empty name

    name = name[0 .. endPos];

    // Accept the name only when every byte is a 7-bit code point.
    foreach (ch; name) {
        if (ch & 0x80)
            return null;
    }

    // Found a valid @charset directive; copy so the result does not alias src.
    return name.idup;
}

/**
    Convert CSS code bytes to UTF-8.

    The encoding is chosen in this order:
    $(OL
        $(LI `streamEncoding`, when non-empty;)
        $(LI the `@charset` directive found in `src` (any name starting with
             "utf-", compared case-insensitively, is treated as "utf-8");)
        $(LI `environmentEncoding`, when non-empty;)
        $(LI a UTF-8 byte order mark at the start of `src`.)
    )

    Params:
        src                 = source byte stream
        streamEncoding      = name of the HTTP stream encoding, if known
        environmentEncoding = name of the base document encoding, if known

    Returns:
        A newly allocated copy of `src` (without a leading UTF-8 BOM) when the
        selected encoding is UTF-8; `null` when the encoding is unknown or not
        supported.
*/
char[] bytesToUtf8(ubyte[] src, string streamEncoding = null, string environmentEncoding = null) {
    import std.algorithm.searching : startsWith;
    import std.uni : icmp, toLower;

    string encoding = null;

    if (streamEncoding.length) {
        encoding = streamEncoding;
    } else {
        string charsetDirectiveEncoding = findCharsetDirective(src);
        if (charsetDirectiveEncoding.length) {
            encoding = charsetDirectiveEncoding;
            // The directive itself was readable as ASCII-compatible bytes, so
            // for utf-16be, utf-16le etc. use utf-8.
            if (charsetDirectiveEncoding.toLower.startsWith("utf-"))
                encoding = "utf-8";
        }
    }

    if (!encoding.length && environmentEncoding.length)
        encoding = environmentEncoding;

    // UTF-8 byte order mark: EF BB BF.
    immutable hasUtf8Bom = src.length >= 3
        && src[0] == 0xEF && src[1] == 0xBB && src[2] == 0xBF;

    if (!encoding.length && hasUtf8Bom) {
        encoding = "utf-8";
        // TODO: support other Unicode BOMs (UTF-16, UTF-32)
    }

    if (encoding.length && icmp(encoding, "utf-8") == 0) {
        // No decoding needed; drop the BOM so it does not leak into the text.
        if (hasUtf8Bom)
            src = src[3 .. $];
        return cast(char[]) src.dup;
    }

    // TODO: support more encodings
    // unknown or unsupported encoding
    return null;
}