// Written in the D programming language.

/**

This module contains text stream reader implementation

Implements class LineStream for reading of unicode text from stream and returning it by lines.

Support utf8, utf16, utf32 be and le encodings, and line endings - according to D language source file specification.

Low resource consuming. Doesn't flood with GC allocations. Dup line if you want to store it somewhere.

Tracks line number.


Synopsis:

----
import dlangui.core.linestream;

import std.stdio;
import std.conv;
import std.utf;
string fname = "somefile.d";
writeln("opening file");
std.stream.File f = new std.stream.File(fname);
scope(exit) { f.close(); }
try {
    LineStream lines = LineStream.create(f, fname);
    for (;;) {
        dchar[] s = lines.readLine();
        if (s is null)
            break;
        writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
    }
    if (lines.errorCode != 0) {
        writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
    } else {
        writeln("EOF reached");
    }
} catch (Exception e) {
    writeln("Exception " ~ e.toString);
}

----

Copyright: Vadim Lopatin, 2014
License:   Boost License 1.0
Authors:   Vadim Lopatin, coolreader.org@gmail.com
*/
module dlangui.core.linestream;

import dlangui.core.streams;
//import std.stream;
import std.stdio;
import std.conv;
import std.utf;

/// File encoding
public enum EncodingType : int {
    /// utf-8 unicode
    UTF8,
    /// utf-16 unicode big endian
    UTF16BE,
    /// utf-16 unicode little endian
    UTF16LE,
    /// utf-32 unicode big endian
    UTF32BE,
    /// utf-32 unicode little endian
    UTF32LE,
    /// plain ASCII (character codes must be <= 127)
    ASCII,
    /// encoding is unknown
    UNKNOWN
}

/// Line ending style
public enum LineEnding : int {
    /// LF (0x0A) - unix style
    LF,
    /// CR followed by LF (0x0D,0x0A) - windows style
    CRLF,
    /// CR (0x0D) - mac style
    CR,
    /// unknown line ending
    UNKNOWN,
    /// mixed line endings detected
    MIXED
}

/// Text file format: encoding, line ending style and presence of a byte order mark
struct TextFileFormat {
    /// character encoding
    EncodingType encoding;
    /// line ending style
    LineEnding lineEnding;
    /// byte order mark character flag
    bool bom;

    /// Returns "<encoding> <lineEnding>", followed by " bom" when the BOM flag is set
    string toString() const {
        string res = to!string(encoding) ~ " " ~ to!string(lineEnding);
        if (bom)
            res ~= " bom";
        return res;
    }
}

/// Text file writer which supports different text file formats
class OutputLineStream {
    protected OutputStream _stream;
    protected string _filename;
    protected TextFileFormat _format;
    protected bool _firstLine;
    protected char[] _buf;
    protected int _len;
    protected static immutable int MAX_BUFFER_SIZE = 0x10000; // 64K

    /// Creates a writer on top of stream; formats which cannot be written as is are replaced with defaults
    this(OutputStream stream, string filename, TextFileFormat format) {
        _stream = stream;
        _filename = filename;
        _format = format;
        _firstLine = true;
        // unknown and ASCII encodings are written as UTF-8
        if (_format.encoding == EncodingType.ASCII || _format.encoding == EncodingType.UNKNOWN)
            _format.encoding = EncodingType.UTF8;
        // unknown or mixed line endings fall back to the platform default
        immutable bool undefinedEol = _format.lineEnding == LineEnding.MIXED
            || _format.lineEnding == LineEnding.UNKNOWN;
        if (undefinedEol) {
            version (Windows) {
                _format.lineEnding = LineEnding.CRLF;
            } else {
                _format.lineEnding = LineEnding.LF;
            }
        }
    }

    /// Writes buffered bytes (if any) to the output stream and empties the buffer
    protected void flush() {
        if (_len <= 0)
            return;
        _stream.write(cast(ubyte[])_buf[0 .. _len]);
        _len = 0;
    }

    /// convert character encoding and write to output stream
    protected void convertAndWrite(dstring s) {
        // reserve buf space: at most 4 bytes per character in every supported encoding
        immutable size_t required = _len + s.length * 4 + 4;
        if (_buf.length < required)
            _buf.length = required;
        switch (_format.encoding) {
            case EncodingType.UTF16BE:
            case EncodingType.UTF16LE:
                immutable bool bigEndian = _format.encoding == EncodingType.UTF16BE;
                wchar[2] units;
                foreach (dchar ch; s) {
                    immutable size_t count = encode(units, ch);
                    foreach (wchar unit; units[0 .. count]) {
                        immutable char hi = cast(char)(unit >> 8);
                        immutable char lo = cast(char)(unit & 0xFF);
                        _buf[_len++] = bigEndian ? hi : lo;
                        _buf[_len++] = bigEndian ? lo : hi;
                    }
                }
                break;
            case EncodingType.UTF32BE:
            case EncodingType.UTF32LE:
                immutable bool bigEndian = _format.encoding == EncodingType.UTF32BE;
                foreach (dchar ch; s) {
                    foreach (k; 0 .. 4) {
                        // big endian emits the most significant byte first
                        immutable int shift = 8 * (bigEndian ? 3 - k : k);
                        _buf[_len++] = cast(char)((ch >> shift) & 0xFF);
                    }
                }
                break;
            case EncodingType.UTF8:
            default:
                char[4] utf8;
                foreach (dchar ch; s) {
                    immutable size_t count = encode(utf8, ch);
                    _buf[_len .. _len + count] = utf8[0 .. count];
                    _len += cast(int)count;
                }
                break;
        }
        if (_len > MAX_BUFFER_SIZE)
            flush();
    }

    /// write single line
    void writeLine(dstring line) {
        if (_firstLine) {
            if (_format.bom)
                convertAndWrite("\uFEFF"d); // write BOM
            _firstLine = false;
        }
        convertAndWrite(line);
        // CRLF is used for every style other than LF and CR
        dstring eol = "\r\n"d;
        if (_format.lineEnding == LineEnding.LF)
            eol = "\n"d;
        else if (_format.lineEnding == LineEnding.CR)
            eol = "\r"d;
        convertAndWrite(eol);
    }

    /// close stream
    void close() {
        flush();
        _stream.close();
        _buf = null;
    }
}


/**
Support reading of file (or string in memory) by lines

Support utf8, utf16, utf32 be and le encodings, and line endings - according to D language source file specification.

Low resource consuming. Doesn't flood with GC allocations.
Dup line if you want to store it somewhere.

Tracks line number.
*/
class LineStream {

    /// Error codes; errorCode is 0 while no error has been detected
    public enum ErrorCodes {
        /// invalid character for current encoding
        INVALID_CHARACTER = 1 // must be non-zero: callers and readLine() test errorCode != 0
    }

    private InputStream _stream;
    private string _filename;
    private ubyte[] _buf;  // stream reading buffer
    private uint _pos;     // reading position of stream buffer
    private uint _len;     // number of bytes in stream buffer
    private bool _streamEof; // true if input stream is in EOF state
    private uint _line;    // current line number

    private uint _textPos; // start of text line in text buffer
    private uint _textLen; // position of last filled char in text buffer + 1
    private dchar[] _textBuf; // text buffer
    private bool _eof;     // end of file, no more lines
    protected bool _bomDetected;
    protected int _crCount;
    protected int _lfCount;
    protected int _crlfCount;

    /// Returns file name
    @property string filename() { return _filename; }
    /// Returns current line number
    @property uint line() { return _line; }
    /// Returns file encoding EncodingType
    @property EncodingType encoding() { return _encoding; }

    /// Returns text format (encoding, line ending style, BOM flag) detected from the lines read so far
    @property TextFileFormat textFormat() {
        LineEnding le;
        if (_crlfCount) {
            // every CRLF increments all three counters, so any lone CR or LF makes a count exceed _crlfCount
            if (_crCount == _crlfCount && _lfCount == _crlfCount)
                le = LineEnding.CRLF;
            else
                le = LineEnding.MIXED;
        } else if (_crCount > _lfCount) {
            le = LineEnding.CR;
        } else if (_lfCount > _crCount) {
            le = LineEnding.LF;
        } else {
            le = LineEnding.MIXED;
        }
        return TextFileFormat(_encoding, le, _bomDetected);
    }


    /// Returns error code (0 if there is no error)
    @property int errorCode() { return _errorCode; }
    /// Returns error message
    @property string errorMessage() { return _errorMessage; }
    /// Returns line where error is found
    @property int errorLine() { return _errorLine; }
    /// Returns line position (number of character in line) where error is found
    @property int errorPos() { return _errorPos; }

    private immutable EncodingType _encoding;

    private int _errorCode;
    private string _errorMessage;
    private uint _errorLine;
    private uint _errorPos;

    /// Open file with known encoding; buf holds len bytes already read from stream, decoding starts at offset
    protected this(InputStream stream, string filename, EncodingType encoding, ubyte[] buf, uint offset, uint len) {
        _filename = filename;
        _stream = stream;
        _encoding = encoding;
        _buf = buf;
        _len = len;
        _pos = offset;
        _streamEof = _stream.eof;
    }

    /// this constructor was created for unittests only
    protected this(){
        _encoding = EncodingType.UTF8;
    }

    /// Refills the byte buffer from the stream when it runs low; returns number of bytes available in buffer
    protected uint readBytes() {
        uint bytesLeft = _len - _pos;
        if (_streamEof || bytesLeft > QUARTER_BYTE_BUFFER_SIZE)
            return bytesLeft;
        if (_pos > 0) {
            // move unread bytes to the beginning of the buffer to make room for new data
            foreach(i; 0 .. bytesLeft)
                _buf[i] = _buf[i + _pos];
            _len = bytesLeft;
            _pos = 0;
        }
        uint bytesRead = cast(uint)_stream.read(_buf[_len .. BYTE_BUFFER_SIZE]);
        _len += bytesRead;
        _streamEof = _stream.eof;
        return _len - _pos; //_buf[_pos .. _len];
    }

    // when bytes consumed from byte buffer, call this method to update position
    protected void consumedBytes(uint count) {
        _pos += count;
    }

    // reserve text buffer for specified number of characters, and return pointer to first free character in buffer
    protected dchar * reserveTextBuf(uint len) {
        // create new text buffer if necessary
        if (_textBuf == null) {
            if (len < TEXT_BUFFER_SIZE)
                len = TEXT_BUFFER_SIZE;
            _textBuf = new dchar[len];
            return _textBuf.ptr;
        }
        uint spaceLeft = cast(uint)_textBuf.length - _textLen;
        if (spaceLeft >= len)
            return _textBuf.ptr + _textLen;
        // move text to beginning of buffer, if necessary
        if (_textPos > _textBuf.length / 2) {
            uint charCount = _textLen - _textPos;
            dchar * p = _textBuf.ptr;
            foreach(i; 0 .. charCount)
                p[i] = p[i + _textPos];
            _textLen = charCount;
            _textPos = 0;
        }
        // resize buffer if necessary
        if (_textLen + len > _textBuf.length) {
            // resize buffer
            uint newsize = cast(uint)_textBuf.length * 2;
            if (newsize < _textLen + len)
                newsize = _textLen + len;
            _textBuf.length = newsize;
        }
        return _textBuf.ptr + _textLen;
    }

    // call after writing len characters to the pointer returned by reserveTextBuf
    protected void appendedText(uint len) {
        //writeln("appended ", len, " chars of text"); //:", _textBuf[_textLen .. _textLen + len]);
        _textLen += len;
    }

    /// Stores error information; once the code is non-zero readLine() returns null
    protected void setError(int code, string message, uint errorLine, uint errorPos) {
        _errorCode = code;
        _errorMessage = message;
        _errorLine = errorLine;
        _errorPos = errorPos;
    }

    // override to decode text; returns number of characters appended to the text buffer
    protected abstract uint decodeText();

    /// Unknown line position
    immutable static uint LINE_POSITION_UNDEFINED = uint.max;

    /// Read line from stream; returns null on end of file or when an error is detected (check errorCode)
    public dchar[] readLine() {
        if (_errorCode != 0) {
            //writeln("error ", _errorCode, ": ", _errorMessage, " in line ", _errorLine);
            return null; // error detected
        }
        if (_eof) {
            //writeln("EOF found");
            return null;
        }
        _line++;
        uint p = 0;
        uint eol = LINE_POSITION_UNDEFINED;
        uint eof = LINE_POSITION_UNDEFINED;
        uint lastchar = LINE_POSITION_UNDEFINED;
        do {
            if (_errorCode != 0) {
                //writeln("error ", _errorCode, ": ", _errorMessage, " in line ", _errorLine);
                return null; // error detected
            }
            uint charsLeft = _textLen - _textPos;
            if (p >= charsLeft) {
                uint decodedChars = decodeText();
                if (_errorCode != 0) {
                    return null; // error detected
                }
                if (decodedChars == 0 && invalidCharFlag) {
                    // decoder stopped on bad input before producing any text:
                    // report the error instead of mistaking it for end of file
                    invalidCharError();
                    return null;
                }
                charsLeft = _textLen - _textPos;
                if (decodedChars == 0) {
                    eol = charsLeft;
                    eof = charsLeft;
                    lastchar = charsLeft;
                    break;
                }
            }
            for (; p < charsLeft; p++) {
                dchar ch = _textBuf[_textPos + p];
                if (ch == '\r') { // CR
                    lastchar = p;
                    if (p == charsLeft - 1) {
                        // need one more char to check if it's 0D0A or just 0D eol
                        //writeln("read one more char for 0D0A detection");
                        decodeText();
                        if (_errorCode != 0) {
                            return null; // error detected
                        }
                        charsLeft = _textLen - _textPos;
                    }
                    dchar ch2 = (p < charsLeft - 1) ? _textBuf[_textPos + p + 1] : 0;
                    if (ch2 == '\n') { // LF
                        // CRLF
                        eol = p + 2;
                        _lfCount++;
                        _crCount++;
                        _crlfCount++;
                    } else {
                        // just CR
                        eol = p + 1;
                        _crCount++;
                    }
                    break;
                } else if (ch == '\n' || ch == 0x2028 || ch == 0x2029) {
                    // single char eoln
                    lastchar = p;
                    eol = p + 1;
                    _lfCount++;
                    break;
                } else if (ch == 0 || ch == 0x001A) {
                    // eof
                    //writeln("EOF char found");
                    lastchar = p;
                    eol = eof = p + 1;
                    break;
                }
            }
        } while (eol == LINE_POSITION_UNDEFINED);
        uint lineStart = _textPos;
        uint lineEnd = _textPos + lastchar;
        _textPos += eol; // consume text
        if (eof != LINE_POSITION_UNDEFINED) {
            _eof = true;
            //writeln("Setting eof flag. lastchar=", lastchar, ", p=", p, ", lineStart=", lineStart);
            if (lineStart >= lineEnd) {
                //writeln("lineStart >= lineEnd -- treat as eof");
                return null; // eof
            }
        }
        // return slice with decoded line
        return _textBuf[lineStart .. lineEnd];
    }

    protected immutable static int TEXT_BUFFER_SIZE = 1024;
    protected immutable static int BYTE_BUFFER_SIZE = 512;
    protected immutable static int QUARTER_BYTE_BUFFER_SIZE = BYTE_BUFFER_SIZE / 4;

    /// Factory method for string parser
    public static LineStream create(string code, string filename = "") {
        uint len = cast(uint)code.length;
        ubyte[] data = new ubyte[len + 3];
        foreach(i; 0 .. len)
            data[i + 3] = code[i];
        // BOM for UTF8
        data[0] = 0xEF;
        data[1] = 0xBB;
        data[2] = 0xBF;
        InputStream stream = new MemoryInputStream(data); //new MemoryStream(data);
        return create(stream, filename);
    }

    /// Factory for InputStream parser; returns null if stream is not open.
    /// Encoding is chosen by BOM; without BOM it is UTF-8 when autodetectUTFIfNoBOM is set, ASCII otherwise.
    public static LineStream create(InputStream stream, string filename, bool autodetectUTFIfNoBOM = true) {
        if (!stream.isOpen)
            return null;
        ubyte[] buf = new ubyte[BYTE_BUFFER_SIZE];
        uint len = cast(uint)stream.read(buf);
        LineStream res = null;
        // A BOM only counts if all of its bytes were really read: the buffer is zero filled,
        // so a 2 byte file FF FE would otherwise look like the UTF-32 LE BOM FF FE 00 00
        // and decoding would start beyond the end of the data.
        if (len >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) {
            res = new Utf8LineStream(stream, filename, buf, len, 3);
        } else if (len >= 4 && buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF) {
            res = new Utf32beLineStream(stream, filename, buf, len);
        } else if (len >= 4 && buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00) {
            res = new Utf32leLineStream(stream, filename, buf, len);
        } else if (len >= 2 && buf[0] == 0xFE && buf[1] == 0xFF) {
            res = new Utf16beLineStream(stream, filename, buf, len);
        } else if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE) {
            res = new Utf16leLineStream(stream, filename, buf, len);
        }
        if (res) {
            res._bomDetected = true;
        } else {
            if (autodetectUTFIfNoBOM) {
                res = new Utf8LineStream(stream, filename, buf, len, 0);
            } else {
                res = new AsciiLineStream(stream, filename, buf, len);
            }
        }
        return res;
    }

    /// Shared decodeText() implementation for both UTF-16 byte orders.
    /// Surrogate pairs are combined into one code point; unpaired surrogates are invalid characters.
    protected uint decodeUtf16Text(bool bigEndian) {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        dchar* text = reserveTextBuf(len / 2 + 1);
        uint chars = 0;
        uint i = 0;
        // i + 2 <= len instead of i < len - 1: no unsigned underflow, odd trailing byte stays unconsumed
        while (i + 2 <= len) {
            uint first = b[i];
            uint second = b[i + 1];
            uint ch = bigEndian ? (first << 8) | second : (second << 8) | first;
            uint used = 2;
            if (ch >= 0xD800 && ch < 0xDC00) {
                // high surrogate: must be followed by a low surrogate
                if (i + 4 > len)
                    break; // second half is not in the buffer yet
                uint third = b[i + 2];
                uint fourth = b[i + 3];
                uint lo = bigEndian ? (third << 8) | fourth : (fourth << 8) | third;
                if (lo < 0xDC00 || lo >= 0xE000) {
                    invalidCharFlag = true;
                    break;
                }
                ch = 0x10000 + ((ch - 0xD800) << 10) + (lo - 0xDC00);
                used = 4;
            } else if (ch >= 0xDC00 && ch < 0xE000) {
                // low surrogate without preceding high surrogate
                invalidCharFlag = true;
                break;
            }
            text[chars++] = ch;
            i += used;
        }
        consumedBytes(i);
        appendedText(chars);
        if (_streamEof && i < len)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }

    /// Shared decodeText() implementation for both UTF-32 byte orders.
    protected uint decodeUtf32Text(bool bigEndian) {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        dchar* text = reserveTextBuf(len / 4 + 1);
        uint chars = 0;
        uint i = 0;
        // i + 4 <= len instead of i < len - 3: the latter underflows for len < 3 and runs past the buffer
        while (i + 4 <= len) {
            uint b0 = b[i];
            uint b1 = b[i + 1];
            uint b2 = b[i + 2];
            uint b3 = b[i + 3];
            uint ch = bigEndian
                ? (b0 << 24) | (b1 << 16) | (b2 << 8) | b3
                : (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
            if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
                invalidCharFlag = true;
                break;
            }
            text[chars++] = ch;
            i += 4;
        }
        consumedBytes(i);
        appendedText(chars);
        if (_streamEof && i < len)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }

    /// set by decoders when undecodable input is met; the error itself is reported by invalidCharError()
    protected bool invalidCharFlag;
    /// Reports INVALID_CHARACTER at the current line, at the position following the text decoded so far
    protected void invalidCharError() {
        uint pos = _textLen - _textPos + 1;
        setError(ErrorCodes.INVALID_CHARACTER, "Invalid character in line " ~ to!string(_line) ~ ":" ~ to!string(pos), _line, pos);
    }
}


/// Reader for plain ASCII text: any byte with the high bit set is an invalid character
private class AsciiLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.ASCII, buf, 0, len);
    }
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        dchar* text = reserveTextBuf(len);
        uint i = 0;
        for (; i < len; i++) {
            ubyte ch = b[i];
            if (ch & 0x80) {
                // invalid character
                invalidCharFlag = true;
                break;
            }
            text[i] = ch;
        }
        consumedBytes(i);
        appendedText(i);
        // number of characters really decoded; readLine() reports the error when it is 0 and the flag is set
        return i;
    }

}

/// Reader for UTF-8 text
private class Utf8LineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len, int skip) {
        super(stream, filename, EncodingType.UTF8, buf, skip, len);
    }

    /// Decodes one character from b, where bleft bytes are available.
    /// Returns number of bytes used and stores the code point in ch.
    /// Returns 0 with needMoreFlag set when the sequence is cut by the end of available bytes,
    /// and 0 with needMoreFlag clear when the bytes do not form a valid character.
    uint decodeBytes(ubyte* b,in uint bleft, out uint ch, out bool needMoreFlag){
        uint bread = 0;
        uint ch0 = b[0];
        if (!(ch0 & 0x80)) {
            // 0x00..0x7F single byte
            // 0x80 == 10000000
            // !(ch0 & 0x80) => ch0 < 10000000
            ch = ch0;
            bread = 1;
        } else if ((ch0 & 0xE0) == 0xC0) {
            // two bytes 110xxxxx 10xxxxxx
            if (bleft < 2) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            if ((ch1 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x1F) << 6) | (ch1 & 0x3F);
            bread = 2;
        } else if ((ch0 & 0xF0) == 0xE0) {
            // three bytes 1110xxxx 10xxxxxx 10xxxxxx
            if (bleft < 3) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x0F) << 12) | ((ch1 & 0x3F) << 6) | (ch2 & 0x3F);
            bread = 3;
        } else if ((ch0 & 0xF8) == 0xF0) {
            // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (bleft < 4) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x07) << 18) | ((ch1 & 0x3F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
            bread = 4;
        } else if ((ch0 & 0xFC) == 0xF8) {
            // five bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (bleft < 5) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            uint ch4 = b[4];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x03) << 24) | ((ch1 & 0x3F) << 18) | ((ch2 & 0x3F) << 12) | ((ch3 & 0x3F) << 6) | (ch4 & 0x3F);
            bread = 5;
        } else if ((ch0 & 0xFE) == 0xFC) {
            // six bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (bleft < 6){
                needMoreFlag = true;
                return 0;
            }

            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            uint ch4 = b[4];
            uint ch5 = b[5];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80 || (ch5 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x01) << 30) | ((ch1 & 0x3F) << 24) | ((ch2 & 0x3F) << 18) | ((ch3 & 0x3F) << 12) | ((ch4 & 0x3F) << 6) | (ch5 & 0x3F);
            bread = 6; // all six bytes belong to this character
        }
        // lead bytes matching none of the forms above leave bread == 0 (invalid)
        if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
            return 0;
        }
        return bread;
    }

    /// this constructor was created for unittests only
    protected this(){

    }

    unittest {
        auto o = new Utf8LineStream();
        ubyte[] buffer = new ubyte[4];
        ubyte * bytes = buffer.ptr;
        uint ch;
        bool needMoreFlag;
        uint bread;

        //convert simple character
        buffer[0] = '/';
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 1);
        assert(ch == '/');
        //writefln("/ as hex: 0x%32x,0x%32x", ch,'/');


        //convert 2byte character
        buffer[0] = 0xc3;
        buffer[1] = 0x84;
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 2);
        assert(ch == 'Ä');
        //writefln("Ä as hex: 0x%32x,0x%32x", ch,'Ä');

        //convert 3byte character
        buffer[0] = 0xe0;
        buffer[1] = 0xa4;
        buffer[2] = 0xb4;
        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        //writefln("ऴ as hex: 0x%32x,0x%32x", ch,'ऴ');
        assert(ch == 'ऴ');

        //regression test for https://github.com/buggins/dlangide/issues/65
        buffer[0] = 0xEB;
        buffer[1] = 0xB8;
        buffer[2] = 0x94;
        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        //writefln("블 as hex: 0x%32x,0x%32x", ch,'블');
        assert(ch == '블');

        //six byte form must report all six bytes as consumed
        ubyte[] longBuffer = [0xFC, 0x80, 0x80, 0x80, 0x81, 0x81];
        bread = o.decodeBytes(longBuffer.ptr,6,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 6);
        assert(ch == 0x41);
    }

    override uint decodeText() {
        // number of bytes available
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode

        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        ubyte* b = _buf.ptr + _pos;
        uint chars = 0;
        // every character uses at least one byte, so len characters is the worst case
        dchar* text = reserveTextBuf(len);
        uint i = 0;
        while (i < len) {
            uint ch = 0;
            bool needMoreFlag = false;
            uint bread = decodeBytes(b + i, len - i, ch, needMoreFlag);

            if (needMoreFlag) {
                // decodeBytes needs more bytes, but no more bytes are left in the buffer
                break;
            }

            if (bread == 0) {
                // decodeBytes could not read any character: stop processing
                invalidCharFlag = true;
                break;
            }

            // the text buffer holds UTF-32, so the code point is stored as is (no surrogate pairs)
            text[chars++] = ch;
            i += bread;
        }
        consumedBytes(i);
        appendedText(chars);
        if (_streamEof && i < len)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}

/// Reader for UTF-16 big endian text; skips the 2 byte BOM
private class Utf16beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF16BE, buf, 2, len);
    }
    override uint decodeText() {
        return decodeUtf16Text(true);
    }
}

/// Reader for UTF-16 little endian text; skips the 2 byte BOM
private class Utf16leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF16LE, buf, 2, len);
    }
    override uint decodeText() {
        return decodeUtf16Text(false);
    }
}

/// Reader for UTF-32 big endian text; skips the 4 byte BOM
private class Utf32beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF32BE, buf, 4, len);
    }
    override uint decodeText() {
        return decodeUtf32Text(true);
    }
}

/// Reader for UTF-32 little endian text; skips the 4 byte BOM
private class Utf32leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF32LE, buf, 4, len);
    }
    override uint decodeText() {
        return decodeUtf32Text(false);
    }
}


unittest {
    static if (false) {
        import std.stdio;
        import std.conv;
        import std.utf;
        //string fname = "C:\\projects\\d\\ddc\\ddclexer\\src\\ddc\\lexer\\LineStream.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/" ~ __FILE__; //"/home/lve/src/d/ddc/ddclexer/src/ddc/lexer/Lexer.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf8.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16be.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16le.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32be.d";
        string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32le.d";
        writeln("opening file");
        std.stream.File f = new std.stream.File(fname);
        scope(exit) { f.close(); }
        try {
            LineStream lines = LineStream.create(f, fname);
            for (;;) {
                dchar[] s = lines.readLine();
                if (s is null)
                    break;
                writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
            }
            if (lines.errorCode != 0) {
                writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
            } else {
                writeln("EOF reached");
            }
        } catch (Exception e) {
            writeln("Exception " ~ e.toString);
        }
    }
}
// LAST LINE