// Written in the D programming language.

/**
DLANGUI library.

This module contains a text file reader implementation.

Supports UTF-8, UTF-16 and UTF-32 (big and little endian) encodings, and line endings
according to the D language source file specification.

Low resource consumption: does not flood the GC with allocations. The slice returned by
readLine() points into an internal buffer that is reused, so dup the line if you want to
store it somewhere.

Tracks the current line number.


Synopsis:

----
import dlangui.core.linestream;

import std.stdio;
import std.conv;
import std.utf;
string fname = "somefile.d";
writeln("opening file");
std.stream.File f = new std.stream.File(fname);
scope(exit) { f.close(); }
try {
    LineStream lines = LineStream.create(f, fname);
    for (;;) {
        dchar[] s = lines.readLine();
        if (s is null)
            break;
        writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
    }
    if (lines.errorCode != 0) {
        writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
    } else {
        writeln("EOF reached");
    }
} catch (Exception e) {
    writeln("Exception " ~ e.toString);
}

----

Copyright: Vadim Lopatin, 2014
License:   $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
Authors:   $(WEB coolreader.org, Vadim Lopatin)
*/
module dlangui.core.linestream;

import std.stream;
import std.stdio;
import std.conv;

/**
 * Base class of the line readers.
 *
 * Holds a byte buffer filled from the input stream and a dchar text buffer filled by the
 * encoding-specific decodeText() implementation; readLine() cuts lines out of the text buffer.
 * Use one of the create() factories to get an instance matching the stream's byte order mark.
 */
class LineStream {
    /// Text encodings recognised by the factory.
    public enum EncodingType {
        ASCII,
        UTF8,
        UTF16BE,
        UTF16LE,
        UTF32BE,
        UTF32LE
    }

    InputStream _stream;
    string _filename;
    ubyte[] _buf;        // stream reading buffer
    uint _pos;           // reading position of stream buffer
    uint _len;           // number of bytes in stream buffer
    bool _streamEof;     // true if input stream is in EOF state
    uint _line;          // current line number

    uint _textPos;       // start of text line in text buffer
    uint _textLen;       // position of last filled char in text buffer + 1
    dchar[] _textBuf;    // text buffer
    bool _eof;           // end of file, no more lines

    /// File name passed to the factory.
    @property string filename() { return _filename; }
    /// Number of the line most recently requested by readLine() (1-based after the first call).
    @property uint line() { return _line; }
    /// Encoding chosen when the stream was created.
    @property EncodingType encoding() { return _encoding; }
    /// Non-zero once an error has been recorded by setError().
    @property int errorCode() { return _errorCode; }
    /// Message recorded together with errorCode.
    @property string errorMessage() { return _errorMessage; }
    /// Line recorded together with errorCode.
    @property int errorLine() { return _errorLine; }
    /// Position recorded together with errorCode.
    @property int errorPos() { return _errorPos; }

    immutable EncodingType _encoding;

    int _errorCode;
    string _errorMessage;
    uint _errorLine;
    uint _errorPos;

    /**
     * Params:
     *   stream   = source of bytes
     *   filename = name reported by the filename property
     *   encoding = encoding handled by the concrete subclass
     *   buf      = byte buffer, already holding the first len bytes of the stream
     *   offset   = number of leading bytes of buf to skip (the byte order mark)
     *   len      = number of valid bytes in buf
     */
    protected this(InputStream stream, string filename, EncodingType encoding, ubyte[] buf, uint offset, uint len) {
        _filename = filename;
        _stream = stream;
        _encoding = encoding;
        _buf = buf;
        _len = len;
        // Never start past the valid data: `_len - _pos` is unsigned and would wrap around.
        _pos = offset < len ? offset : len;
        _streamEof = _stream.eof;
    }

    /**
     * Makes sure the byte buffer holds data and returns the number of unread bytes,
     * which start at _buf[_pos].
     *
     * The buffer is refilled only when the stream is not at EOF and no more than a quarter
     * of the nominal buffer size is left; unread bytes are moved to the front first.
     */
    uint readBytes() {
        uint bytesLeft = _len - _pos;
        if (_streamEof || bytesLeft > QUARTER_BYTE_BUFFER_SIZE)
            return bytesLeft;
        if (_pos > 0) {
            // Source and destination overlap, so copy element by element (front to back).
            for (uint i = 0; i < bytesLeft; i++)
                _buf[i] = _buf[i + _pos];
            _len = bytesLeft;
            _pos = 0;
        }
        uint bytesRead = cast(uint)_stream.read(_buf[_len .. $]);
        _len += bytesRead;
        _streamEof = _stream.eof;
        return _len - _pos;
    }

    /// Advances the read position after count bytes of the byte buffer have been decoded.
    void consumedBytes(uint count) {
        _pos += count;
    }

    /**
     * Reserves room for len more characters in the text buffer.
     *
     * May move the pending text to the start of the buffer (resetting _textPos) or grow the
     * buffer, so slices previously returned by readLine() must not be relied upon afterwards.
     *
     * Returns: pointer to the first free character of the text buffer.
     */
    dchar* reserveTextBuf(uint len) {
        // create new text buffer if necessary
        if (_textBuf is null) {
            if (len < TEXT_BUFFER_SIZE)
                len = TEXT_BUFFER_SIZE;
            _textBuf = new dchar[len];
            return _textBuf.ptr;
        }
        uint spaceLeft = cast(uint)_textBuf.length - _textLen;
        if (spaceLeft >= len)
            return _textBuf.ptr + _textLen;
        // move text to beginning of buffer once more than half of it is already consumed
        if (_textPos > _textBuf.length / 2) {
            uint charCount = _textLen - _textPos;
            dchar* p = _textBuf.ptr;
            for (uint i = 0; i < charCount; i++)
                p[i] = p[i + _textPos];
            _textLen = charCount;
            _textPos = 0;
        }
        // grow the buffer if moving did not free enough room
        if (_textLen + len > _textBuf.length) {
            uint newsize = cast(uint)_textBuf.length * 2;
            if (newsize < _textLen + len)
                newsize = _textLen + len;
            _textBuf.length = newsize;
        }
        return _textBuf.ptr + _textLen;
    }

    /// Records that len characters were written at the pointer returned by reserveTextBuf().
    void appendedText(uint len) {
        _textLen += len;
    }

    /// Records an error; readLine() returns null from now on.
    void setError(int code, string message, uint errorLine, uint errorPos) {
        _errorCode = code;
        _errorMessage = message;
        _errorLine = errorLine;
        _errorPos = errorPos;
    }

    /**
     * Decodes the next portion of the byte buffer into the text buffer.
     *
     * Returns: non-zero when progress was made; 0 when nothing could be decoded
     *          (end of stream, or an error that is / will be reported).
     */
    abstract uint decodeText();

    immutable static uint LINE_POSITION_UNDEFINED = uint.max;

    /**
     * Reads the next line.
     *
     * Line terminators are CR, LF, CR LF, U+2028 and U+2029; the characters U+0000 and
     * U+001A mark the end of the file. The terminator is not part of the returned line.
     *
     * Returns: slice of the internal text buffer holding the line (valid until the next call),
     *          or null at end of file or on error -- check errorCode to tell them apart.
     */
    public dchar[] readLine() {
        if (_errorCode != 0)
            return null; // error detected earlier
        if (_eof)
            return null;
        _line++;
        uint p = 0;
        uint eol = LINE_POSITION_UNDEFINED;
        uint eof = LINE_POSITION_UNDEFINED;
        uint lastchar = LINE_POSITION_UNDEFINED;
        do {
            if (_errorCode != 0)
                return null; // error detected
            uint charsLeft = _textLen - _textPos;
            if (p >= charsLeft) {
                uint decodedChars = decodeText();
                if (_errorCode != 0)
                    return null; // error detected
                charsLeft = _textLen - _textPos;
                if (decodedChars == 0) {
                    if (invalidCharFlag) {
                        // The decoder stopped at a bad byte before producing any character;
                        // report it now instead of mistaking it for a clean end of file.
                        invalidCharError();
                        return null;
                    }
                    // end of stream: whatever is pending is the last line
                    eol = charsLeft;
                    eof = charsLeft;
                    lastchar = charsLeft;
                    break;
                }
            }
            for (; p < charsLeft; p++) {
                dchar ch = _textBuf[_textPos + p];
                if (ch == 0x0D) {
                    lastchar = p;
                    if (p == charsLeft - 1) {
                        // need one more char to check if it's 0D0A or just 0D eol
                        decodeText();
                        if (_errorCode != 0)
                            return null; // error detected
                        charsLeft = _textLen - _textPos;
                    }
                    dchar ch2 = (p + 1 < charsLeft) ? _textBuf[_textPos + p + 1] : 0;
                    eol = (ch2 == 0x0A) ? p + 2 : p + 1;
                    break;
                } else if (ch == 0x0A || ch == 0x2028 || ch == 0x2029) {
                    // single char eoln
                    lastchar = p;
                    eol = p + 1;
                    break;
                } else if (ch == 0 || ch == 0x001A) {
                    // eof character
                    lastchar = p;
                    eol = eof = p + 1;
                    break;
                }
            }
        } while (eol == LINE_POSITION_UNDEFINED);
        uint lineStart = _textPos;
        uint lineEnd = _textPos + lastchar;
        _textPos += eol; // consume text
        if (eof != LINE_POSITION_UNDEFINED) {
            _eof = true;
            if (lineStart >= lineEnd)
                return null; // nothing before the end of file
        }
        // return slice with decoded line
        return _textBuf[lineStart .. lineEnd];
    }

    immutable static int TEXT_BUFFER_SIZE = 1024;
    immutable static int BYTE_BUFFER_SIZE = 512;
    immutable static int QUARTER_BYTE_BUFFER_SIZE = BYTE_BUFFER_SIZE / 4;

    /// Factory for parsing an in-memory UTF-8 string: a UTF-8 BOM is prepended to code.
    public static LineStream create(string code, string filename = "") {
        uint len = cast(uint)code.length;
        ubyte[] data = new ubyte[len + 3];
        // BOM for UTF8
        data[0] = 0xEF;
        data[1] = 0xBB;
        data[2] = 0xBF;
        data[3 .. $] = cast(const(ubyte)[])code;
        MemoryStream stream = new MemoryStream(data);
        return create(stream, filename);
    }

    /**
     * Factory: reads the beginning of stream, picks the decoder from the byte order mark
     * (ASCII when there is none).
     *
     * Returns: the line stream, or null if stream is not open.
     */
    public static LineStream create(InputStream stream, string filename) {
        ubyte[] buf = new ubyte[BYTE_BUFFER_SIZE];
        if (!stream.isOpen)
            return null;
        uint len = cast(uint)stream.read(buf);
        // Every BOM test also checks that the bytes were really read: the buffer is zero
        // filled, so a bare "FF FE" file would otherwise look like the UTF-32LE mark.
        if (len >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) {
            return new Utf8LineStream(stream, filename, buf, len);
        } else if (len >= 4 && buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF) {
            return new Utf32beLineStream(stream, filename, buf, len);
        } else if (len >= 4 && buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00) {
            return new Utf32leLineStream(stream, filename, buf, len);
        } else if (len >= 2 && buf[0] == 0xFE && buf[1] == 0xFF) {
            return new Utf16beLineStream(stream, filename, buf, len);
        } else if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE) {
            return new Utf16leLineStream(stream, filename, buf, len);
        } else {
            return new AsciiLineStream(stream, filename, buf, len);
        }
    }

    /// Set by a decoder when it meets bytes it cannot decode; the error is raised afterwards.
    protected bool invalidCharFlag;

    /// Records error code 1 for the current line, at the position just past the decoded text.
    protected void invalidCharError() {
        uint pos = _textLen - _textPos + 1;
        setError(1, "Invalid character in line " ~ to!string(_line) ~ ":" ~ to!string(pos), _line, pos);
    }

    /**
     * Shared UTF-16 decoder. Surrogate pairs are combined into a single code point;
     * unpaired surrogates are rejected.
     *
     * Params: bigEndian = true when the first byte of each 16-bit unit is the high byte
     * Returns: number of characters appended to the text buffer.
     */
    protected uint decodeUtf16Text(bool bigEndian) {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        dchar* text = reserveTextBuf(len / 2 + 1);
        uint chars = 0;
        uint i = 0;
        while (i + 1 < len) {
            uint ch = bigEndian ? ((cast(uint)b[i] << 8) | b[i + 1])
                                : ((cast(uint)b[i + 1] << 8) | b[i]);
            uint used = 2;
            if (ch >= 0xD800 && ch < 0xDC00) {
                // high surrogate: the low surrogate must follow
                if (i + 3 >= len)
                    break; // second unit is not in the buffer yet
                uint low = bigEndian ? ((cast(uint)b[i + 2] << 8) | b[i + 3])
                                     : ((cast(uint)b[i + 3] << 8) | b[i + 2]);
                if (low < 0xDC00 || low >= 0xE000) {
                    invalidCharFlag = true;
                    break;
                }
                ch = 0x10000 + ((ch - 0xD800) << 10) + (low - 0xDC00);
                used = 4;
            } else if (ch >= 0xDC00 && ch < 0xE000) {
                // low surrogate without a preceding high surrogate
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
            i += used;
        }
        consumedBytes(i);
        appendedText(chars);
        if (_streamEof && i < len)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }

    /**
     * Shared UTF-32 decoder. Surrogate code points and values above U+10FFFF are rejected.
     *
     * Params: bigEndian = true when the first byte of each 32-bit unit is the highest byte
     * Returns: number of characters appended to the text buffer.
     */
    protected uint decodeUtf32Text(bool bigEndian) {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        dchar* text = reserveTextBuf(len / 4 + 1);
        uint chars = 0;
        uint i = 0;
        // `i + 3 < len` instead of `i < len - 3`: the latter wraps around when len < 3.
        for (; i + 3 < len; i += 4) {
            uint ch = bigEndian
                ? ((cast(uint)b[i] << 24) | (cast(uint)b[i + 1] << 16) | (cast(uint)b[i + 2] << 8) | b[i + 3])
                : ((cast(uint)b[i + 3] << 24) | (cast(uint)b[i + 2] << 16) | (cast(uint)b[i + 1] << 8) | b[i]);
            if ((ch >= 0xD800 && ch < 0xE000) || ch > 0x10FFFF) {
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
        }
        consumedBytes(i);
        appendedText(chars);
        if (_streamEof && i < len)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}



/// Reader for streams without a byte order mark: only 7-bit characters are accepted.
class AsciiLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.ASCII, buf, 0, len);
    }

    /**
     * Copies 7-bit bytes into the text buffer, stopping at the first byte with the high bit set.
     *
     * Returns: the number of bytes that were available (not the number decoded), so that a
     *          bad byte at the very start still makes readLine() call again and get the error.
     */
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        dchar* text = reserveTextBuf(len);
        uint i = 0;
        for (; i < len; i++) {
            ubyte ch = b[i];
            if (ch & 0x80) {
                // invalid character
                invalidCharFlag = true;
                break;
            }
            text[i] = ch;
        }
        consumedBytes(i);
        appendedText(i);
        return len;
    }
}

/// Reader for UTF-8 streams (the 3-byte BOM is skipped).
class Utf8LineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF8, buf, 3, len);
    }

    /**
     * Decodes UTF-8 sequences into one dchar per code point.
     *
     * Bytes that cannot start a sequence, bad continuation bytes, surrogate code points and
     * values above U+10FFFF set invalidCharFlag. A sequence cut off by the end of the buffer
     * is left unconsumed for the next call.
     *
     * Returns: number of characters appended to the text buffer.
     */
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        dchar* text = reserveTextBuf(len);
        uint chars = 0;
        uint i = 0;
        while (i < len) {
            uint ch0 = b[i];
            uint ch;      // code point being assembled, starts with the lead byte payload
            uint seqLen;  // total number of bytes in the sequence
            if (!(ch0 & 0x80)) {
                // 0x00..0x7F single byte
                ch = ch0;
                seqLen = 1;
            } else if ((ch0 & 0xE0) == 0xC0) {
                // two bytes 110xxxxx 10xxxxxx
                ch = ch0 & 0x1F;
                seqLen = 2;
            } else if ((ch0 & 0xF0) == 0xE0) {
                // three bytes 1110xxxx 10xxxxxx 10xxxxxx
                ch = ch0 & 0x0F;
                seqLen = 3;
            } else if ((ch0 & 0xF8) == 0xF0) {
                // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                ch = ch0 & 0x07;
                seqLen = 4;
            } else if ((ch0 & 0xFC) == 0xF8) {
                // five bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                ch = ch0 & 0x03;
                seqLen = 5;
            } else if ((ch0 & 0xFE) == 0xFC) {
                // six bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                ch = ch0 & 0x01;
                seqLen = 6;
            } else {
                // continuation byte in lead position, or 0xFE / 0xFF
                invalidCharFlag = true;
                break;
            }
            if (len - i < seqLen)
                break; // rest of the sequence is not in the buffer yet
            bool valid = true;
            for (uint k = 1; k < seqLen; k++) {
                uint cont = b[i + k];
                if ((cont & 0xC0) != 0x80) {
                    valid = false;
                    break;
                }
                ch = (ch << 6) | (cont & 0x3F);
            }
            if (!valid || (ch >= 0xD800 && ch < 0xE000) || ch > 0x10FFFF) {
                invalidCharFlag = true;
                break;
            }
            // The text buffer is UTF-32, so every code point is exactly one dchar.
            text[chars++] = cast(dchar)ch;
            i += seqLen;
        }
        consumedBytes(i);
        appendedText(chars);
        if (_streamEof && i < len)
            invalidCharFlag = true; // invalid or incomplete character at end of stream
        return chars;
    }
}

/// Reader for big endian UTF-16 streams (the 2-byte BOM is skipped).
class Utf16beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF16BE, buf, 2, len);
    }
    override uint decodeText() {
        return decodeUtf16Text(true);
    }
}

/// Reader for little endian UTF-16 streams (the 2-byte BOM is skipped).
class Utf16leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF16LE, buf, 2, len);
    }
    override uint decodeText() {
        return decodeUtf16Text(false);
    }
}

/// Reader for big endian UTF-32 streams (the 4-byte BOM is skipped).
class Utf32beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF32BE, buf, 4, len);
    }
    override uint decodeText() {
        return decodeUtf32Text(true);
    }
}

/// Reader for little endian UTF-32 streams (the 4-byte BOM is skipped).
class Utf32leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF32LE, buf, 4, len);
    }
    override uint decodeText() {
        return decodeUtf32Text(false);
    }
}


unittest {
    // UTF-8: CR LF and LF terminators, a three-byte and a four-byte sequence, no final newline
    LineStream lines = LineStream.create("first\r\nsecond\n\u4E2D\U0001F600", "utf8");
    assert(lines.encoding == LineStream.EncodingType.UTF8);
    assert(lines.readLine() == "first"d);
    assert(lines.readLine() == "second"d);
    assert(lines.readLine() == "\u4E2D\U0001F600"d);
    assert(lines.line == 3);
    assert(lines.readLine() is null);
    assert(lines.errorCode == 0);
}

unittest {
    // UTF-8: a stray continuation byte is reported as an error
    ubyte[] raw = [0xEF, 0xBB, 0xBF, 0x6F, 0x6B, 0x0A, 0x80];
    LineStream lines = LineStream.create(new MemoryStream(raw), "bad-utf8");
    assert(lines.readLine() == "ok"d);
    assert(lines.readLine() is null);
    assert(lines.errorCode == 1);
}

unittest {
    // UTF-16LE: surrogate pair becomes a single code point
    ubyte[] raw = [0xFF, 0xFE, 0x41, 0x00, 0x3D, 0xD8, 0x00, 0xDE];
    LineStream lines = LineStream.create(new MemoryStream(raw), "utf16le");
    assert(lines.encoding == LineStream.EncodingType.UTF16LE);
    assert(lines.readLine() == "A\U0001F600"d);
    assert(lines.readLine() is null);
    assert(lines.errorCode == 0);
}

unittest {
    // A file holding nothing but the UTF-16LE BOM must not be taken for UTF-32LE
    ubyte[] raw = [0xFF, 0xFE];
    LineStream lines = LineStream.create(new MemoryStream(raw), "bom-only");
    assert(lines.encoding == LineStream.EncodingType.UTF16LE);
    assert(lines.readLine() is null);
    assert(lines.errorCode == 0);
}