1 // Written in the D programming language.
2
3 /**
4
This module contains a text stream reader implementation.

It implements the class LineStream, which reads unicode text from a stream and returns it line by line.

Supports utf8, utf16 and utf32 (big and little endian) encodings, and line endings according to the D language source file specification.

Low resource consumption: does not flood the GC with allocations. Dup a returned line if you want to store it somewhere.

Tracks the current line number.
14
15
16 Synopsis:
17
18 ----
19 import dlangui.core.linestream;
20
21 import std.stdio;
22 import std.conv;
23 import std.utf;
24 string fname = "somefile.d";
25 writeln("opening file");
26 std.stream.File f = new std.stream.File(fname);
27 scope(exit) { f.close(); }
28 try {
29 LineStream lines = LineStream.create(f, fname);
30 for (;;) {
31 dchar[] s = lines.readLine();
32 if (s is null)
33 break;
34 writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
35 }
36 if (lines.errorCode != 0) {
37 writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
38 } else {
39 writeln("EOF reached");
40 }
41 } catch (Exception e) {
42 writeln("Exception " ~ e.toString);
43 }
44
45 ----
46
47 Copyright: Vadim Lopatin, 2014
48 License: Boost License 1.0
49 Authors: Vadim Lopatin, coolreader.org@gmail.com
50 */
51 module dlangui.core.linestream;
52
53 import dlangui.core.streams;
54 //import std.stream;
55 import std.stdio;
56 import std.conv;
57 import std.utf;
58
/// Character encoding of a text file
public enum EncodingType : int {
    /// UTF-8
    UTF8,
    /// UTF-16, big endian byte order
    UTF16BE,
    /// UTF-16, little endian byte order
    UTF16LE,
    /// UTF-32, big endian byte order
    UTF32BE,
    /// UTF-32, little endian byte order
    UTF32LE,
    /// plain ASCII (every character code must be <= 127)
    ASCII,
    /// encoding is not known
    UNKNOWN
}
/// Style of line terminators used by a text file
public enum LineEnding : int {
    /// LF (0x0A) -- unix style
    LF,
    /// CR followed by LF (0x0D, 0x0A) -- windows style
    CRLF,
    /// CR (0x0D) -- mac style
    CR,
    /// line ending style is not known
    UNKNOWN,
    /// more than one line ending style was detected
    MIXED
}
89
/// Describes how a text file is stored: encoding, line ending style and BOM presence
struct TextFileFormat {
    /// character encoding
    EncodingType encoding;
    /// line ending style
    LineEnding lineEnding;
    /// true when the file has (or should get) a byte order mark
    bool bom;

    /// Returns "<encoding> <lineEnding>", followed by " bom" when the bom flag is set
    string toString() const {
        string description = to!string(encoding);
        description ~= " ";
        description ~= to!string(lineEnding);
        if (bom)
            description ~= " bom";
        return description;
    }
}
102
/// Line-oriented text writer: encodes each line into the requested text file format
/// and passes the bytes to an OutputStream
class OutputLineStream {
    protected OutputStream _stream;
    protected string _filename;
    protected TextFileFormat _format;
    /// true until the first line has been written (the BOM, if any, goes before it)
    protected bool _firstLine;
    /// pending encoded bytes not yet passed to the stream
    protected char[] _buf;
    /// number of valid bytes in _buf
    protected int _len;
    /// pending bytes are flushed once their count exceeds this value
    protected static immutable int MAX_BUFFER_SIZE = 0x10000; // 64K

    /**
        Create a writer on top of stream.

        UNKNOWN and ASCII encodings are replaced by UTF8; UNKNOWN and MIXED line
        endings are replaced by CRLF on Windows and LF elsewhere.
    */
    this(OutputStream stream, string filename, TextFileFormat format) {
        _stream = stream;
        _filename = filename;
        _format = format;
        _firstLine = true;
        // normalize the requested format to something that can actually be written
        immutable requestedEncoding = _format.encoding;
        if (requestedEncoding == EncodingType.ASCII || requestedEncoding == EncodingType.UNKNOWN)
            _format.encoding = EncodingType.UTF8;
        immutable requestedEol = _format.lineEnding;
        if (requestedEol == LineEnding.MIXED || requestedEol == LineEnding.UNKNOWN) {
            version (Windows)
                _format.lineEnding = LineEnding.CRLF;
            else
                _format.lineEnding = LineEnding.LF;
        }
    }

    /// pass all pending bytes to the underlying stream
    protected void flush() {
        if (_len <= 0)
            return;
        _stream.write(cast(ubyte[])_buf[0 .. _len]);
        _len = 0;
    }

    /// convert character encoding and write to output stream
    protected void convertAndWrite(dstring s) {
        // worst case is 4 bytes per character in every supported encoding
        immutable size_t required = _len + s.length * 4 + 4;
        if (_buf.length < required)
            _buf.length = required;
        switch (_format.encoding) {
            case EncodingType.UTF16BE: {
                wchar[2] units;
                foreach (dchar ch; s) {
                    immutable count = encode(units, ch);
                    foreach (unit; units[0 .. count]) {
                        _buf[_len++] = cast(char)(unit >> 8);
                        _buf[_len++] = cast(char)(unit & 0xFF);
                    }
                }
                break;
            }
            case EncodingType.UTF16LE: {
                wchar[2] units;
                foreach (dchar ch; s) {
                    immutable count = encode(units, ch);
                    foreach (unit; units[0 .. count]) {
                        _buf[_len++] = cast(char)(unit & 0xFF);
                        _buf[_len++] = cast(char)(unit >> 8);
                    }
                }
                break;
            }
            case EncodingType.UTF32LE:
                foreach (dchar ch; s) {
                    // least significant byte first
                    for (int shift = 0; shift <= 24; shift += 8)
                        _buf[_len++] = cast(char)((ch >> shift) & 0xFF);
                }
                break;
            case EncodingType.UTF32BE:
                foreach (dchar ch; s) {
                    // most significant byte first
                    for (int shift = 24; shift >= 0; shift -= 8)
                        _buf[_len++] = cast(char)((ch >> shift) & 0xFF);
                }
                break;
            case EncodingType.UTF8:
            default: {
                char[4] bytes;
                foreach (dchar ch; s) {
                    immutable count = encode(bytes, ch);
                    foreach (c; bytes[0 .. count])
                        _buf[_len++] = c;
                }
                break;
            }
        }
        if (_len > MAX_BUFFER_SIZE)
            flush();
    }

    /// write single line, followed by the configured line ending
    void writeLine(dstring line) {
        if (_firstLine) {
            _firstLine = false;
            if (_format.bom)
                convertAndWrite("\uFEFF"d); // write BOM
        }
        convertAndWrite(line);
        dstring terminator;
        switch (_format.lineEnding) {
            case LineEnding.LF:
                terminator = "\n"d;
                break;
            case LineEnding.CR:
                terminator = "\r"d;
                break;
            default: // CRLF and anything else
                terminator = "\r\n"d;
                break;
        }
        convertAndWrite(terminator);
    }

    /// flush pending bytes, close the underlying stream and release the buffer
    void close() {
        flush();
        _stream.close();
        _buf = null;
    }
}
223
224
/**
    Reads text from a stream (or from a string in memory) line by line.

    Supports utf8, utf16 and utf32 (big and little endian) encodings, and line endings
    according to the D language source file specification.

    Low resource consumption: readLine() returns a slice of an internal buffer which is
    reused by later calls. Dup the line if you want to store it somewhere.

    Tracks the current line number.
*/
class LineStream {

    /// Error codes
    public enum ErrorCodes {
        /// invalid character for current encoding
        // Must be nonzero: errorCode == 0 means "no error" everywhere in this class
        // (and in the documented usage `lines.errorCode != 0`).
        INVALID_CHARACTER = 1
    }

    private InputStream _stream;
    private string _filename;
    private ubyte[] _buf;       // stream reading buffer
    private uint _pos;          // reading position of stream buffer
    private uint _len;          // number of bytes in stream buffer
    private bool _streamEof;    // true if input stream is in EOF state
    private uint _line;         // current line number

    private uint _textPos;      // start of text line in text buffer
    private uint _textLen;      // position of last filled char in text buffer + 1
    private dchar[] _textBuf;   // text buffer
    private bool _eof;          // end of file, no more lines
    protected bool _bomDetected;
    protected int _crCount;     // number of CR characters seen as (part of) a line ending
    protected int _lfCount;     // number of LF (or 0x2028 / 0x2029) line endings seen
    protected int _crlfCount;   // number of CR LF pairs seen

    /// Returns file name
    @property string filename() { return _filename; }
    /// Returns current line number
    @property uint line() { return _line; }
    /// Returns file encoding EncodingType
    @property EncodingType encoding() { return _encoding; }

    /// Returns the file format (encoding, line ending style, BOM flag) as observed so far.
    /// The line ending style is derived from the line endings read up to now.
    @property TextFileFormat textFormat() {
        LineEnding le;
        if (_crlfCount) {
            // every CR and every LF belongs to a CRLF pair only if the two counts match
            le = (_crCount == _lfCount) ? LineEnding.CRLF : LineEnding.MIXED;
        } else if (_crCount > _lfCount) {
            le = LineEnding.CR;
        } else if (_lfCount > _crCount) {
            le = LineEnding.LF;
        } else {
            le = LineEnding.MIXED;
        }
        return TextFileFormat(_encoding, le, _bomDetected);
    }


    /// Returns error code (0 if no error has been detected)
    @property int errorCode() { return _errorCode; }
    /// Returns error message
    @property string errorMessage() { return _errorMessage; }
    /// Returns line where error is found
    @property int errorLine() { return _errorLine; }
    /// Returns line position (number of character in line) where error is found
    @property int errorPos() { return _errorPos; }

    private immutable EncodingType _encoding;

    private int _errorCode;
    private string _errorMessage;
    private uint _errorLine;
    private uint _errorPos;

    /// Open file with known encoding.
    /// buf holds len bytes already read from stream; decoding starts at offset (used to skip a BOM).
    protected this(InputStream stream, string filename, EncodingType encoding, ubyte[] buf, uint offset, uint len) {
        _filename = filename;
        _stream = stream;
        _encoding = encoding;
        _buf = buf;
        _len = len;
        _pos = offset;
        _streamEof = _stream.eof;
    }

    /// this constructor was created for unittests only
    protected this() {
        _encoding = EncodingType.UTF8;
    }

    /// Returns the number of undecoded bytes available in the byte buffer starting at _pos.
    /// Refills the buffer from the stream first, unless the stream is at EOF or more than
    /// a quarter of the buffer size is still unread.
    protected uint readBytes() {
        uint bytesLeft = _len - _pos;
        if (_streamEof || bytesLeft > QUARTER_BYTE_BUFFER_SIZE)
            return bytesLeft;
        if (_pos > 0) {
            // move the unread tail to the start of the buffer to make room
            foreach (i; 0 .. bytesLeft)
                _buf[i] = _buf[i + _pos];
            _len = bytesLeft;
            _pos = 0;
        }
        uint bytesRead = cast(uint)_stream.read(_buf[_len .. BYTE_BUFFER_SIZE]);
        _len += bytesRead;
        _streamEof = _stream.eof;
        return _len - _pos;
    }

    /// when bytes consumed from byte buffer, call this method to update position
    protected void consumedBytes(uint count) {
        _pos += count;
    }

    /// reserve text buffer for specified number of characters, and return pointer to first free character in buffer
    protected dchar * reserveTextBuf(uint len) {
        // create new text buffer if necessary
        if (_textBuf == null) {
            if (len < TEXT_BUFFER_SIZE)
                len = TEXT_BUFFER_SIZE;
            _textBuf = new dchar[len];
            return _textBuf.ptr;
        }
        uint spaceLeft = cast(uint)_textBuf.length - _textLen;
        if (spaceLeft >= len)
            return _textBuf.ptr + _textLen;
        // move text to beginning of buffer once more than half of it has been consumed
        if (_textPos > _textBuf.length / 2) {
            uint charCount = _textLen - _textPos;
            dchar * p = _textBuf.ptr;
            foreach (i; 0 .. charCount)
                p[i] = p[i + _textPos];
            _textLen = charCount;
            _textPos = 0;
        }
        // grow buffer if there is still not enough room
        if (_textLen + len > _textBuf.length) {
            uint newsize = cast(uint)_textBuf.length * 2;
            if (newsize < _textLen + len)
                newsize = _textLen + len;
            _textBuf.length = newsize;
        }
        return _textBuf.ptr + _textLen;
    }

    /// call after writing len characters at the pointer returned by reserveTextBuf
    protected void appendedText(uint len) {
        _textLen += len;
    }

    /// remember error details; once the code is nonzero readLine() returns null
    protected void setError(int code, string message, uint errorLine, uint errorPos) {
        _errorCode = code;
        _errorMessage = message;
        _errorLine = errorLine;
        _errorPos = errorPos;
    }

    /// Override to decode text: take bytes from the byte buffer, append decoded characters
    /// to the text buffer and return the number of characters decoded.
    /// readLine() treats a return value of 0 as end of input.
    protected abstract uint decodeText();

    /// Unknown line position
    immutable static uint LINE_POSITION_UNDEFINED = uint.max;

    /**
        Read line from stream.

        Returns: slice of the internal text buffer holding the line without its line
        terminator, or null at end of file or when an error has been detected (check
        errorCode to tell the two apart). The slice is only valid until the next call.
    */
    public dchar[] readLine() {
        if (_errorCode != 0)
            return null; // error detected
        if (_eof)
            return null;
        _line++;
        uint p = 0; // scan position relative to _textPos
        uint eol = LINE_POSITION_UNDEFINED;      // number of chars to consume, including terminator
        uint eof = LINE_POSITION_UNDEFINED;      // set when this line ends the file
        uint lastchar = LINE_POSITION_UNDEFINED; // length of the line without terminator
        do {
            if (_errorCode != 0)
                return null; // error detected
            uint charsLeft = _textLen - _textPos;
            if (p >= charsLeft) {
                uint decodedChars = decodeText();
                if (_errorCode != 0)
                    return null; // error detected
                charsLeft = _textLen - _textPos;
                if (decodedChars == 0) {
                    if (invalidCharFlag) {
                        // The decoder stopped at an invalid sequence before producing
                        // any character: report it instead of mistaking it for EOF.
                        invalidCharError();
                        return null;
                    }
                    eol = charsLeft;
                    eof = charsLeft;
                    lastchar = charsLeft;
                    break;
                }
            }
            for (; p < charsLeft; p++) {
                dchar ch = _textBuf[_textPos + p];
                if (ch == '\r') { // CR
                    lastchar = p;
                    if (p == charsLeft - 1) {
                        // need one more char to check if it's 0D0A or just 0D eol
                        decodeText();
                        if (_errorCode != 0)
                            return null; // error detected
                        charsLeft = _textLen - _textPos;
                    }
                    dchar ch2 = (p < charsLeft - 1) ? _textBuf[_textPos + p + 1] : 0;
                    if (ch2 == '\n') { // LF
                        // CRLF
                        eol = p + 2;
                        _lfCount++;
                        _crCount++;
                        _crlfCount++;
                    } else {
                        // just CR
                        eol = p + 1;
                        _crCount++;
                    }
                    break;
                } else if (ch == '\n' || ch == 0x2028 || ch == 0x2029) {
                    // single char eoln
                    lastchar = p;
                    eol = p + 1;
                    _lfCount++;
                    break;
                } else if (ch == 0 || ch == 0x001A) {
                    // NUL or 0x1A inside the text is treated as end of file
                    lastchar = p;
                    eol = eof = p + 1;
                    break;
                }
            }
        } while (eol == LINE_POSITION_UNDEFINED);
        uint lineStart = _textPos;
        uint lineEnd = _textPos + lastchar;
        _textPos += eol; // consume text
        if (eof != LINE_POSITION_UNDEFINED) {
            _eof = true;
            if (lineStart >= lineEnd)
                return null; // nothing before the end of file
        }
        // return slice with decoded line
        return _textBuf[lineStart .. lineEnd];
    }

    protected immutable static int TEXT_BUFFER_SIZE = 1024;
    protected immutable static int BYTE_BUFFER_SIZE = 512;
    protected immutable static int QUARTER_BYTE_BUFFER_SIZE = BYTE_BUFFER_SIZE / 4;

    /// Factory method for string parser: reads lines from the UTF-8 text in code
    public static LineStream create(string code, string filename = "") {
        uint len = cast(uint)code.length;
        ubyte[] data = new ubyte[len + 3];
        // prepend a UTF-8 BOM so that the stream factory selects the UTF-8 decoder
        data[0] = 0xEF;
        data[1] = 0xBB;
        data[2] = 0xBF;
        foreach (i; 0 .. len)
            data[i + 3] = code[i];
        InputStream stream = new MemoryInputStream(data);
        return create(stream, filename);
    }

    /**
        Factory for InputStream parser.

        The decoder is chosen from the byte order mark at the start of the stream. Without
        a BOM the text is decoded as UTF-8 when autodetectUTFIfNoBOM is true, otherwise as ASCII.

        Returns: new line stream, or null if stream is not open.
    */
    public static LineStream create(InputStream stream, string filename, bool autodetectUTFIfNoBOM = true) {
        ubyte[] buf = new ubyte[BYTE_BUFFER_SIZE];
        buf[0] = buf[1] = buf[2] = buf[3] = 0;
        if (!stream.isOpen)
            return null;
        uint len = cast(uint)stream.read(buf);
        LineStream res = null;
        // UTF-32 marks are tested before UTF-16 ones: FF FE 00 00 also starts with FF FE
        if (buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) {
            res = new Utf8LineStream(stream, filename, buf, len, 3);
        } else if (buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF) {
            res = new Utf32beLineStream(stream, filename, buf, len);
        } else if (buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00) {
            res = new Utf32leLineStream(stream, filename, buf, len);
        } else if (buf[0] == 0xFE && buf[1] == 0xFF) {
            res = new Utf16beLineStream(stream, filename, buf, len);
        } else if (buf[0] == 0xFF && buf[1] == 0xFE) {
            res = new Utf16leLineStream(stream, filename, buf, len);
        }
        if (res) {
            res._bomDetected = true;
        } else if (autodetectUTFIfNoBOM) {
            res = new Utf8LineStream(stream, filename, buf, len, 0);
        } else {
            res = new AsciiLineStream(stream, filename, buf, len);
        }
        return res;
    }

    /// set by decoders when they stop at a byte sequence that is invalid for the encoding
    protected bool invalidCharFlag;

    /// record an INVALID_CHARACTER error at the current line, just after the text decoded so far
    protected void invalidCharError() {
        uint pos = _textLen - _textPos + 1;
        setError(ErrorCodes.INVALID_CHARACTER, "Invalid character in line " ~ to!string(_line) ~ ":" ~ to!string(pos), _line, pos);
    }
}
534
535
/// Decoder for plain ASCII text: any byte with the high bit set is an invalid character
private class AsciiLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.ASCII, buf, 0, len);
    }

    /// Copies available bytes to the text buffer, stopping at the first byte >= 0x80.
    /// Returns the number of bytes that were available (not the number copied),
    /// or 0 when there is nothing to decode or an error has been recorded.
    override uint decodeText() {
        if (invalidCharFlag) {
            // an invalid byte was found by the previous call: turn it into an error now
            invalidCharError();
            return 0;
        }
        immutable uint available = readBytes();
        ubyte* src = _buf.ptr + _pos;
        if (available == 0)
            return 0; // nothing to decode
        dchar* dst = reserveTextBuf(available);
        uint copied = 0;
        while (copied < available) {
            immutable ubyte c = src[copied];
            if (c & 0x80) {
                // invalid character
                invalidCharFlag = true;
                break;
            }
            dst[copied] = c;
            copied++;
        }
        consumedBytes(copied);
        appendedText(copied);
        return available;
    }

}
568
/// Decoder for UTF-8 text
private class Utf8LineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len, int skip) {
        super(stream, filename, EncodingType.UTF8, buf, skip, len);
    }

    /**
        Decode one UTF-8 sequence starting at b.

        Params:
            b = pointer to the first byte of the sequence
            bleft = number of bytes available at b
            ch = receives the decoded code point
            needMoreFlag = set to true when the sequence needs more than bleft bytes

        Returns: number of bytes consumed, or 0 when nothing was decoded: either more
        bytes are needed (needMoreFlag is true) or the sequence is invalid (bad lead or
        continuation byte, surrogate code point, or value above 0x10FFFF).
    */
    uint decodeBytes(ubyte* b, in uint bleft, out uint ch, out bool needMoreFlag) {
        immutable uint lead = b[0];
        uint seqLen;    // total length of the sequence in bytes
        uint value;     // payload bits collected so far
        if (!(lead & 0x80)) {
            // 0xxxxxxx: single byte
            ch = lead;
            return 1;
        } else if ((lead & 0xE0) == 0xC0) {
            // two bytes 110xxxxx 10xxxxxx
            seqLen = 2;
            value = lead & 0x1F;
        } else if ((lead & 0xF0) == 0xE0) {
            // three bytes 1110xxxx 10xxxxxx 10xxxxxx
            seqLen = 3;
            value = lead & 0x0F;
        } else if ((lead & 0xF8) == 0xF0) {
            // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            seqLen = 4;
            value = lead & 0x07;
        } else if ((lead & 0xFC) == 0xF8) {
            // five bytes 111110xx + 4 continuation bytes
            seqLen = 5;
            value = lead & 0x03;
        } else if ((lead & 0xFE) == 0xFC) {
            // six bytes 1111110x + 5 continuation bytes
            seqLen = 6;
            value = lead & 0x01;
        } else {
            // stray continuation byte (10xxxxxx) or 0xFE / 0xFF: not a valid lead byte
            return 0;
        }
        if (bleft < seqLen) {
            needMoreFlag = true;
            return 0;
        }
        foreach (k; 1 .. seqLen) {
            immutable uint cont = b[k];
            if ((cont & 0xC0) != 0x80)
                return 0; // not a continuation byte
            value = (value << 6) | (cont & 0x3F);
        }
        ch = value;
        if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF))
            return 0; // surrogate code point or out of unicode range
        return seqLen;
    }

    /// this constructor was created for unittests only
    protected this() {

    }

    unittest {
        auto o = new Utf8LineStream();
        ubyte[] buffer = new ubyte[4];
        ubyte * bytes = buffer.ptr;
        uint ch;
        bool needMoreFlag;
        uint bread;

        //convert simple character
        buffer[0] = '/';
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 1);
        assert(ch == '/');

        //convert 2byte character
        buffer[0] = 0xc3;
        buffer[1] = 0x84;
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 2);
        assert(ch == 'Ä');

        //convert 3byte character
        buffer[0] = 0xe0;
        buffer[1] = 0xa4;
        buffer[2] = 0xb4;
        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        assert(ch == 'ऴ');

        //regression test for https://github.com/buggins/dlangide/issues/65
        buffer[0] = 0xEB;
        buffer[1] = 0xB8;
        buffer[2] = 0x94;
        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        assert(ch == '블');

        //convert 4byte character (U+1F600)
        buffer[0] = 0xF0;
        buffer[1] = 0x9F;
        buffer[2] = 0x98;
        buffer[3] = 0x80;
        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,4,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 4);
        assert(ch == 0x1F600);

        //stray continuation byte is invalid, not "need more"
        buffer[0] = 0x80;
        bread = o.decodeBytes(bytes,4,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 0);
    }

    /// Decodes the available bytes into the text buffer, one dchar per code point.
    /// Returns the number of characters decoded. Stops at an incomplete trailing
    /// sequence (left in the byte buffer) or at an invalid one (sets invalidCharFlag).
    override uint decodeText() {
        //number of bytes available
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode

        if (invalidCharFlag) {
            // an invalid sequence was found by the previous call: turn it into an error now
            invalidCharError();
            return 0;
        }
        ubyte* b = _buf.ptr + _pos;
        uint chars = 0;
        // every code point takes at least one byte, so len characters is the upper bound
        dchar* text = reserveTextBuf(len);
        uint i = 0;

        while (i < len) {
            uint ch = 0;
            bool needMoreFlag = false;
            uint bread = decodeBytes(b + i, len - i, ch, needMoreFlag);

            if (needMoreFlag) {
                // sequence continues past the end of the buffered bytes
                break;
            }

            if (bread == 0) {
                // invalid sequence: stop processing here
                invalidCharFlag = true;
                break;
            }

            // the text buffer is UTF-32: store the code point as is, including
            // supplementary-plane characters (no surrogate pairs in a dchar array)
            text[chars++] = cast(dchar)ch;
            i += bread;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
771
/// Decoder for UTF-16 big endian text
private class Utf16beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        // offset 2 skips the byte order mark
        super(stream, filename, EncodingType.UTF16BE, buf, 2, len);
    }

    /// Decodes the available bytes into the text buffer, one dchar per code point.
    /// Surrogate pairs are combined into a single code point; a lone or misordered
    /// surrogate is an invalid character. Returns the number of characters decoded.
    override uint decodeText() {
        if (invalidCharFlag) {
            // an invalid sequence was found by the previous call: turn it into an error now
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        ubyte* b = _buf.ptr + _pos;
        if (len == 0)
            return 0; // nothing to decode
        uint chars = 0;
        dchar* text = reserveTextBuf(len / 2 + 1);
        uint i = 0;
        while (i + 1 < len) {
            uint ch = (cast(uint)b[i] << 8) | b[i + 1];
            uint used = 2;
            if (ch >= 0xD800 && ch < 0xDC00) {
                // high surrogate: the low surrogate must follow
                if (i + 3 >= len)
                    break; // second unit not buffered yet; flagged below if the stream has ended
                uint lo = (cast(uint)b[i + 2] << 8) | b[i + 3];
                if (lo < 0xDC00 || lo >= 0xE000) {
                    invalidCharFlag = true;
                    break;
                }
                ch = 0x10000 + ((ch - 0xD800) << 10) + (lo - 0xDC00);
                used = 4;
            } else if (ch >= 0xDC00 && ch < 0xE000) {
                // low surrogate without a preceding high surrogate
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
            i += used;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
805
/// Decoder for UTF-16 little endian text
private class Utf16leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        // offset 2 skips the byte order mark
        super(stream, filename, EncodingType.UTF16LE, buf, 2, len);
    }

    /// Decodes the available bytes into the text buffer, one dchar per code point.
    /// Surrogate pairs are combined into a single code point; a lone or misordered
    /// surrogate is an invalid character. Returns the number of characters decoded.
    override uint decodeText() {
        if (invalidCharFlag) {
            // an invalid sequence was found by the previous call: turn it into an error now
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        ubyte* b = _buf.ptr + _pos;
        if (len == 0)
            return 0; // nothing to decode
        uint chars = 0;
        dchar* text = reserveTextBuf(len / 2 + 1);
        uint i = 0;
        while (i + 1 < len) {
            uint ch = (cast(uint)b[i + 1] << 8) | b[i];
            uint used = 2;
            if (ch >= 0xD800 && ch < 0xDC00) {
                // high surrogate: the low surrogate must follow
                if (i + 3 >= len)
                    break; // second unit not buffered yet; flagged below if the stream has ended
                uint lo = (cast(uint)b[i + 3] << 8) | b[i + 2];
                if (lo < 0xDC00 || lo >= 0xE000) {
                    invalidCharFlag = true;
                    break;
                }
                ch = 0x10000 + ((ch - 0xD800) << 10) + (lo - 0xDC00);
                used = 4;
            } else if (ch >= 0xDC00 && ch < 0xE000) {
                // low surrogate without a preceding high surrogate
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
            i += used;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
839
/// Decoder for UTF-32 big endian text
private class Utf32beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        // offset 4 skips the byte order mark
        super(stream, filename, EncodingType.UTF32BE, buf, 4, len);
    }

    /// Decodes the available bytes into the text buffer, four bytes per character.
    /// Surrogate code points and values above 0x10FFFF are invalid characters.
    /// Returns the number of characters decoded.
    override uint decodeText() {
        if (invalidCharFlag) {
            // an invalid character was found by the previous call: turn it into an error now
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        ubyte* b = _buf.ptr + _pos;
        if (len == 0)
            return 0; // nothing to decode
        uint chars = 0;
        // four bytes per character
        dchar* text = reserveTextBuf(len / 4 + 1);
        uint i = 0;
        // "i + 3 < len" rather than "i < len - 3": len is unsigned, so len - 3 would
        // wrap around for len < 3 and the loop would run far past the buffered bytes
        for (; i + 3 < len; i += 4) {
            uint ch = (cast(uint)b[i] << 24) | (cast(uint)b[i + 1] << 16)
                    | (cast(uint)b[i + 2] << 8) | b[i + 3];
            if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
878
/// Decoder for UTF-32 little endian text
private class Utf32leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        // offset 4 skips the byte order mark
        super(stream, filename, EncodingType.UTF32LE, buf, 4, len);
    }

    /// Decodes the available bytes into the text buffer, four bytes per character.
    /// Surrogate code points and values above 0x10FFFF are invalid characters.
    /// Returns the number of characters decoded.
    override uint decodeText() {
        if (invalidCharFlag) {
            // an invalid character was found by the previous call: turn it into an error now
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        ubyte* b = _buf.ptr + _pos;
        if (len == 0)
            return 0; // nothing to decode
        uint chars = 0;
        // four bytes per character
        dchar* text = reserveTextBuf(len / 4 + 1);
        uint i = 0;
        // "i + 3 < len" rather than "i < len - 3": len is unsigned, so len - 3 would
        // wrap around for len < 3 and the loop would run far past the buffered bytes
        for (; i + 3 < len; i += 4) {
            uint ch = (cast(uint)b[i + 3] << 24) | (cast(uint)b[i + 2] << 16)
                    | (cast(uint)b[i + 1] << 8) | b[i];
            if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
917
918
unittest {
    // Manual smoke test, compiled out by `static if (false)`: it reads a file from a
    // hard-coded path and prints every line. Point fname at one of the sample files
    // (utf8, utf16be, utf16le, utf32be, utf32le) and enable the block to run it.
    static if (false) {
        import std.stdio;
        import std.conv;
        import std.utf;
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf8.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16be.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16le.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32be.d";
        string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32le.d";
        writeln("opening file");
        std.stream.File f = new std.stream.File(fname);
        scope(exit) { f.close(); }
        try {
            LineStream lines = LineStream.create(f, fname);
            while (true) {
                dchar[] s = lines.readLine();
                if (s is null)
                    break; // end of file or error
                writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
            }
            if (lines.errorCode == 0) {
                writeln("EOF reached");
            } else {
                writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
            }
        } catch (Exception e) {
            writeln("Exception " ~ e.toString);
        }
    }
}
952 // LAST LINE