1 // Written in the D programming language.
2 
3 /**
4 
5 This module contains text stream reader implementation
6 
7 Implements class LineStream for reading of unicode text from stream and returning it by lines.
8 
Supports UTF-8, UTF-16 and UTF-32 (big and little endian) encodings, and line endings according to the D language source file specification.
10 
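Low resource consumption: lines are returned as slices of an internal buffer, so reading does not flood the GC with allocations. Dup a line if you want to store it somewhere, because the buffer is reused by later reads.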
12 
13 Tracks line number.
14 
15 
16 Synopsis:
17 
18 ----
19 import dlangui.core.linestream;
20 
21 import std.stdio;
22 import std.conv;
23 import std.utf;
24 string fname = "somefile.d";
25 writeln("opening file");
26 std.stream.File f = new std.stream.File(fname);
27 scope(exit) { f.close(); }
28 try {
29     LineStream lines = LineStream.create(f, fname);
30     for (;;) {
31         dchar[] s = lines.readLine();
32         if (s is null)
33             break;
34         writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
35     }
36     if (lines.errorCode != 0) {
37         writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
38     } else {
39         writeln("EOF reached");
40     }
41 } catch (Exception e) {
42     writeln("Exception " ~ e.toString);
43 }
44 
45 ----
46 
47 Copyright: Vadim Lopatin, 2014
48 License:   Boost License 1.0
49 Authors:   Vadim Lopatin, coolreader.org@gmail.com
50 */
51 module dlangui.core.linestream;
52 
53 import dlangui.core.streams;
54 //import std.stream;
55 import std.stdio;
56 import std.conv;
57 import std.utf;
58 
/// Character encoding of a text file
public enum EncodingType : int {
    /// UTF-8
    UTF8 = 0,
    /// UTF-16, big endian byte order
    UTF16BE = 1,
    /// UTF-16, little endian byte order
    UTF16LE = 2,
    /// UTF-32, big endian byte order
    UTF32BE = 3,
    /// UTF-32, little endian byte order
    UTF32LE = 4,
    /// plain ASCII (every character code must be <= 127)
    ASCII = 5,
    /// encoding is not known
    UNKNOWN = 6
}
/// Style of line terminators used in a text file
public enum LineEnding : int {
    /// LF (0x0A) - unix style
    LF = 0,
    /// CR followed by LF (0x0D,0x0A) - windows style
    CRLF = 1,
    /// CR (0x0D) - mac style
    CR = 2,
    /// line ending style is not known
    UNKNOWN = 3,
    /// more than one line ending style was detected
    MIXED = 4
}
89 
/// Describes how a text file is stored: encoding, line terminators and BOM presence
struct TextFileFormat {
    /// character encoding
    EncodingType encoding;
    /// line ending style
    LineEnding lineEnding;
    /// true if the file has (or should be written with) a byte order mark
    bool bom;
    /// Returns "<encoding> <lineEnding>", followed by " bom" when the BOM flag is set
    string toString() const {
        string text = to!string(encoding);
        text ~= " ";
        text ~= to!string(lineEnding);
        if (bom)
            text ~= " bom";
        return text;
    }
}
102 
/// Line-oriented text writer which encodes output according to a TextFileFormat
class OutputLineStream {
    protected OutputStream _stream;
    protected string _filename;
    protected TextFileFormat _format;
    protected bool _firstLine;
    protected char[] _buf;
    protected int _len;
    protected static immutable int MAX_BUFFER_SIZE = 0x10000; // 64K

    /**
        Creates a writer on top of an output stream.

        Params:
            stream = destination stream
            filename = name associated with the stream (only stored)
            format = requested format; ASCII/UNKNOWN encodings are replaced by UTF8, and
                     UNKNOWN/MIXED line endings by the platform default (CRLF on Windows, LF otherwise)
    */
    this(OutputStream stream, string filename, TextFileFormat format) {
        _stream = stream;
        _filename = filename;
        _format = format;
        _firstLine = true;
        // normalize encodings which cannot be written as is
        immutable bool encodingUndecided =
            _format.encoding == EncodingType.ASCII || _format.encoding == EncodingType.UNKNOWN;
        if (encodingUndecided)
            _format.encoding = EncodingType.UTF8;
        // normalize line endings which do not name a single style
        immutable bool lineEndingUndecided =
            _format.lineEnding == LineEnding.MIXED || _format.lineEnding == LineEnding.UNKNOWN;
        if (lineEndingUndecided) {
            version (Windows) {
                _format.lineEnding = LineEnding.CRLF;
            } else {
                _format.lineEnding = LineEnding.LF;
            }
        }
    }

    /// Writes buffered bytes (if any) to the underlying stream and empties the buffer
    protected void flush() {
        if (_len <= 0)
            return;
        _stream.write(cast(ubyte[])_buf[0 .. _len]);
        _len = 0;
    }

    /// Encodes text with the configured encoding and appends it to the buffer;
    /// the buffer is flushed once it holds more than MAX_BUFFER_SIZE bytes
    protected void convertAndWrite(dstring s) {
        // worst case is 4 bytes per code point; 4 more bytes are kept as spare space
        immutable size_t required = _len + s.length * 4 + 4;
        if (_buf.length < required)
            _buf.length = required;
        switch (_format.encoding) {
            case EncodingType.UTF16BE:
                wchar[2] units;
                foreach (dchar c; s) {
                    immutable int count = cast(int)encode(units, c);
                    foreach (wchar u; units[0 .. count]) {
                        // high byte first
                        _buf[_len++] = cast(char)(u >> 8);
                        _buf[_len++] = cast(char)(u & 0xFF);
                    }
                }
                break;
            case EncodingType.UTF16LE:
                wchar[2] units;
                foreach (dchar c; s) {
                    immutable int count = cast(int)encode(units, c);
                    foreach (wchar u; units[0 .. count]) {
                        // low byte first
                        _buf[_len++] = cast(char)(u & 0xFF);
                        _buf[_len++] = cast(char)(u >> 8);
                    }
                }
                break;
            case EncodingType.UTF32LE:
                foreach (dchar c; s) {
                    // least significant byte first
                    foreach (k; 0 .. 4)
                        _buf[_len++] = cast(char)((c >> (8 * k)) & 0xFF);
                }
                break;
            case EncodingType.UTF32BE:
                foreach (dchar c; s) {
                    // most significant byte first
                    foreach (k; 0 .. 4)
                        _buf[_len++] = cast(char)((c >> (24 - 8 * k)) & 0xFF);
                }
                break;
            case EncodingType.UTF8:
            default:
                char[4] encoded;
                foreach (dchar c; s) {
                    immutable int count = cast(int)encode(encoded, c);
                    foreach (char b; encoded[0 .. count])
                        _buf[_len++] = b;
                }
                break;
        }
        if (_len > MAX_BUFFER_SIZE)
            flush();
    }

    /// Writes one line followed by the configured line terminator;
    /// the first call also emits the BOM when the format requests one
    void writeLine(dstring line) {
        if (_firstLine) {
            if (_format.bom)
                convertAndWrite("\uFEFF"d); // write BOM
            _firstLine = false;
        }
        convertAndWrite(line);
        dstring terminator;
        if (_format.lineEnding == LineEnding.LF)
            terminator = "\n"d;
        else if (_format.lineEnding == LineEnding.CR)
            terminator = "\r"d;
        else
            terminator = "\r\n"d; // CRLF and anything else
        convertAndWrite(terminator);
    }

    /// Flushes pending output, closes the underlying stream and releases the buffer
    void close() {
        flush();
        _stream.close();
        _buf = null;
    }
}
223 
224 
/**
    Supports reading of a file (or a string in memory) line by line.

    Supports UTF-8, UTF-16 and UTF-32 (big and little endian) encodings, and line endings
    according to the D language source file specification.

    Low resource consumption: lines are returned as slices of an internal text buffer, so
    reading does not flood the GC with allocations. Dup a line if you want to store it
    somewhere, because the buffer is reused by later reads.

    Tracks line number.
*/
class LineStream {

    /// Error codes
    public enum ErrorCodes {
        /// invalid character for current encoding
        /// (numbering starts at 1 because errorCode == 0 means "no error")
        INVALID_CHARACTER = 1
    }

    private InputStream _stream;
    private string _filename;
    private ubyte[] _buf;  // stream reading buffer
    private uint _pos; // reading position of stream buffer
    private uint _len; // number of bytes in stream buffer
    private bool _streamEof; // true if input stream is in EOF state
    private uint _line; // current line number

    private uint _textPos; // start of text line in text buffer
    private uint _textLen; // position of last filled char in text buffer + 1
    private dchar[] _textBuf; // text buffer
    private bool _eof; // end of file, no more lines
    protected bool _bomDetected; // set by create() when the stream starts with a BOM
    protected int _crCount;   // number of CR characters seen as (part of) line terminators
    protected int _lfCount;   // number of LF (and U+2028 / U+2029) line terminators seen
    protected int _crlfCount; // number of CR LF pairs seen

    /// Returns file name
    @property string filename() { return _filename; }
    /// Returns current line number
    @property uint line() { return _line; }
    /// Returns file encoding EncodingType
    @property EncodingType encoding() { return _encoding; }

    /**
        Returns the text format detected so far: encoding, BOM presence and line ending
        style. The line ending style is derived from the terminators of the lines that
        have already been read.
    */
    @property TextFileFormat textFormat() {
        LineEnding le;
        if (_crCount == 0 && _lfCount == 0) {
            // no line terminator has been seen yet
            le = LineEnding.UNKNOWN;
        } else if (_crlfCount) {
            // pure CRLF only if every CR and every LF belongs to a CR LF pair
            if (_crCount == _crlfCount && _lfCount == _crlfCount)
                le = LineEnding.CRLF;
            else
                le = LineEnding.MIXED;
        } else if (_crCount > _lfCount) {
            le = LineEnding.CR;
        } else if (_lfCount > _crCount) {
            le = LineEnding.LF;
        } else {
            le = LineEnding.MIXED;
        }
        TextFileFormat res = TextFileFormat(_encoding, le, _bomDetected);
        return res;
    }


    /// Returns error code (0 if no error was detected)
    @property int errorCode() { return _errorCode; }
    /// Returns error message
    @property string errorMessage() { return _errorMessage; }
    /// Returns line where error is found
    @property int errorLine() { return _errorLine; }
    /// Returns line position (number of character in line) where error is found
    @property int errorPos() { return _errorPos; }

    private immutable EncodingType _encoding;

    private int _errorCode;
    private string _errorMessage;
    private uint _errorLine;
    private uint _errorPos;

    /**
        Open file with known encoding.

        Params:
            stream = source stream
            filename = name associated with the stream (only stored)
            encoding = encoding reported by the encoding property
            buf = byte buffer, already holding the first len bytes of the stream
            offset = index of the first byte in buf to decode (used to skip a BOM)
            len = number of valid bytes in buf
    */
    protected this(InputStream stream, string filename, EncodingType encoding, ubyte[] buf, uint offset, uint len) {
        _filename = filename;
        _stream = stream;
        _encoding = encoding;
        _buf = buf;
        _len = len;
        _pos = offset;
        _streamEof = _stream.eof;
    }

    /// this constructor was created for unittests only
    protected this(){
        _encoding = EncodingType.UTF8;
    }

    /// Refills the byte buffer from the stream when it runs low and returns the number of
    /// unconsumed bytes available in it (they start at _buf[_pos])
    protected uint readBytes() {
        uint bytesLeft = _len - _pos;
        if (_streamEof || bytesLeft > QUARTER_BYTE_BUFFER_SIZE)
            return bytesLeft;
        if (_pos > 0) {
            // move unconsumed bytes to the beginning of the buffer to make room for reading
            foreach(i; 0 .. bytesLeft)
                _buf[i] = _buf[i + _pos];
            _len = bytesLeft;
            _pos = 0;
        }
        uint bytesRead = cast(uint)_stream.read(_buf[_len .. BYTE_BUFFER_SIZE]);
        _len += bytesRead;
        _streamEof = _stream.eof;
        return _len - _pos; //_buf[_pos .. _len];
    }

    // when bytes consumed from byte buffer, call this method to update position
    protected void consumedBytes(uint count) {
        _pos += count;
    }

    // reserve text buffer for specified number of characters, and return pointer to first free character in buffer
    // (may move pending text inside the buffer or reallocate it, which invalidates previously returned slices)
    protected dchar * reserveTextBuf(uint len) {
        // create new text buffer if necessary
        if (_textBuf == null) {
            if (len < TEXT_BUFFER_SIZE)
                len = TEXT_BUFFER_SIZE;
            _textBuf = new dchar[len];
            return _textBuf.ptr;
        }
        uint spaceLeft = cast(uint)_textBuf.length - _textLen;
        if (spaceLeft >= len)
            return _textBuf.ptr + _textLen;
        // move text to beginning of buffer, if necessary
        if (_textPos > _textBuf.length / 2) {
            uint charCount = _textLen - _textPos;
            dchar * p = _textBuf.ptr;
            foreach(i; 0 .. charCount)
                p[i] = p[i + _textPos];
            _textLen = charCount;
            _textPos = 0;
        }
        // resize buffer if necessary
        if (_textLen + len > _textBuf.length) {
            // resize buffer: at least double it
            uint newsize = cast(uint)_textBuf.length * 2;
            if (newsize < _textLen + len)
                newsize = _textLen + len;
            _textBuf.length = newsize;
        }
        return _textBuf.ptr + _textLen;
    }

    // call after writing decoded characters into the space returned by reserveTextBuf
    protected void appendedText(uint len) {
        //writeln("appended ", len, " chars of text"); //:", _textBuf[_textLen .. _textLen + len]);
        _textLen += len;
    }

    // store error information; once the error code is non-zero readLine returns null
    protected void setError(int code, string message, uint errorLine, uint errorPos) {
        _errorCode = code;
        _errorMessage = message;
        _errorLine = errorLine;
        _errorPos = errorPos;
    }

    // override to decode text: decode bytes from the byte buffer into the text buffer
    // and return the number of characters appended (readLine treats 0 as end of input)
    protected abstract uint decodeText();

    /// Unknown line position
    immutable static uint LINE_POSITION_UNDEFINED = uint.max;

    /**
        Read line from stream.

        Lines end at CR, LF, CR LF, U+2028 or U+2029; a 0x00 or 0x1A character is treated
        as end of file.

        Returns: slice of the internal text buffer holding the line without its terminator
                 (valid only until the next call), or null at end of file or on error -
                 check errorCode to tell these apart.
    */
    public dchar[] readLine() {
        if (_errorCode != 0) {
            //writeln("error ", _errorCode, ": ", _errorMessage, " in line ", _errorLine);
            return null; // error detected
        }
        if (_eof) {
            //writeln("EOF found");
            return null;
        }
        _line++;
        uint p = 0;
        uint eol = LINE_POSITION_UNDEFINED;
        uint eof = LINE_POSITION_UNDEFINED;
        uint lastchar = LINE_POSITION_UNDEFINED;
        do {
            if (_errorCode != 0) {
                //writeln("error ", _errorCode, ": ", _errorMessage, " in line ", _errorLine);
                return null; // error detected
            }
            uint charsLeft = _textLen - _textPos;
            if (p >= charsLeft) {
                uint decodedChars = decodeText();
                if (decodedChars == 0 && invalidCharFlag && _errorCode == 0) {
                    // decoder stopped at an invalid character before producing any text:
                    // report it now, otherwise it would be mistaken for end of file below
                    invalidCharError();
                }
                if (_errorCode != 0) {
                    return null; // error detected
                }
                charsLeft = _textLen - _textPos;
                if (decodedChars == 0) {
                    // nothing more to decode: the rest of the buffer is the last line
                    eol = charsLeft;
                    eof = charsLeft;
                    lastchar = charsLeft;
                    break;
                }
            }
            for (; p < charsLeft; p++) {
                dchar ch = _textBuf[_textPos + p];
                if (ch == '\r') { // CR
                    lastchar = p;
                    if (p == charsLeft - 1) {
                        // need one more char to check if it's 0D0A or just 0D eol
                        //writeln("read one more char for 0D0A detection");
                        decodeText();
                        if (_errorCode != 0) {
                            return null; // error detected
                        }
                        charsLeft = _textLen - _textPos;
                    }
                    dchar ch2 = (p < charsLeft - 1) ? _textBuf[_textPos + p + 1] : 0;
                    if (ch2 == '\n') { // LF
                        // CRLF
                        eol = p + 2;
                        _lfCount++;
                        _crCount++;
                        _crlfCount++;
                    } else {
                        // just CR
                        eol = p + 1;
                        _crCount++;
                    }
                    break;
                } else if (ch == '\n' || ch == 0x2028 || ch == 0x2029) {
                    // single char eoln
                    lastchar = p;
                    eol = p + 1;
                    _lfCount++;
                    break;
                } else if (ch == 0 || ch == 0x001A) {
                    // eof
                    //writeln("EOF char found");
                    lastchar = p;
                    eol = eof = p + 1;
                    break;
                }
            }
        } while (eol == LINE_POSITION_UNDEFINED);
        uint lineStart = _textPos;
        uint lineEnd = _textPos + lastchar;
        _textPos += eol; // consume text
        if (eof != LINE_POSITION_UNDEFINED) {
            _eof = true;
            //writeln("Setting eof flag. lastchar=", lastchar, ", p=", p, ", lineStart=", lineStart);
            if (lineStart >= lineEnd) {
                //writeln("lineStart >= lineEnd -- treat as eof");
                return null; // eof
            }
        }
        // return slice with decoded line
        return _textBuf[lineStart .. lineEnd];
    }

    protected immutable static int TEXT_BUFFER_SIZE = 1024;
    protected immutable static int BYTE_BUFFER_SIZE = 512;
    protected immutable static int QUARTER_BYTE_BUFFER_SIZE = BYTE_BUFFER_SIZE / 4;

    /// Factory method for string parser: wraps the UTF-8 string (prefixed with a UTF-8 BOM)
    /// into a memory stream and creates a line stream for it
    public static LineStream create(string code, string filename = "") {
        uint len = cast(uint)code.length;
        ubyte[] data = new ubyte[len + 3];
        foreach(i; 0 .. len)
            data[i + 3] = code[i];
        // BOM for UTF8
        data[0] = 0xEF;
        data[1] = 0xBB;
        data[2] = 0xBF;
        InputStream stream = new MemoryInputStream(data); //new MemoryStream(data);
        return create(stream, filename);
    }

    /**
        Factory for InputStream parser.

        The encoding is chosen by the byte order mark at the start of the stream. When
        there is no BOM, the stream is read as UTF-8 if autodetectUTFIfNoBOM is true and
        as ASCII otherwise.

        Returns: line stream for the detected encoding, or null if the stream is not open.
    */
    public static LineStream create(InputStream stream, string filename, bool autodetectUTFIfNoBOM = true) {
        ubyte[] buf = new ubyte[BYTE_BUFFER_SIZE];
        buf[0] = buf[1] = buf[2]  = buf[3] = 0;
        if (!stream.isOpen)
            return null;
        uint len = cast(uint)stream.read(buf);
        LineStream res = null;
        // UTF-32 BOMs must be checked before UTF-16 ones: FF FE 00 00 starts with the UTF-16 LE BOM
        if (buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) {
            res = new Utf8LineStream(stream, filename, buf, len, 3);
        } else if (buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF) {
            res = new Utf32beLineStream(stream, filename, buf, len);
        } else if (buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00) {
            res = new Utf32leLineStream(stream, filename, buf, len);
        } else if (buf[0] == 0xFE && buf[1] == 0xFF) {
            res =  new Utf16beLineStream(stream, filename, buf, len);
        } else if (buf[0] == 0xFF && buf[1] == 0xFE) {
            res = new Utf16leLineStream(stream, filename, buf, len);
        }
        if (res) {
            res._bomDetected = true;
        } else {
            if (autodetectUTFIfNoBOM) {
                res = new Utf8LineStream(stream, filename, buf, len, 0);
            } else {
                res = new AsciiLineStream(stream, filename, buf, len);
            }
        }
        return res;
    }

    /// set by decoders when they stop at a character which is invalid for the encoding
    protected bool invalidCharFlag;
    /// Records INVALID_CHARACTER error for the current line; the position is the number of
    /// pending characters in the text buffer + 1
    protected void invalidCharError() {
        uint pos = _textLen - _textPos + 1;
        setError(ErrorCodes.INVALID_CHARACTER, "Invalid character in line " ~ to!string(_line) ~ ":" ~ to!string(pos), _line, pos);
    }
}
534 
535 
/// Line stream for plain ASCII text: any byte with the high bit set is an invalid character
private class AsciiLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.ASCII, buf, 0, len);
    }

    /// Copies buffered bytes into the text buffer up to the first non-ASCII byte.
    /// Returns the number of bytes that were available (not the number copied),
    /// or 0 when there is nothing to decode or an invalid character was hit earlier.
    override uint decodeText() {
        // an invalid byte was found by the previous call: turn it into an error now
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        immutable uint available = readBytes();
        if (available == 0)
            return 0; // nothing to decode
        const(ubyte)* src = _buf.ptr + _pos;
        dchar* dst = reserveTextBuf(available);
        uint copied = 0;
        while (copied < available) {
            immutable ubyte code = src[copied];
            if (code >= 0x80) {
                // invalid character: stop here, the error is raised by the next call
                invalidCharFlag = true;
                break;
            }
            dst[copied] = code;
            copied++;
        }
        consumedBytes(copied);
        appendedText(copied);
        return available;
    }

}
568 
/// Line stream for UTF-8 encoded text
private class Utf8LineStream : LineStream {
    /// skip is the number of leading bytes of buf to ignore (3 when a BOM is present, 0 otherwise)
    this(InputStream stream, string filename, ubyte[] buf, uint len, int skip) {
        super(stream, filename, EncodingType.UTF8, buf, skip, len);
    }

    /**
        Decodes a single UTF-8 sequence starting at b.

        Params:
            b = pointer to the first byte of the sequence
            bleft = number of bytes available at b
            ch = receives the decoded code point
            needMoreFlag = set to true when the sequence is longer than bleft bytes

        Returns: number of bytes consumed, or 0 if the sequence is invalid (bad lead or
                 continuation byte, surrogate code point, value above 0x10FFFF) or
                 incomplete (needMoreFlag is set in that case).
    */
    uint decodeBytes(ubyte* b,in uint bleft, out uint ch, out bool needMoreFlag){
        uint bread = 0;
        uint ch0 = b[0];
        if (!(ch0 & 0x80)) {
            // 0x00..0x7F single byte
            // 0x80 == 10000000
            // !(ch0 & 0x80) => ch0 < 10000000
            ch = ch0;
            bread = 1;
        } else if ((ch0 & 0xE0) == 0xC0) {
            // two bytes 110xxxxx 10xxxxxx
            if (bleft < 2) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            if ((ch1 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x1F) << 6) | (ch1 & 0x3F);
            bread = 2;
        } else if ((ch0 & 0xF0) == 0xE0) {
            // three bytes 1110xxxx 10xxxxxx 10xxxxxx
            if (bleft < 3) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x0F) << 12) | ((ch1 & 0x3F) << 6) | (ch2 & 0x3F);
            bread = 3;
        } else if ((ch0 & 0xF8) == 0xF0) {
            // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (bleft < 4) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x07) << 18) | ((ch1 & 0x3F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
            bread = 4;
        } else if ((ch0 & 0xFC) == 0xF8) {
            // five bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (bleft < 5) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            uint ch4 = b[4];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x03) << 24) | ((ch1 & 0x3F) << 18) | ((ch2 & 0x3F) << 12) | ((ch3 & 0x3F) << 6) | (ch4 & 0x3F);
            bread = 5;
        } else if ((ch0 & 0xFE) == 0xFC) {
            // six bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (bleft < 6){
                needMoreFlag = true;
                return 0;
            }

            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            uint ch4 = b[4];
            uint ch5 = b[5];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80 || (ch5 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x01) << 30) | ((ch1 & 0x3F) << 24) | ((ch2 & 0x3F) << 18) | ((ch3 & 0x3F) << 12) | ((ch4 & 0x3F) << 6) | (ch5 & 0x3F);
            bread = 6; // all six bytes of the sequence are consumed
        }
        // lead bytes matching none of the patterns above leave bread == 0 (invalid)
        if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
            return 0;
        }
        return bread;
    }

    /// this constructor was created for unittests only
    protected this(){

    }

    unittest {
        auto o = new Utf8LineStream();
        ubyte[] buffer =  new ubyte[4];
        ubyte * bytes  = buffer.ptr;
        uint ch;
        bool needMoreFlag;
        uint bread;

        //convert simple character
        buffer[0] = '/';
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 1);
        assert(ch == '/');
        //writefln("/ as hex: 0x%32x,0x%32x", ch,'/');


        //convert 2byte character
        buffer[0] = 0xc3;
        buffer[1] = 0x84;
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 2);
        assert(ch == 'Ä');
        //writefln("Ä as hex: 0x%32x,0x%32x", ch,'Ä');

        //convert 3byte character
        buffer[0] = 0xe0;
        buffer[1] = 0xa4;
        buffer[2] = 0xb4;
        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        //writefln("ऴ as hex: 0x%32x,0x%32x", ch,'ऴ');
        assert(ch == 'ऴ');

        //regression test for https://github.com/buggins/dlangide/issues/65
        buffer[0] = 0xEB;
        buffer[1] = 0xB8;
        buffer[2] = 0x94;
        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        //writefln("블 as hex: 0x%32x,0x%32x", ch,'블');
        assert(ch == '블');

        //convert 4byte character (U+1F600, outside of the basic multilingual plane)
        buffer[0] = 0xF0;
        buffer[1] = 0x9F;
        buffer[2] = 0x98;
        buffer[3] = 0x80;
        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,4,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 4);
        assert(ch == 0x1F600);
    }

    /// Decodes buffered UTF-8 bytes into the text buffer, one dchar per code point.
    /// Returns the number of characters appended; decoding stops at the first invalid
    /// sequence (reported as an error by the next call) or at an incomplete sequence
    /// at the end of the buffered bytes.
    override uint decodeText() {
        //number of bytesAvailable
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode

        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        ubyte * bytes = _buf.ptr + _pos;
        ubyte* b = bytes;
        uint chars = 0;
        // every code point takes at least one byte, so len characters is the worst case
        dchar* text = reserveTextBuf(len);
        uint i = 0;

        bool needMoreFlag = false;
        for (; i < len; i++) {
            uint ch = 0;
            uint bleft = len - i;
            uint bread = decodeBytes(b+i,bleft,ch,needMoreFlag);

            if(needMoreFlag){
                //decodeBytes needs more bytes, but no more bytes are left in the buffer
                break;
            }

            if (bread == 0) {
                //decodeBytes could not read any character. stop processing
                invalidCharFlag = true;
                break;
            }

            // the text buffer is UTF-32: store the code point as is, including code
            // points >= 0x10000 (they must not be split into UTF-16 surrogates here)
            text[chars++] = cast(dchar)ch;
            i += bread - 1;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
771 
/// Line stream for UTF-16 big endian text (the 2-byte BOM at the start of the buffer is skipped)
private class Utf16beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF16BE, buf, 2, len);
    }

    /// Decodes buffered UTF-16 BE code units into the text buffer, combining surrogate
    /// pairs into a single code point. Returns the number of characters appended.
    /// Unpaired surrogates and a truncated code unit at the end of the stream are
    /// flagged as invalid characters (reported as an error by the next call).
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint bytesAvailable = readBytes();
        ubyte * bytes = _buf.ptr + _pos;
        if (bytesAvailable == 0)
            return 0; // nothing to decode
        uint len = bytesAvailable;
        uint chars = 0;
        ubyte* b = bytes;
        dchar* text = reserveTextBuf(len / 2 + 1);
        uint i = 0;
        while (i + 1 < len) {
            uint ch = (cast(uint)b[i] << 8) | b[i + 1];
            uint consumed = 2;
            if (ch >= 0xD800 && ch < 0xDC00) {
                // high surrogate: must be followed by a low surrogate
                if (i + 3 >= len)
                    break; // second code unit is not buffered yet (or missing at end of stream)
                uint lo = (cast(uint)b[i + 2] << 8) | b[i + 3];
                if (lo < 0xDC00 || lo >= 0xE000) {
                    invalidCharFlag = true; // unpaired high surrogate
                    break;
                }
                ch = 0x10000 + ((ch - 0xD800) << 10) + (lo - 0xDC00);
                consumed = 4;
            } else if (ch >= 0xDC00 && ch < 0xE000) {
                invalidCharFlag = true; // low surrogate without preceding high surrogate
                break;
            }
            text[chars++] = cast(dchar)ch;
            i += consumed;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete or invalid character at end of stream
        return chars;
    }
}
805 
/// Line stream for UTF-16 little endian text (the 2-byte BOM at the start of the buffer is skipped)
private class Utf16leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF16LE, buf, 2, len);
    }

    /// Decodes buffered UTF-16 LE code units into the text buffer, combining surrogate
    /// pairs into a single code point. Returns the number of characters appended.
    /// Unpaired surrogates and a truncated code unit at the end of the stream are
    /// flagged as invalid characters (reported as an error by the next call).
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint bytesAvailable = readBytes();
        ubyte * bytes = _buf.ptr + _pos;
        if (bytesAvailable == 0)
            return 0; // nothing to decode
        uint len = bytesAvailable;
        uint chars = 0;
        ubyte* b = bytes;
        dchar* text = reserveTextBuf(len / 2 + 1);
        uint i = 0;
        while (i + 1 < len) {
            uint ch = (cast(uint)b[i + 1] << 8) | b[i];
            uint consumed = 2;
            if (ch >= 0xD800 && ch < 0xDC00) {
                // high surrogate: must be followed by a low surrogate
                if (i + 3 >= len)
                    break; // second code unit is not buffered yet (or missing at end of stream)
                uint lo = (cast(uint)b[i + 3] << 8) | b[i + 2];
                if (lo < 0xDC00 || lo >= 0xE000) {
                    invalidCharFlag = true; // unpaired high surrogate
                    break;
                }
                ch = 0x10000 + ((ch - 0xD800) << 10) + (lo - 0xDC00);
                consumed = 4;
            } else if (ch >= 0xDC00 && ch < 0xE000) {
                invalidCharFlag = true; // low surrogate without preceding high surrogate
                break;
            }
            text[chars++] = cast(dchar)ch;
            i += consumed;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete or invalid character at end of stream
        return chars;
    }
}
839 
/// Line stream for UTF-32 big endian text (the 4-byte BOM at the start of the buffer is skipped)
private class Utf32beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF32BE, buf, 4, len);
    }

    /// Decodes buffered UTF-32 BE code units into the text buffer and returns the number
    /// of characters appended. Surrogate values, values above 0x10FFFF and a truncated
    /// code unit at the end of the stream are flagged as invalid characters (reported
    /// as an error by the next call).
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint bytesAvailable = readBytes();
        ubyte * bytes = _buf.ptr + _pos;
        if (bytesAvailable == 0)
            return 0; // nothing to decode
        uint len = bytesAvailable;
        uint chars = 0;
        ubyte* b = bytes;
        dchar* text = reserveTextBuf(len / 4 + 1);
        uint i = 0;
        // "i + 3 < len" instead of "i < len - 3": len is unsigned, so len - 3 wraps around
        // when fewer than 3 bytes are available and the loop would read past the data
        for (; i + 3 < len; i += 4) {
            uint ch0 = b[i];
            uint ch1 = b[i + 1];
            uint ch2 = b[i + 2];
            uint ch3 = b[i + 3];
            uint ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3;
            if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
878 
/// Line stream for UTF-32 little endian text (the 4-byte BOM at the start of the buffer is skipped)
private class Utf32leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.UTF32LE, buf, 4, len);
    }

    /// Decodes buffered UTF-32 LE code units into the text buffer and returns the number
    /// of characters appended. Surrogate values, values above 0x10FFFF and a truncated
    /// code unit at the end of the stream are flagged as invalid characters (reported
    /// as an error by the next call).
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint bytesAvailable = readBytes();
        ubyte * bytes = _buf.ptr + _pos;
        if (bytesAvailable == 0)
            return 0; // nothing to decode
        uint len = bytesAvailable;
        uint chars = 0;
        ubyte* b = bytes;
        dchar* text = reserveTextBuf(len / 4 + 1);
        uint i = 0;
        // "i + 3 < len" instead of "i < len - 3": len is unsigned, so len - 3 wraps around
        // when fewer than 3 bytes are available and the loop would read past the data
        for (; i + 3 < len; i += 4) {
            uint ch3 = b[i];
            uint ch2 = b[i + 1];
            uint ch1 = b[i + 2];
            uint ch0 = b[i + 3];
            uint ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3;
            if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
917 
918 
// Manual smoke test: dumps a file line by line. Disabled with static if (false)
// because it depends on files from the author's machine.
unittest {
    static if (false) {
        import std.stdio;
        import std.conv;
        import std.utf;
        // alternative inputs used during development:
        //string fname = "C:\\projects\\d\\ddc\\ddclexer\\src\\ddc\\lexer\\LineStream.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/" ~ __FILE__; //"/home/lve/src/d/ddc/ddclexer/src/ddc/lexer/Lexer.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf8.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16be.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16le.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32be.d";
        string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32le.d";
        writeln("opening file");
        std.stream.File f = new std.stream.File(fname);
        scope(exit) { f.close(); }
        try {
            LineStream lines = LineStream.create(f, fname);
            // readLine returns null both at end of file and on error
            for (dchar[] s = lines.readLine(); s !is null; s = lines.readLine()) {
                writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
            }
            if (lines.errorCode == 0) {
                writeln("EOF reached");
            } else {
                writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
            }
        } catch (Exception e) {
            writeln("Exception " ~ e.toString);
        }
    }
}
952 // LAST LINE