1 // Written in the D programming language.
3 /**
This module contains a text stream reader implementation.
Implements class LineStream for reading unicode text from a stream and returning it line by line.
Supports utf8, utf16 and utf32 (be and le) encodings, and line endings - according to the D language source file specification.
Low resource consumption. Doesn't flood the GC with allocations. Dup a line if you want to store it somewhere.
Tracks the current line number.
16 Synopsis:
18 ----
19 import dlangui.core.linestream;
21 import std.stdio;
22 import std.conv;
23 import std.utf;
24 string fname = "somefile.d";
25 writeln("opening file");
26 std.stream.File f = new std.stream.File(fname);
27 scope(exit) { f.close(); }
28 try {
29     LineStream lines = LineStream.create(f, fname);
30     for (;;) {
31         dchar[] s = lines.readLine();
32         if (s is null)
33             break;
34         writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
35     }
36     if (lines.errorCode != 0) {
37         writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
38     } else {
39         writeln("EOF reached");
40     }
41 } catch (Exception e) {
42     writeln("Exception " ~ e.toString);
43 }
45 ----
47 Copyright: Vadim Lopatin, 2014
48 License:   Boost License 1.0
49 Authors:   Vadim Lopatin, coolreader.org@gmail.com
50 */
51 module dlangui.core.linestream;
53 import dlangui.core.streams;
54 //import std.stream;
55 import std.stdio;
56 import std.conv;
57 import std.utf;
/// Character encoding of a text file
public enum EncodingType : int {
    /// UTF-8
    UTF8 = 0,
    /// UTF-16, big endian byte order
    UTF16BE = 1,
    /// UTF-16, little endian byte order
    UTF16LE = 2,
    /// UTF-32, big endian byte order
    UTF32BE = 3,
    /// UTF-32, little endian byte order
    UTF32LE = 4,
    /// plain ASCII (every character code must be <= 127)
    ASCII = 5,
    /// the encoding is not known
    UNKNOWN = 6
}
/// Style of the line terminators used by a text file
public enum LineEnding : int {
    /// LF (0x0A) - unix style
    LF = 0,
    /// CR followed by LF (0x0D,0x0A) - windows style
    CRLF = 1,
    /// CR (0x0D) - mac style
    CR = 2,
    /// the line ending style is not known
    UNKNOWN = 3,
    /// more than one line ending style was detected
    MIXED = 4
}
/// Describes the on-disk format of a text file
struct TextFileFormat {
    /// character encoding
    EncodingType encoding;
    /// line ending style
    LineEnding lineEnding;
    /// byte order mark character flag
    bool bom;
    /// Returns the encoding name and the line ending name separated by a space,
    /// followed by " bom" when the byte order mark flag is set.
    string toString() const {
        string description = to!string(encoding);
        description ~= " ";
        description ~= to!string(lineEnding);
        if (bom)
            description ~= " bom";
        return description;
    }
}
/// Line oriented text writer which encodes its output according to a TextFileFormat
class OutputLineStream {
    protected OutputStream _stream;
    protected string _filename;
    protected TextFileFormat _format;
    protected bool _firstLine;
    protected char[] _buf; // encoded bytes which are not yet written to the stream
    protected int _len; // number of pending bytes in _buf
    protected static immutable int MAX_BUFFER_SIZE = 0x10000; // 64K

    /// Creates a writer on top of stream.
    /// An UNKNOWN or ASCII encoding is replaced with UTF8; an UNKNOWN or MIXED line ending
    /// is replaced with CRLF on Windows and LF elsewhere.
    this(OutputStream stream, string filename, TextFileFormat format) {
        _stream = stream;
        _filename = filename;
        _firstLine = true;
        _format = format;
        // fix format
        const bool encodingUnspecified = _format.encoding == EncodingType.ASCII
            || _format.encoding == EncodingType.UNKNOWN;
        if (encodingUnspecified)
            _format.encoding = EncodingType.UTF8;
        const bool endingUnspecified = _format.lineEnding == LineEnding.MIXED
            || _format.lineEnding == LineEnding.UNKNOWN;
        if (endingUnspecified) {
            version (Windows)
                _format.lineEnding = LineEnding.CRLF;
            else
                _format.lineEnding = LineEnding.LF;
        }
    }

    /// Writes all pending bytes to the underlying stream
    protected void flush() {
        if (_len <= 0)
            return;
        _stream.write(cast(ubyte[])_buf[0 .. _len]);
        _len = 0;
    }

    /// Encodes s with the configured character encoding and appends it to the pending buffer;
    /// the buffer is flushed once it holds more than MAX_BUFFER_SIZE bytes.
    protected void convertAndWrite(dstring s) {
        // no supported encoding needs more than 4 bytes per character
        const size_t required = _len + s.length * 4 + 4;
        if (_buf.length < required)
            _buf.length = required;
        switch (_format.encoding) with(EncodingType)
        {
            case UTF16BE:
            case UTF16LE:
                const bool utf16BigEndian = _format.encoding == UTF16BE;
                foreach(dchar ch; s) {
                    wchar[2] units;
                    const size_t unitCount = encode(units, ch);
                    foreach(unit; units[0 .. unitCount]) {
                        const char hi = cast(char)(unit >> 8);
                        const char lo = cast(char)(unit & 0xFF);
                        _buf[_len++] = utf16BigEndian ? hi : lo;
                        _buf[_len++] = utf16BigEndian ? lo : hi;
                    }
                }
                break;
            case UTF32LE:
            case UTF32BE:
                const bool utf32BigEndian = _format.encoding == UTF32BE;
                foreach(dchar ch; s) {
                    foreach(k; 0 .. 4) {
                        // little endian emits the lowest byte first, big endian the highest
                        const int shift = utf32BigEndian ? 24 - 8 * k : 8 * k;
                        _buf[_len++] = cast(char)((ch >> shift) & 0xFF);
                    }
                }
                break;
            case UTF8:
            default:
                foreach(dchar ch; s) {
                    char[4] utf8;
                    const size_t byteCount = encode(utf8, ch);
                    _buf[_len .. _len + byteCount] = utf8[0 .. byteCount];
                    _len += cast(int)byteCount;
                }
                break;
        }
        if (_len > MAX_BUFFER_SIZE)
            flush();
    }

    /// Writes a single line followed by the configured line ending.
    /// The byte order mark, if requested by the format, is written before the first line.
    void writeLine(dstring line) {
        if (_firstLine) {
            if (_format.bom)
                convertAndWrite("\uFEFF"d); // write BOM
            _firstLine = false;
        }
        convertAndWrite(line);
        dstring terminator;
        switch(_format.lineEnding) {
            case LineEnding.LF:
                terminator = "\n"d;
                break;
            case LineEnding.CR:
                terminator = "\r"d;
                break;
            default:
                // CRLF and every other value
                terminator = "\r\n"d;
                break;
        }
        convertAndWrite(terminator);
    }

    /// Flushes pending bytes, closes the underlying stream and releases the buffer
    void close() {
        flush();
        _stream.close();
        _buf = null;
    }
}
/**
    Supports reading of a file (or a string in memory) line by line.

    Supports utf8, utf16 and utf32 (be and le) encodings, and line endings - according to the
    D language source file specification.

    Low resource consumption: readLine() returns a slice of an internal buffer which is reused
    by later calls. Dup the line if you want to store it somewhere.

    Tracks the current line number.
*/
class LineStream {

    /// Error codes
    public enum ErrorCodes {
        /// invalid character for current encoding
        INVALID_CHARACTER = 1 // must be nonzero: errorCode == 0 means "no error"
    }

    private InputStream _stream;
    private string _filename;
    private ubyte[] _buf;  // stream reading buffer
    private uint _pos; // reading position of stream buffer
    private uint _len; // number of bytes in stream buffer
    private bool _streamEof; // true if input stream is in EOF state
    private uint _line; // current line number

    private uint _textPos; // start of text line in text buffer
    private uint _textLen; // position of last filled char in text buffer + 1
    private dchar[] _textBuf; // text buffer
    private bool _eof; // end of file, no more lines
    protected bool _bomDetected; // set by create() when the stream starts with a byte order mark
    protected int _crCount; // line endings containing CR (lone CR and CRLF)
    protected int _lfCount; // CRLF pairs plus single character line endings (LF, U+2028, U+2029)
    protected int _crlfCount; // CRLF pairs

    /// Returns file name
    @property string filename() { return _filename; }
    /// Returns current line number
    @property uint line() { return _line; }
    /// Returns file encoding EncodingType
    @property EncodingType encoding() { return _encoding; }

    /// Returns the text format (encoding, line ending style, BOM flag) based on the lines read so far
    @property TextFileFormat textFormat() {
        LineEnding le = LineEnding.CRLF;
        if (_crlfCount) {
            // pure CRLF only when every CR and every LF seen so far belongs to a CRLF pair
            if (_crCount == _crlfCount && _lfCount == _crlfCount)
                le = LineEnding.CRLF;
            else
                le = LineEnding.MIXED;
        } else if (_crCount > _lfCount) {
            le = LineEnding.CR;
        } else if (_lfCount > _crCount) {
            le = LineEnding.LF;
        } else {
            le = LineEnding.MIXED;
        }
        TextFileFormat res = TextFileFormat(_encoding, le, _bomDetected);
        return res;
    }

    /// Returns error code
    @property int errorCode() { return _errorCode; }
    /// Returns error message
    @property string errorMessage() { return _errorMessage; }
    /// Returns line where error is found
    @property int errorLine() { return _errorLine; }
    /// Returns line position (number of character in line) where error is found
    @property int errorPos() { return _errorPos; }

    private immutable EncodingType _encoding;

    private int _errorCode;
    private string _errorMessage;
    private uint _errorLine;
    private uint _errorPos;

    /// Open file with known encoding.
    /// buf holds the first len bytes already read from stream; offset is the number of
    /// leading bytes (byte order mark) to skip.
    protected this(InputStream stream, string filename, EncodingType encoding, ubyte[] buf, uint offset, uint len) {
        _filename = filename;
        _stream = stream;
        _encoding = encoding;
        _buf = buf;
        _len = len;
        // never start past the end of the data: readBytes() computes _len - _pos in unsigned arithmetic
        _pos = offset < len ? offset : len;
        _streamEof = _stream.eof;
    }

    /// this constructor was created for unittests only
    protected this(){
        _encoding = EncodingType.UTF8;
    }

    /// Refills the byte buffer from the stream when it runs low and returns the number of
    /// unconsumed bytes available starting at _buf[_pos]
    protected uint readBytes() {
        uint bytesLeft = _len - _pos;
        if (_streamEof || bytesLeft > QUARTER_BYTE_BUFFER_SIZE)
            return bytesLeft;
        if (_pos > 0) {
            // move the unconsumed tail to the start of the buffer to make room for new data
            foreach(i; 0 .. bytesLeft)
                _buf[i] = _buf[i + _pos];
            _len = bytesLeft;
            _pos = 0;
        }
        uint bytesRead = cast(uint)_stream.read(_buf[_len .. BYTE_BUFFER_SIZE]);
        _len += bytesRead;
        _streamEof = _stream.eof;
        return _len - _pos; //_buf[_pos .. _len];
    }

    // when bytes consumed from byte buffer, call this method to update position
    protected void consumedBytes(uint count) {
        _pos += count;
    }

    // reserve text buffer for specified number of characters, and return pointer to first free character in buffer
    protected dchar * reserveTextBuf(uint len) {
        // create new text buffer if necessary
        if (_textBuf is null) {
            if (len < TEXT_BUFFER_SIZE)
                len = TEXT_BUFFER_SIZE;
            _textBuf = new dchar[len];
            return _textBuf.ptr;
        }
        uint spaceLeft = cast(uint)_textBuf.length - _textLen;
        if (spaceLeft >= len)
            return _textBuf.ptr + _textLen;
        // move text to beginning of buffer, if necessary
        if (_textPos > _textBuf.length / 2) {
            uint charCount = _textLen - _textPos;
            dchar * p = _textBuf.ptr;
            foreach(i; 0 .. charCount)
                p[i] = p[i + _textPos];
            _textLen = charCount;
            _textPos = 0;
        }
        // resize buffer if necessary
        if (_textLen + len > _textBuf.length) {
            // resize buffer
            uint newsize = cast(uint)_textBuf.length * 2;
            if (newsize < _textLen + len)
                newsize = _textLen + len;
            _textBuf.length = newsize;
        }
        return _textBuf.ptr + _textLen;
    }

    // call after storing len characters at the pointer returned by reserveTextBuf
    protected void appendedText(uint len) {
        _textLen += len;
    }

    /// Stores error details; once an error code is set readLine() returns null
    protected void setError(int code, string message, uint errorLine, uint errorPos) {
        _errorCode = code;
        _errorMessage = message;
        _errorLine = errorLine;
        _errorPos = errorPos;
    }

    // override to decode text: decode bytes obtained from readBytes() into the text buffer
    // and return the number of characters appended
    protected abstract uint decodeText();

    /// Unknown line position
    immutable static uint LINE_POSITION_UNDEFINED = uint.max;

    /// Read line from stream.
    /// Returns a slice of the internal text buffer without the line terminator, or null at
    /// end of file or after an error (check errorCode to tell them apart).
    public dchar[] readLine() {
        if (_errorCode != 0) {
            return null; // error detected
        }
        if (_eof) {
            return null;
        }
        _line++;
        uint p = 0; // scan position relative to _textPos
        uint eol = LINE_POSITION_UNDEFINED; // number of chars to consume, terminator included
        uint eof = LINE_POSITION_UNDEFINED; // defined when end of text was reached
        uint lastchar = LINE_POSITION_UNDEFINED; // length of the line without terminator
        do {
            if (_errorCode != 0) {
                return null; // error detected
            }
            uint charsLeft = _textLen - _textPos;
            if (p >= charsLeft) {
                uint decodedChars = decodeText();
                if (_errorCode != 0) {
                    return null; // error detected
                }
                charsLeft = _textLen - _textPos;
                if (decodedChars == 0) {
                    if (invalidCharFlag) {
                        // the decoder stopped on an invalid or incomplete sequence before producing
                        // any character: report the error instead of mistaking it for end of file
                        invalidCharError();
                        return null;
                    }
                    eol = charsLeft;
                    eof = charsLeft;
                    lastchar = charsLeft;
                    break;
                }
            }
            for (; p < charsLeft; p++) {
                dchar ch = _textBuf[_textPos + p];
                if (ch == '\r') { // CR
                    lastchar = p;
                    if (p == charsLeft - 1) {
                        // need one more char to check if it's 0D0A or just 0D eol
                        decodeText();
                        if (_errorCode != 0) {
                            return null; // error detected
                        }
                        charsLeft = _textLen - _textPos;
                    }
                    dchar ch2 = (p < charsLeft - 1) ? _textBuf[_textPos + p + 1] : 0;
                    if (ch2 == '\n') { // LF
                        // CRLF
                        eol = p + 2;
                        _lfCount++;
                        _crCount++;
                        _crlfCount++;
                    } else {
                        // just CR
                        eol = p + 1;
                        _crCount++;
                    }
                    break;
                } else if (ch == '\n' || ch == 0x2028 || ch == 0x2029) {
                    // single char eoln
                    lastchar = p;
                    eol = p + 1;
                    _lfCount++;
                    break;
                } else if (ch == 0 || ch == 0x001A) {
                    // eof
                    lastchar = p;
                    eol = eof = p + 1;
                    break;
                }
            }
        } while (eol == LINE_POSITION_UNDEFINED);
        uint lineStart = _textPos;
        uint lineEnd = _textPos + lastchar;
        _textPos += eol; // consume text
        if (eof != LINE_POSITION_UNDEFINED) {
            _eof = true;
            if (lineStart >= lineEnd) {
                return null; // eof
            }
        }
        // return slice with decoded line
        return _textBuf[lineStart .. lineEnd];
    }

    protected immutable static int TEXT_BUFFER_SIZE = 1024;
    protected immutable static int BYTE_BUFFER_SIZE = 512;
    protected immutable static int QUARTER_BYTE_BUFFER_SIZE = BYTE_BUFFER_SIZE / 4;

    /// Factory method for string parser
    public static LineStream create(string code, string filename = "") {
        uint len = cast(uint)code.length;
        ubyte[] data = new ubyte[len + 3];
        // BOM for UTF8, so that the stream factory selects the UTF-8 decoder
        data[0] = 0xEF;
        data[1] = 0xBB;
        data[2] = 0xBF;
        data[3 .. 3 + len] = cast(const(ubyte)[])code[0 .. len];
        InputStream stream = new MemoryInputStream(data); //new MemoryStream(data);
        return create(stream, filename);
    }

    /// Factory for InputStream parser.
    /// Detects the encoding by byte order mark. Without a BOM the stream is read as UTF-8
    /// when autodetectUTFIfNoBOM is set, otherwise as ASCII. Returns null if stream is not open.
    public static LineStream create(InputStream stream, string filename, bool autodetectUTFIfNoBOM = true) {
        if (!stream.isOpen)
            return null;
        ubyte[] buf = new ubyte[BYTE_BUFFER_SIZE]; // zero initialized
        uint len = cast(uint)stream.read(buf);
        LineStream res = null;
        // Compare only bytes which were really read. Without the length checks a two byte
        // file FF FE would match the UTF-32 LE mark through the zero padding and start
        // reading past the end of the data.
        if (len >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) {
            res = new Utf8LineStream(stream, filename, buf, len, 3);
        } else if (len >= 4 && buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF) {
            res = new Utf32beLineStream(stream, filename, buf, len);
        } else if (len >= 4 && buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00) {
            res = new Utf32leLineStream(stream, filename, buf, len);
        } else if (len >= 2 && buf[0] == 0xFE && buf[1] == 0xFF) {
            res =  new Utf16beLineStream(stream, filename, buf, len);
        } else if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE) {
            res = new Utf16leLineStream(stream, filename, buf, len);
        }
        if (res) {
            res._bomDetected = true;
        } else {
            if (autodetectUTFIfNoBOM) {
                res = new Utf8LineStream(stream, filename, buf, len, 0);
            } else {
                res = new AsciiLineStream(stream, filename, buf, len);
            }
        }
        return res;
    }

    // set by decoders when they meet an invalid or incomplete character
    protected bool invalidCharFlag;
    /// Sets the INVALID_CHARACTER error for the current line and position
    protected void invalidCharError() {
        uint pos = _textLen - _textPos + 1;
        setError(ErrorCodes.INVALID_CHARACTER, "Invalid character in line " ~ to!string(_line) ~ ":" ~ to!string(pos), _line, pos);
    }
}
/// Decoder for plain ASCII text: every byte must be below 0x80
private class AsciiLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        super(stream, filename, EncodingType.ASCII, buf, 0, len);
    }

    /// Copies buffered bytes into the text buffer up to the first non-ASCII byte.
    /// Note: returns the number of bytes that were available, not the number of characters stored.
    override uint decodeText() {
        if (invalidCharFlag) {
            // an invalid byte was met by the previous call: report it now
            invalidCharError();
            return 0;
        }
        const uint available = readBytes();
        if (available == 0)
            return 0; // nothing to decode
        const(ubyte)* src = _buf.ptr + _pos;
        dchar* dst = reserveTextBuf(available);
        uint copied = 0;
        while (copied < available) {
            const ubyte code = src[copied];
            if (code >= 0x80) {
                // invalid character
                invalidCharFlag = true;
                break;
            }
            dst[copied] = code;
            ++copied;
        }
        consumedBytes(copied);
        appendedText(copied);
        return available;
    }

}
/// Decoder for UTF-8 encoded text
private class Utf8LineStream : LineStream {
    /// skip is the number of leading bytes of buf to ignore (3 when a BOM is present, else 0)
    this(InputStream stream, string filename, ubyte[] buf, uint len, int skip) {
        super(stream, filename, EncodingType.UTF8, buf, skip, len);
    }

    /**
        Decodes a single UTF-8 sequence starting at b.

        Params:
            b = pointer to the first byte of the sequence
            bleft = number of bytes available at b
            ch = receives the decoded code point
            needMoreFlag = set to true when the sequence needs more than bleft bytes

        Returns: number of bytes consumed, or 0 if the sequence is invalid or incomplete
    */
    uint decodeBytes(ubyte* b,in uint bleft, out uint ch, out bool needMoreFlag){
        uint bread = 0;
        uint ch0 = b[0];
        if (!(ch0 & 0x80)) {
            // 0x00..0x7F single byte
            ch = ch0;
            bread = 1;
        } else if ((ch0 & 0xE0) == 0xC0) {
            // two bytes 110xxxxx 10xxxxxx
            if (bleft < 2) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            if ((ch1 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x1F) << 6) | (ch1 & 0x3F);
            bread = 2;
        } else if ((ch0 & 0xF0) == 0xE0) {
            // three bytes 1110xxxx 10xxxxxx 10xxxxxx
            if (bleft < 3) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x0F) << 12) | ((ch1 & 0x3F) << 6) | (ch2 & 0x3F);
            bread = 3;
        } else if ((ch0 & 0xF8) == 0xF0) {
            // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (bleft < 4) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x07) << 18) | ((ch1 & 0x3F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
            bread = 4;
        } else if ((ch0 & 0xFC) == 0xF8) {
            // five bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (bleft < 5) {
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            uint ch4 = b[4];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x03) << 24) | ((ch1 & 0x3F) << 18) | ((ch2 & 0x3F) << 12) | ((ch3 & 0x3F) << 6) | (ch4 & 0x3F);
            bread = 5;
        } else if ((ch0 & 0xFE) == 0xFC) {
            // six bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (bleft < 6){
                needMoreFlag = true;
                return 0;
            }
            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            uint ch4 = b[4];
            uint ch5 = b[5];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80 || (ch5 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x01) << 30) | ((ch1 & 0x3F) << 24) | ((ch2 & 0x3F) << 18) | ((ch3 & 0x3F) << 12) | ((ch4 & 0x3F) << 6) | (ch5 & 0x3F);
            bread = 6; // all six bytes belong to this sequence
        }
        // Lead bytes matching none of the patterns above (stray continuation bytes, 0xFE, 0xFF)
        // leave bread == 0 and are reported as invalid through the return value.
        if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
            // surrogate code points and values above the unicode range are not valid characters
            return 0;
        }
        return bread;
    }

    /// this constructor was created for unittests only
    protected this(){
        super();
    }

    unittest {
        auto o = new Utf8LineStream();
        ubyte[] buffer =  new ubyte[4];
        ubyte * bytes  = buffer.ptr;
        uint ch;
        bool needMoreFlag;
        uint bread;

        //convert simple character
        buffer[0] = '/';
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 1);
        assert(ch == '/');

        //convert 2byte character
        buffer[0] = 0xc3;
        buffer[1] = 0x84;
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 2);
        assert(ch == 'Ä');

        //convert 3byte character
        buffer[0] = 0xe0;
        buffer[1] = 0xa4;
        buffer[2] = 0xb4;
        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        assert(ch == 'ऴ');

        //regression test for https://github.com/buggins/dlangide/issues/65
        buffer[0] = 0xEB;
        buffer[1] = 0xB8;
        buffer[2] = 0x94;
        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        assert(ch == '블');

        //convert 4byte character (U+1F600, outside of the basic multilingual plane)
        buffer[0] = 0xF0;
        buffer[1] = 0x9F;
        buffer[2] = 0x98;
        buffer[3] = 0x80;
        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(needMoreFlag);

        bread = o.decodeBytes(bytes,4,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 4);
        assert(ch == 0x1F600);
    }

    /// Decodes the buffered bytes into the text buffer; returns the number of characters appended.
    /// An invalid sequence sets invalidCharFlag; the error itself is raised by the next call.
    override uint decodeText() {
        //number of bytesAvailable
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode

        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        ubyte* b = _buf.ptr + _pos;
        uint chars = 0;
        // each decoded character consumes at least one byte and is stored as exactly one
        // dchar, so len free slots are always enough
        dchar* text = reserveTextBuf(len);
        uint i = 0;

        bool needMoreFlag = false;
        while (i < len) {
            uint ch = 0;
            uint bread = decodeBytes(b + i, len - i, ch, needMoreFlag);

            if (needMoreFlag) {
                //decodeBytes needs more bytes, but no more bytes are left in the buffer
                break;
            }

            if (bread == 0) {
                //decodeBytes could not read any character. stop processing
                invalidCharFlag = true;
                break;
            }

            // The text buffer holds UTF-32: store the code point as is. Code points above
            // 0xFFFF must not be split into UTF-16 surrogates here.
            text[chars++] = cast(dchar)ch;
            i += bread;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
/// Decoder for UTF-16 big endian encoded text
private class Utf16beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        // skip the two byte BOM
        super(stream, filename, EncodingType.UTF16BE, buf, 2, len);
    }

    /// Decodes the buffered bytes into the text buffer; returns the number of characters appended.
    /// Surrogate pairs are combined into a single code point. An unpaired surrogate sets
    /// invalidCharFlag; the error itself is raised by the next call.
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        uint chars = 0;
        dchar* text = reserveTextBuf(len / 2 + 1);
        uint i = 0;
        while (i + 1 < len) {
            uint ch = (cast(uint)b[i] << 8) | b[i + 1];
            uint used = 2;
            if (ch >= 0xD800 && ch < 0xDC00) {
                // high surrogate: the next code unit must be a low surrogate
                if (i + 3 >= len)
                    break; // second code unit is not buffered yet
                uint lo = (cast(uint)b[i + 2] << 8) | b[i + 3];
                if (lo < 0xDC00 || lo >= 0xE000) {
                    invalidCharFlag = true; // high surrogate without low surrogate
                    break;
                }
                ch = 0x10000 + ((ch - 0xD800) << 10) + (lo - 0xDC00);
                used = 4;
            } else if (ch >= 0xDC00 && ch < 0xE000) {
                invalidCharFlag = true; // low surrogate without preceding high surrogate
                break;
            }
            text[chars++] = cast(dchar)ch;
            i += used;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
/// Decoder for UTF-16 little endian encoded text
private class Utf16leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        // skip the two byte BOM
        super(stream, filename, EncodingType.UTF16LE, buf, 2, len);
    }

    /// Decodes the buffered bytes into the text buffer; returns the number of characters appended.
    /// Surrogate pairs are combined into a single code point. An unpaired surrogate sets
    /// invalidCharFlag; the error itself is raised by the next call.
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        uint chars = 0;
        dchar* text = reserveTextBuf(len / 2 + 1);
        uint i = 0;
        while (i + 1 < len) {
            uint ch = (cast(uint)b[i + 1] << 8) | b[i];
            uint used = 2;
            if (ch >= 0xD800 && ch < 0xDC00) {
                // high surrogate: the next code unit must be a low surrogate
                if (i + 3 >= len)
                    break; // second code unit is not buffered yet
                uint lo = (cast(uint)b[i + 3] << 8) | b[i + 2];
                if (lo < 0xDC00 || lo >= 0xE000) {
                    invalidCharFlag = true; // high surrogate without low surrogate
                    break;
                }
                ch = 0x10000 + ((ch - 0xD800) << 10) + (lo - 0xDC00);
                used = 4;
            } else if (ch >= 0xDC00 && ch < 0xE000) {
                invalidCharFlag = true; // low surrogate without preceding high surrogate
                break;
            }
            text[chars++] = cast(dchar)ch;
            i += used;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
/// Decoder for UTF-32 big endian encoded text
private class Utf32beLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        // skip the four byte BOM
        super(stream, filename, EncodingType.UTF32BE, buf, 4, len);
    }

    /// Decodes the buffered bytes into the text buffer; returns the number of characters appended.
    /// Surrogate values and values above 0x10FFFF set invalidCharFlag; the error itself is
    /// raised by the next call.
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        uint chars = 0;
        dchar* text = reserveTextBuf(len / 4 + 1);
        uint i = 0;
        // "i + 3 < len" rather than "i < len - 3": len is unsigned, so len - 3 wraps around
        // to a huge value when fewer than 3 bytes are buffered and the loop would run
        // far beyond both buffers
        for (; i + 3 < len; i += 4) {
            uint ch0 = b[i];
            uint ch1 = b[i + 1];
            uint ch2 = b[i + 2];
            uint ch3 = b[i + 3];
            uint ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3;
            if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
/// Decoder for UTF-32 little endian encoded text
private class Utf32leLineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len) {
        // skip the four byte BOM
        super(stream, filename, EncodingType.UTF32LE, buf, 4, len);
    }

    /// Decodes the buffered bytes into the text buffer; returns the number of characters appended.
    /// Surrogate values and values above 0x10FFFF set invalidCharFlag; the error itself is
    /// raised by the next call.
    override uint decodeText() {
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        ubyte* b = _buf.ptr + _pos;
        uint chars = 0;
        dchar* text = reserveTextBuf(len / 4 + 1);
        uint i = 0;
        // "i + 3 < len" rather than "i < len - 3": len is unsigned, so len - 3 wraps around
        // to a huge value when fewer than 3 bytes are buffered and the loop would run
        // far beyond both buffers
        for (; i + 3 < len; i += 4) {
            // least significant byte comes first
            uint ch3 = b[i];
            uint ch2 = b[i + 1];
            uint ch1 = b[i + 2];
            uint ch0 = b[i + 3];
            uint ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3;
            if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
                invalidCharFlag = true;
                break;
            }
            text[chars++] = cast(dchar)ch;
        }
        consumedBytes(i);
        appendedText(chars);
        uint bleft = len - i;
        if (_streamEof && bleft > 0)
            invalidCharFlag = true; // incomplete character at end of stream
        return chars;
    }
}
unittest {
    // Manual smoke test: prints every line of a file. Disabled with static if (false),
    // because it depends on a file at a developer specific path.
    static if (false) {
        import std.stdio;
        import std.conv;
        import std.utf;
        // other inputs used during development:
        //string fname = "C:\\projects\\d\\ddc\\ddclexer\\src\\ddc\\lexer\\LineStream.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/" ~ __FILE__; //"/home/lve/src/d/ddc/ddclexer/src/ddc/lexer/Lexer.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf8.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16be.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16le.d";
        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32be.d";
        string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32le.d";
        writeln("opening file");
        std.stream.File f = new std.stream.File(fname);
        scope(exit) { f.close(); }
        try {
            LineStream lines = LineStream.create(f, fname);
            dchar[] s;
            // readLine returns null at end of file and after an error
            while ((s = lines.readLine()) !is null) {
                writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
            }
            if (lines.errorCode == 0) {
                writeln("EOF reached");
            } else {
                writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
            }
        } catch (Exception e) {
            writeln("Exception " ~ e.toString);
        }
    }
}
952 // LAST LINE