// ==++==
//
//   Copyright (c) Microsoft Corporation.  All rights reserved.
//
// ==--==
namespace System.Text {
    using System;
    using System.Diagnostics;
    using System.Globalization;

    // Encodes text into and out of UTF-8.  UTF-8 is a way of writing
    // Unicode characters with variable numbers of bytes per character,
    // optimized for the lower 127 ASCII characters.  It's an efficient way
    // of encoding US English in an internationalizable way.
    //
    // The UTF-8 byte order mark is simply the Unicode byte order mark
    // (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF).  The byte order mark is
    // used mostly to distinguish UTF-8 text from other encodings, and doesn't
    // switch the byte orderings.
    public class UTF8Encoding : Encoding {
        /*
            bytes   bits    UTF-8 representation
            -----   ----    -----------------------------------
            1        7      0vvvvvvv
            2       11      110vvvvv 10vvvvvv
            3       16      1110vvvv 10vvvvvv 10vvvvvv
            4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
            -----   ----    -----------------------------------

            Surrogate:
            Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
         */

        private const int UTF8_CODEPAGE = 65001;

        // Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into
        // the standard.
        private bool emitUTF8Identifier;

        // When true, malformed input (unpaired surrogates, invalid byte sequences)
        // raises ArgumentException instead of being encoded as-is / skipped.
        private bool isThrowException = false;

        // Creates an encoding that emits no UTF-8 identifier and does not throw
        // on invalid input.
        public UTF8Encoding() : this(false) {
        }

        // Creates an encoding that does not throw on invalid input.
        public UTF8Encoding(bool encoderShouldEmitUTF8Identifier) : this(encoderShouldEmitUTF8Identifier, false) {
        }

        // Creates an encoding.
        //   encoderShouldEmitUTF8Identifier - whether GetPreamble() returns the
        //                                     three-byte UTF-8 identifier.
        //   throwOnInvalidBytes             - whether invalid input raises
        //                                     ArgumentException.
        public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes) : base(UTF8_CODEPAGE) {
            this.emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
            this.isThrowException = throwOnInvalidBytes;
        }

        // Returns the number of bytes needed to encode chars[index .. index+count).
        public override int GetByteCount(char[] chars, int index, int count) {
            return (GetByteCount(chars, index, count, null));
        }

        // Validates the arguments, pins the array and forwards to the pointer
        // based worker.  'encoder' may be null; when supplied, its stored
        // surrogate / flush state is taken into account by the worker.
        //
        // Throws ArgumentNullException if chars is null and
        // ArgumentOutOfRangeException if index/count do not describe a valid
        // range of chars.
        internal unsafe int GetByteCount(char[] chars, int index, int count, UTF8Encoder encoder) {
            if (chars == null) {
                throw new ArgumentNullException("chars", "ArgumentNull_Array");
            }
            if (index < 0 || count < 0) {
                throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), "ArgumentOutOfRange_NeedNonNegNum");
            }
            if (chars.Length - index < count) {
                throw new ArgumentOutOfRangeException("chars", "ArgumentOutOfRange_IndexCountBuffer");
            }

            int retVal = -1;
            // NOTE(review): this early return happens before the worker is
            // reached, so a surrogate stored in 'encoder' is not accounted for
            // when an empty array is passed -- confirm this is intended.
            if (chars.Length == 0) {
                return 0;
            }
            fixed (char *p = chars) {
                retVal = GetByteCount(p, index, count, encoder);
            }
            Debug.Assert(retVal != -1, "[UTF8Encoding.GetByteCount]retVal!=-1");
            return retVal;
        }

        // Returns the number of bytes needed to encode the whole string.
        // Throws ArgumentNullException if chars is null.
        public unsafe override int GetByteCount(String chars) {
            if (chars == null)
                throw new ArgumentNullException("chars");

            int retVal = -1;
            fixed (char *p = chars) {
                retVal = GetByteCount(p, 0, chars.Length, null);
            }
            Debug.Assert(retVal != -1, "[UTF8Encoding.GetByteCount]retVal!=-1");
            return retVal;
        }

        // Worker that counts the bytes produced for chars[index .. index+count).
        // The caller is responsible for range validation.
        //
        // If 'encoder' has a stored high surrogate, counting starts as if that
        // surrogate had just been read.  A high surrogate left pending at the
        // end is counted (or rejected, in throwing mode) only when there is no
        // encoder or the encoder is flushing.
        //
        // Throws ArgumentException for unpaired surrogates in throwing mode and
        // ArgumentOutOfRangeException if the count overflows an int.
        internal unsafe int GetByteCount(char *chars, int index, int count, UTF8Encoder encoder) {
            Debug.Assert(chars != null, "[UTF8Encoding.GetByteCount]chars!=null");

            int end = index + count;
            int byteCount = 0;

            bool inSurrogate;
            if (encoder == null || !encoder.storedSurrogate) {
                inSurrogate = false;
            } else {
                inSurrogate = true;
            }

            // byteCount going negative means it wrapped; stop and report below.
            while (index < end && byteCount >= 0) {
                char ch = chars[index++];

                if (inSurrogate) {
                    //
                    // In previous char, we encounter a high surrogate, so we are expecting a low surrogate here.
                    //
                    if (CharacterInfo.IsLowSurrogate(ch)) {
                        inSurrogate = false;
                        //
                        // One surrogate pair will be translated into 4 bytes UTF8.
                        //
                        byteCount += 4;
                    } else if (CharacterInfo.IsHighSurrogate(ch)) {
                        // We have two high surrogates.
                        if (isThrowException) {
                            throw new ArgumentException("Argument_InvalidHighSurrogate");
                        }
                        // Encode the previous high-surrogate char.
                        byteCount += 3;
                        // inSurrogate is still true, because this could be the start of another valid surrogate pair.
                    } else {
                        if (isThrowException) {
                            throw new ArgumentException("Argument_InvalidHighSurrogate");
                        }
                        // Encode the previous high-surrogate char.
                        byteCount += 3;
                        // Not a surrogate. Put the char back so that we can restart the encoding.
                        inSurrogate = false;
                        index--;
                    }
                } else if (ch < 0x0080) {
                    byteCount++;
                } else if (ch < 0x0800) {
                    byteCount += 2;
                } else {
                    if (CharacterInfo.IsHighSurrogate(ch)) {
                        //
                        // Found the start of a surrogate.
                        //
                        inSurrogate = true;
                    } else if (CharacterInfo.IsLowSurrogate(ch) && isThrowException) {
                        //
                        // Found a low surrogate without encountering a high surrogate first.
                        //
                        throw new ArgumentException("Argument_InvalidLowSurrogate");
                    } else {
                        byteCount += 3;
                    }
                }
            }

            // Check for overflows.
            if (byteCount < 0)
                throw new ArgumentOutOfRangeException("count", "ArgumentOutOfRange_GetByteCountOverflow");

            if (inSurrogate) {
                if (encoder == null || encoder.mustFlush) {
                    if (isThrowException) {
                        throw new ArgumentException("Argument_InvalidHighSurrogate");
                    }
                    byteCount += 3;
                    // The trailing add can wrap as well; never hand back a negative count.
                    if (byteCount < 0)
                        throw new ArgumentOutOfRangeException("count", "ArgumentOutOfRange_GetByteCountOverflow");
                }
            }
            return byteCount;
        }

        // Encodes chars[charIndex .. charIndex+charCount) into bytes starting
        // at byteIndex and returns the number of bytes written.
        public override int GetBytes(char[] chars, int charIndex, int charCount,
                                     byte[] bytes, int byteIndex) {
            return GetBytes(chars, charIndex, charCount, bytes, byteIndex, null);
        }

        // Writes 'ch' as a three-byte sequence (1110vvvv 10vvvvvv 10vvvvvv) at
        // bytes[byteIndex] and advances byteIndex past it.
        private void EncodeThreeBytes(int ch, byte[] bytes, ref int byteIndex) {
            bytes[byteIndex++] = (byte)(0xE0 | ((ch >> 12) & 0x0F));
            bytes[byteIndex++] = (byte)(0x80 | ((ch >> 6) & 0x3F));
            bytes[byteIndex++] = (byte)(0x80 | (ch & 0x3F));
        }

        // Validates the arguments, pins the char array and forwards to the
        // pointer based worker.  'encoder' may be null.
        //
        // Throws ArgumentNullException if chars or bytes is null and
        // ArgumentOutOfRangeException for an invalid char range or byteIndex.
        private unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                    byte[] bytes, int byteIndex, UTF8Encoder encoder) {
            if (chars == null || bytes == null) {
                throw new ArgumentNullException((chars == null ? "chars" : "bytes"), "ArgumentNull_Array");
            }
            if (charIndex < 0 || charCount < 0) {
                throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), "ArgumentOutOfRange_NeedNonNegNum");
            }
            if (chars.Length - charIndex < charCount) {
                throw new ArgumentOutOfRangeException("chars", "ArgumentOutOfRange_IndexCountBuffer");
            }
            if (byteIndex < 0 || byteIndex > bytes.Length) {
                throw new ArgumentOutOfRangeException("byteIndex", "ArgumentOutOfRange_Index");
            }

            int retVal = -1;
            // NOTE(review): this early return happens before the worker is
            // reached, so a surrogate stored in 'encoder' is not flushed when an
            // empty array is passed -- confirm this is intended.
            if (chars.Length == 0) {
                return 0;
            }
            fixed (char *p = chars) {
                retVal = GetBytes(p, charIndex, charCount, bytes, byteIndex, encoder);
            }
            Debug.Assert(retVal != -1, "[UTF8Encoding.GetBytes]retVal!=-1");
            return retVal;
        }

        // Encodes the whole string into a newly allocated, exactly sized array.
        // Throws ArgumentNullException if s is null.
        public override byte[] GetBytes(String s) {
            if (s == null) {
                throw new ArgumentNullException("s", "ArgumentNull_String");
            }
            int byteLen = GetByteCount(s);
            byte[] bytes = new byte[byteLen];
            GetBytes(s, 0, s.Length, bytes, 0);
            return bytes;
        }

        // Encodes s[charIndex .. charIndex+charCount) into bytes starting at
        // byteIndex and returns the number of bytes written.
        //
        // Throws ArgumentNullException if s or bytes is null and
        // ArgumentOutOfRangeException for an invalid char range or byteIndex.
        public unsafe override int GetBytes(String s, int charIndex, int charCount,
                                            byte[] bytes, int byteIndex) {
            if (s == null || bytes == null) {
                throw new ArgumentNullException((s == null ? "s" : "bytes"), "ArgumentNull_String");
            }
            if (charIndex < 0 || charCount < 0) {
                throw new ArgumentOutOfRangeException((charIndex < 0 ? "charIndex" : "charCount"), "ArgumentOutOfRange_NeedNonNegNum");
            }
            if (s.Length - charIndex < charCount) {
                throw new ArgumentOutOfRangeException("s", "ArgumentOutOfRange_IndexCount");
            }
            if (byteIndex < 0 || byteIndex > bytes.Length) {
                throw new ArgumentOutOfRangeException("byteIndex", "ArgumentOutOfRange_Index");
            }

            int retVal = -1;
            fixed (char *p = s) {
                retVal = GetBytes(p, charIndex, charCount, bytes, byteIndex, null);
            }
            Debug.Assert(retVal != -1, "[UTF8Encoding.GetBytes]retVal!=-1");
            return retVal;
        }

        // Worker that encodes chars[charIndex .. charIndex+charCount) into
        // bytes starting at byteIndex.  The caller is responsible for range
        // validation of the char range and of byteIndex.
        //
        // A high surrogate stored in 'encoder' is consumed first.  A high
        // surrogate left pending at the end is stored back into the encoder
        // when it is not flushing; otherwise it is written as three bytes (or
        // rejected in throwing mode).
        //
        // Running off the end of 'bytes' surfaces as
        // ArgumentException("Argument_ConversionOverflow").
        private unsafe int GetBytes(char *chars, int charIndex, int charCount,
                                    byte[] bytes, int byteIndex, UTF8Encoder encoder) {
            Debug.Assert(chars != null, "[UTF8Encoding.GetBytes]chars!=null");

            int charEnd = charIndex + charCount;
            int byteStart = byteIndex;

            // Pending high surrogate, or -1 when there is none.
            int surrogateChar;
            if (encoder == null || !encoder.storedSurrogate) {
                surrogateChar = -1;
            } else {
                surrogateChar = encoder.surrogateChar;
                encoder.storedSurrogate = false;
            }

            try {
                while (charIndex < charEnd) {
                    char ch = chars[charIndex++];
                    //
                    // In previous char, we encounter a high surrogate, so we are expecting a low surrogate here.
                    //
                    if (surrogateChar > 0) {
                        if (CharacterInfo.IsLowSurrogate(ch)) {
                            // We have a complete surrogate pair.
                            surrogateChar = (surrogateChar - CharacterInfo.HIGH_SURROGATE_START) << 10;    // (ch - 0xd800) * 0x400
                            surrogateChar += (ch - CharacterInfo.LOW_SURROGATE_START);
                            surrogateChar += 0x10000;
                            bytes[byteIndex++] = (byte)(0xF0 | ((surrogateChar >> 18) & 0x07));
                            bytes[byteIndex++] = (byte)(0x80 | ((surrogateChar >> 12) & 0x3F));
                            bytes[byteIndex++] = (byte)(0x80 | ((surrogateChar >> 6) & 0x3F));
                            bytes[byteIndex++] = (byte)(0x80 | (surrogateChar & 0x3F));
                            surrogateChar = -1;
                        } else if (CharacterInfo.IsHighSurrogate(ch)) {
                            // We have two high surrogates.
                            if (isThrowException) {
                                throw new ArgumentException("Argument_InvalidHighSurrogate");
                            }
                            // Encode the previous high-surrogate char.
                            EncodeThreeBytes(surrogateChar, bytes, ref byteIndex);
                            // The new one may still start a valid pair.
                            surrogateChar = ch;
                        } else {
                            if (isThrowException) {
                                throw new ArgumentException("Argument_InvalidHighSurrogate");
                            }
                            // Encode the previous high-surrogate char.
                            EncodeThreeBytes(surrogateChar, bytes, ref byteIndex);
                            // Not a surrogate. Put the char back so that we can restart the encoding.
                            surrogateChar = -1;
                            charIndex--;
                        }
                    } else if (ch < 0x0080) {
                        bytes[byteIndex++] = (byte)ch;
                    } else if (ch < 0x0800) {
                        bytes[byteIndex++] = (byte)(0xC0 | ((ch >> 6) & 0x1F));
                        bytes[byteIndex++] = (byte)(0x80 | (ch & 0x3F));
                    } else if (CharacterInfo.IsHighSurrogate(ch)) {
                        //
                        // Found the start of a surrogate.
                        //
                        surrogateChar = ch;
                    } else if (CharacterInfo.IsLowSurrogate(ch) && isThrowException) {
                        throw new ArgumentException("Argument_InvalidLowSurrogate");
                    } else {
                        // We now know that the char is >= 0x0800 and isn't a high surrogate.
                        bytes[byteIndex++] = (byte)(0xE0 | ((ch >> 12) & 0x0F));
                        bytes[byteIndex++] = (byte)(0x80 | ((ch >> 6) & 0x3F));
                        bytes[byteIndex++] = (byte)(0x80 | (ch & 0x3F));
                    }
                }

                if (surrogateChar > 0) {
                    if (encoder != null && !encoder.mustFlush) {
                        // More input may follow: remember the high surrogate for the next call.
                        encoder.surrogateChar = surrogateChar;
                        encoder.storedSurrogate = true;
                    } else {
                        if (isThrowException) {
                            throw new ArgumentException("Argument_InvalidHighSurrogate");
                        }
                        EncodeThreeBytes(surrogateChar, bytes, ref byteIndex);
                    }
                }
            } catch (IndexOutOfRangeException) {
                throw new ArgumentException("Argument_ConversionOverflow");
            }

            return byteIndex - byteStart;
        }

        // Returns the number of chars produced by decoding
        // bytes[index .. index+count).
        public override int GetCharCount(byte[] bytes, int index, int count) {
            return GetCharCount(bytes, index, count, null);
        }

        // Counts the chars produced by decoding bytes[index .. index+count).
        // When 'decoder' is supplied, counting resumes from the partial
        // sequence state it holds; the decoder itself is not modified.
        //
        // Throws ArgumentNullException if bytes is null,
        // ArgumentOutOfRangeException for an invalid range, and (in throwing
        // mode) ArgumentException for an invalid byte sequence.
        internal virtual int GetCharCount(byte[] bytes, int index, int count, UTF8Decoder decoder) {
            if (bytes == null) {
                throw new ArgumentNullException("bytes", "ArgumentNull_Array");
            }
            if (index < 0 || count < 0) {
                throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), "ArgumentOutOfRange_NeedNonNegNum");
            }
            if (bytes.Length - index < count) {
                throw new ArgumentOutOfRangeException("bytes", "ArgumentOutOfRange_IndexCountBuffer");
            }

            int charCount = 0;
            // Trail bytes still expected for the current sequence.  A negative
            // value marks a sequence already found invalid whose remaining
            // trail bytes are being skipped.
            int trailCount = 0;
            // Indicate the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
            // This is used to detect non-shortest form.
            // It will be reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
            // we don't check for non-shortest form again.
            int byteSequence = 0;
            bool isSurrogate = false;
            int bits = 0;
            if (decoder != null) {
                trailCount = decoder.trailCount;
                isSurrogate = decoder.isSurrogate;
                byteSequence = decoder.byteSequence;
                bits = decoder.bits;
            }

            int end = index + count;
            while (index < end) {
                byte b = bytes[index++];
                if (trailCount == 0) {
                    if ((b & 0x80) == 0) {
                        // This is an ASCII.
                        charCount++;
                    } else {
                        // Count the leading 1 bits; 'temp' ends up holding the
                        // payload bits shifted to the top of the byte.
                        byte temp = b;
                        while ((temp & 0x80) != 0) {
                            temp <<= 1;
                            trailCount++;
                        }
                        switch (trailCount) {
                            case 1:
                                // 10vvvvvv is a trail byte, not a lead byte.
                                trailCount = 0;
                                break;
                            case 2:
                                // Make sure that bit 8 ~ bit 11 is not all zero.
                                // 110XXXXx 10xxxxxx
                                if ((b & 0x1e) == 0) {
                                    trailCount = 0;
                                }
                                break;
                            case 3:
                                byteSequence = 3;
                                break;
                            case 4:
                                byteSequence = 4;
                                break;
                            default:
                                trailCount = 0;
                                break;
                        }
                        if (trailCount == 0) {
                            if (isThrowException) {
                                throw new ArgumentException("Argument_InvalidByteSequence");
                            }
                        } else {
                            // Derive the flag from this lead byte (as GetChars does) so a
                            // value left over in the decoder from an earlier 4-byte
                            // sequence cannot make a shorter sequence count twice.
                            isSurrogate = (trailCount == 4);
                            bits = temp >> trailCount;
                            trailCount--;
                        }
                    }
                } else {
                    // We are expecting to see trailing bytes like 10vvvvvv
                    if ((b & 0xC0) != 0x80) {
                        // If not, this is NOT a valid sequence.
                        if (isThrowException) {
                            throw new ArgumentException("Argument_InvalidByteSequence");
                        }
                        // Put the byte back; it may start a new sequence.
                        index--;
                        trailCount = 0;
                        isSurrogate = false;
                        // Forget the abandoned sequence's length so its shortest-form
                        // check is not applied to the next sequence.
                        byteSequence = 0;
                    } else {
                        switch (byteSequence) {
                            case 3:
                                // Check 3-byte sequence for non-shortest form.
                                // 1110XXXX 10Xxxxxx 10xxxxxx
                                if (bits == 0 && (b & 0x20) == 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    trailCount = -1;
                                }
                                // We are done checking the non-shortest form, reset byteSequence to 0, so that we don't
                                // do the extra check for the remaining byte of the 3-byte chunk.
                                byteSequence = 0;
                                break;
                            case 4:
                                // Check 4-byte sequence for non-shortest form.
                                // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                                if (bits == 0) {
                                    if ((b & 0x30) == 0) {
                                        if (isThrowException) {
                                            throw new ArgumentException("Argument_InvalidByteSequence");
                                        }
                                        trailCount = -1;
                                    }
                                } else if ((bits & 0x04) != 0) {
                                    // Make sure that the resulting Unicode is within the valid surrogate range.
                                    // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                    // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                    // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                    // bit are all zero.
                                    // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                    // XXXXX can only be 10000.
                                    if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                        if (isThrowException) {
                                            throw new ArgumentException("Argument_InvalidByteSequence");
                                        }
                                        trailCount = -1;
                                    }
                                }
                                byteSequence = 0;
                                break;
                        }
                        if (--trailCount == 0) {
                            charCount++;
                            if (isSurrogate) {
                                // A 4-byte sequence decodes to a surrogate pair (two chars).
                                charCount++;
                                isSurrogate = false;
                            }
                        }
                    }
                }
            }

            return charCount;
        }

        // Decodes bytes[byteIndex .. byteIndex+byteCount) into chars starting
        // at charIndex and returns the number of chars written.
        public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                     char[] chars, int charIndex) {
            return GetChars(bytes, byteIndex, byteCount, chars, charIndex, null);
        }

        // Decodes bytes[byteIndex .. byteIndex+byteCount) into chars starting
        // at charIndex.  When 'decoder' is supplied, decoding resumes from the
        // partial sequence state it holds and the final state is stored back.
        //
        // Throws ArgumentNullException if bytes or chars is null,
        // ArgumentOutOfRangeException for an invalid byte range or charIndex,
        // ArgumentException("Argument_ConversionOverflow") when 'chars' is too
        // small, and (in throwing mode) ArgumentException for an invalid byte
        // sequence.
        internal virtual int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                      char[] chars, int charIndex, UTF8Decoder decoder) {
            if (bytes == null || chars == null) {
                throw new ArgumentNullException(bytes == null ? "bytes" : "chars", "ArgumentNull_Array");
            }
            if (byteIndex < 0 || byteCount < 0) {
                throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), "ArgumentOutOfRange_NeedNonNegNum");
            }
            if (bytes.Length - byteIndex < byteCount) {
                throw new ArgumentOutOfRangeException("bytes", "ArgumentOutOfRange_IndexCountBuffer");
            }
            if (charIndex < 0 || charIndex > chars.Length) {
                throw new ArgumentOutOfRangeException("charIndex", "ArgumentOutOfRange_Index");
            }

            // Payload bits accumulated so far for the current sequence.
            int bits = 0;
            // Trail bytes still expected for the current sequence.  A negative
            // value marks a sequence already found invalid whose remaining
            // trail bytes are being skipped.
            int trailCount = 0;
            bool isSurrogate = false;
            // Indicate the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
            // This is used to detect non-shortest form.
            // It will be reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
            // we don't check for non-shortest form again.
            int byteSequence = 0;
            if (decoder != null) {
                bits = decoder.bits;
                trailCount = decoder.trailCount;
                isSurrogate = decoder.isSurrogate;
                byteSequence = decoder.byteSequence;
            }

            int byteEnd = byteIndex + byteCount;
            int charStart = charIndex;
            try {
                while (byteIndex < byteEnd) {
                    byte b = bytes[byteIndex++];
                    if (trailCount == 0) {
                        //
                        // We are not at a trailing byte.
                        //
                        if ((b & 0x80) == 0) {
                            // This is the ASCII case.
                            //    1        7      0vvvvvvv
                            chars[charIndex++] = (char)b;
                        } else {
                            // Check if this is a valid starting byte: count the leading
                            // 1 bits; 'temp' ends up holding the payload bits shifted to
                            // the top of the byte.
                            byte temp = (byte)b;
                            while ((temp & 0x80) != 0) {
                                temp <<= 1;
                                trailCount++;
                            }
                            switch (trailCount) {
                                case 1:
                                    // 10vvvvvv is a trail byte, not a lead byte.
                                    trailCount = 0;
                                    break;
                                case 2:
                                    // Make sure that bit 8 ~ bit 11 is not all zero.
                                    // 110XXXXx 10xxxxxx
                                    if ((b & 0x1e) == 0) {
                                        trailCount = 0;
                                    }
                                    break;
                                case 3:
                                    byteSequence = 3;
                                    break;
                                case 4:
                                    //
                                    // This is a surrogate unicode pair
                                    //
                                    byteSequence = 4;
                                    break;
                                default:
                                    trailCount = 0;
                                    break;
                            }
                            if (trailCount == 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                            } else {
                                isSurrogate = (trailCount == 4);
                                bits = temp >> trailCount;
                                trailCount--;
                            }
                        }
                    } else {
                        // We are expecting to see bytes like 10vvvvvv
                        if ((b & 0xC0) != 0x80) {
                            // If not, this is NOT a valid sequence.
                            if (isThrowException) {
                                throw new ArgumentException("Argument_InvalidByteSequence");
                            }
                            // At this point, we are seeing an invalid trailing byte.
                            // However, this can be a valid starting byte for another UTF8 byte sequence (e.g.
                            // this character could be under 0x7f, or a valid leading byte like 110xxxxx).
                            // So let's put the current byte back, and try to see if this is a valid byte
                            // for another UTF8 byte sequence.
                            byteIndex--;
                            bits = 0;
                            trailCount = 0;
                            // Forget the abandoned sequence's length so its shortest-form
                            // check is not applied to the next sequence.
                            byteSequence = 0;
                        } else {
                            switch (byteSequence) {
                                case 3:
                                    // Check 3-byte sequence for non-shortest form.
                                    // 1110XXXX 10Xxxxxx 10xxxxxx
                                    if (bits == 0 && (b & 0x20) == 0) {
                                        if (isThrowException) {
                                            throw new ArgumentException("Argument_InvalidByteSequence");
                                        }
                                        trailCount = -1;
                                    }
                                    // Reset byteSequence to zero since we are done with non-shortest form check.
                                    byteSequence = 0;
                                    break;
                                case 4:
                                    // Check 4-byte sequence for non-shortest form.
                                    // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                                    if (bits == 0) {
                                        if ((b & 0x30) == 0) {
                                            if (isThrowException) {
                                                throw new ArgumentException("Argument_InvalidByteSequence");
                                            }
                                            trailCount = -1;
                                        }
                                    } else if ((bits & 0x04) != 0) {
                                        // Make sure that the resulting Unicode is within the valid surrogate range.
                                        // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                        // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                        // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                        // bit are all zero.
                                        // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                        // XXXXX can only be 10000.
                                        if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                            if (isThrowException) {
                                                throw new ArgumentException("Argument_InvalidByteSequence");
                                            }
                                            trailCount = -1;
                                        }
                                    }
                                    byteSequence = 0;
                                    break;
                            }
                            if (--trailCount >= 0) {
                                bits = bits << 6 | (b & 0x3F);
                                if (trailCount == 0) {
                                    if (!isSurrogate) {
                                        chars[charIndex++] = (char)bits;
                                    } else {
                                        //
                                        // bits >= 0x10000, use surrogate.
                                        // 0xD7C0 is 0xD800 - (0x10000 >> 10).
                                        //
                                        chars[charIndex++] = (char)(0xD7C0 + (bits >> 10));
                                        chars[charIndex++] = (char)(CharacterInfo.LOW_SURROGATE_START + (bits & 0x3FF));
                                    }
                                }
                            }
                        }
                    }
                }
            } catch (IndexOutOfRangeException) {
                throw new ArgumentException("Argument_ConversionOverflow");
            }

            if (decoder != null) {
                decoder.bits = bits;
                decoder.trailCount = trailCount;
                decoder.isSurrogate = isSurrogate;
                decoder.byteSequence = byteSequence;
            }

            return charIndex - charStart;
        }

        // Returns a decoder that carries partial-sequence state across calls.
        public override Decoder GetDecoder() {
            return new UTF8Decoder(this);
        }

        // Returns an encoder that carries a pending high surrogate across calls.
        public override Encoder GetEncoder() {
            return new UTF8Encoder(this);
        }

        // Returns an upper bound (charCount * 4) on the bytes needed to encode
        // charCount chars.  Throws ArgumentOutOfRangeException if charCount is
        // negative or the bound does not fit in an int.
        public override int GetMaxByteCount(int charCount) {
            if (charCount < 0) {
                throw new ArgumentOutOfRangeException("charCount", "ArgumentOutOfRange_NeedNonNegNum");
            }
            long byteCount = (long)charCount * 4;
            if (byteCount > 0x7fffffff)
                throw new ArgumentOutOfRangeException("charCount", "ArgumentOutOfRange_GetByteCountOverflow");
            return (int)byteCount;
        }

        // Returns an upper bound (byteCount) on the chars produced by decoding
        // byteCount bytes.  Throws ArgumentOutOfRangeException if byteCount is
        // negative.
        public override int GetMaxCharCount(int byteCount) {
            if (byteCount < 0) {
                throw new ArgumentOutOfRangeException("byteCount", "ArgumentOutOfRange_NeedNonNegNum");
            }
            return byteCount;
        }

        // Returns the UTF-8 identifier (0xEF 0xBB 0xBF) when this encoding was
        // created to emit it, otherwise the shared empty array.
        public override byte[] GetPreamble() {
            if (emitUTF8Identifier) {
                // Allocate new array to prevent users from modifying it.
                return new byte[3] { 0xEF, 0xBB, 0xBF };
            } else {
                return Encoding.emptyByteArray;
            }
        }

        // Two UTF8Encoding instances are equal when they were configured with
        // the same options.  Both flags are compared because both feed
        // GetHashCode(); comparing only one would let equal objects hash
        // differently.
        public override bool Equals(Object value) {
            UTF8Encoding that = value as UTF8Encoding;
            if (that != null) {
                return (emitUTF8Identifier == that.emitUTF8Identifier) &&
                       (isThrowException == that.isThrowException);
            }
            return (false);
        }

        public override int GetHashCode() {
            // Not great distribution, but this is relatively unlikely to be used as the key in a hashtable.
            return UTF8_CODEPAGE + (isThrowException ? 1 : 0) + (emitUTF8Identifier ? 1 : 0);
        }

        // Stateful encoder: forwards to the owning UTF8Encoding, passing itself
        // so that a high surrogate at the end of one block can be paired with a
        // low surrogate at the start of the next.
        internal class UTF8Encoder : Encoder {
            private UTF8Encoding encoding;

            // We must save a high surrogate value until the next call, looking
            // for a low surrogate value.  surrogateChar holds that pending high
            // surrogate and is only meaningful while storedSurrogate is true.
            internal int surrogateChar;
            internal bool storedSurrogate;

            // When true, a dangling high surrogate at the end of the char[] is
            // dealt with immediately (encoded, or rejected in throwing mode)
            // instead of being stored for the next call.  Set from the 'flush'
            // argument of each call.
            internal bool mustFlush;

            public UTF8Encoder(UTF8Encoding encoding) {
                this.encoding = encoding;
                surrogateChar = 0;
                storedSurrogate = false;
            }

            public override int GetByteCount(char[] chars, int index, int count, bool flush) {
                mustFlush = flush;
                return encoding.GetByteCount(chars, index, count, this);
            }

            public override int GetBytes(char[] chars, int charIndex, int charCount,
                                         byte[] bytes, int byteIndex, bool flush) {
                mustFlush = flush;
                return encoding.GetBytes(chars, charIndex, charCount, bytes, byteIndex, this);
            }
        }

        // Stateful decoder: forwards to the owning UTF8Encoding, passing itself
        // so that a multi-byte sequence split across blocks can be completed.
        internal class UTF8Decoder : Decoder {
            private UTF8Encoding encoding;

            // Payload bits accumulated for the sequence in progress.
            internal int bits;
            // Trail bytes still expected for the sequence in progress.
            internal int trailCount;

            // We need to maintain the status that if we are decoding a surrogate (which has 4-byte UTF8), so
            // that GetCharCount() can generate correct char count.
            // The flag is needed because GetCharCount(), unlike GetChars(), does not really calculate the bits, so it has no way
            // to know if the decoder bytes is a surrogate or not.
            internal bool isSurrogate;

            // 3 or 4 while the shortest-form check for a 3-/4-byte sequence is
            // still pending on its second byte; otherwise 0.
            internal int byteSequence;

            public UTF8Decoder(UTF8Encoding encoding) {
                this.encoding = encoding;
            }

            public override int GetCharCount(byte[] bytes, int index, int count) {
                return encoding.GetCharCount(bytes, index, count, this);
            }

            public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                         char[] chars, int charIndex) {
                return encoding.GetChars(bytes, byteIndex, byteCount, chars, charIndex, this);
            }
        }
    }
}