827 lines
38 KiB
C#
827 lines
38 KiB
C#
// ==++==
|
|
//
|
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
//
|
|
// ==--==
|
|
namespace System.Text
|
|
{
|
|
|
|
using System;
|
|
using System.Diagnostics;
|
|
using System.Globalization;
|
|
// Encodes text into and out of UTF-8. UTF-8 is a way of writing
|
|
// Unicode characters with variable numbers of bytes per character,
|
|
// optimized for the 128 ASCII characters (0x00-0x7F). It's an efficient way
|
|
// of encoding US English in an internationalizable way.
|
|
//
|
|
// The UTF-8 byte order mark is simply the Unicode byte order mark
|
|
// (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is
|
|
// used mostly to distinguish UTF-8 text from other encodings, and doesn't
|
|
// switch the byte orderings.
|
|
//| <include path='docs/doc[@for="UTF8Encoding"]/*' />
|
|
public class UTF8Encoding : Encoding {
|
|
//
|
|
// bytes bits UTF-8 representation
|
|
// ----- ---- -----------------------------------
|
|
// 1 7 0vvvvvvv
|
|
// 2 11 110vvvvv 10vvvvvv
|
|
// 3 16 1110vvvv 10vvvvvv 10vvvvvv
|
|
// 4 21 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
|
|
// ----- ---- -----------------------------------
|
|
//
|
|
// Surrogate:
|
|
// Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
|
|
//
|
|
|
|
// Code page number handed to the base Encoding constructor and used as the
// seed of GetHashCode().
private const int UTF8_CODEPAGE = 65001;

// When true, GetPreamble() returns U+FEFF written in UTF-8 (EF BB BF);
// when false it returns an empty array.
private bool emitUTF8Identifier;

// When true, malformed input (unpaired surrogates, invalid byte sequences)
// raises ArgumentException instead of being tolerated.
private bool isThrowException = false;
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.UTF8Encoding"]/*' />
|
|
// Default instance: no UTF-8 identifier in the preamble.
public UTF8Encoding()
    : this(false)
{
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.UTF8Encoding1"]/*' />
|
|
// Chooses whether GetPreamble() yields the UTF-8 identifier; invalid input
// does not throw for instances built through this overload.
public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
    : this(encoderShouldEmitUTF8Identifier, false)
{
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.UTF8Encoding2"]/*' />
|
|
// Full constructor.
//   encoderShouldEmitUTF8Identifier - GetPreamble() returns EF BB BF when true.
//   throwOnInvalidBytes             - malformed input raises ArgumentException when true.
public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
    : base(UTF8_CODEPAGE)
{
    isThrowException = throwOnInvalidBytes;
    emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetByteCount"]/*' />
|
|
// Number of bytes needed to encode count chars starting at chars[index].
// Runs the encoder-aware overload with no carried-over encoder state.
public override int GetByteCount(char[] chars, int index, int count) {
    int needed = GetByteCount(chars, index, count, null);
    return needed;
}
|
|
|
|
// Validates the arguments, pins the array and counts the bytes needed to
// encode chars[index .. index+count).
//
//   encoder - optional encoder whose stored high surrogate and mustFlush
//             flag take part in the count; null for stateless counting.
//
// Throws ArgumentNullException when chars is null and
// ArgumentOutOfRangeException when index/count are negative or exceed the array.
internal unsafe int GetByteCount(char[] chars, int index, int count, UTF8Encoder encoder) {
    if (chars == null) {
        throw new ArgumentNullException("chars",
                                        "ArgumentNull_Array");
    }
    if (index < 0 || count < 0) {
        throw new ArgumentOutOfRangeException((index<0 ? "index" : "count"),
                                              "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (chars.Length - index < count) {
        throw new ArgumentOutOfRangeException("chars",
                                              "ArgumentOutOfRange_IndexCountBuffer");
    }

    char[] source = chars;
    if (source.Length == 0) {
        // Nothing to count unless the encoder still holds a high surrogate
        // from an earlier call; that one must be accounted for on flush.
        if (encoder == null || !encoder.storedSurrogate) {
            return 0;
        }
        // Fixing a zero-length array yields a null pointer, so pin a
        // one-element stand-in. index and count are both 0 here (enforced
        // by the range checks above), so the stand-in is never read.
        source = new char[1];
    }

    int retVal = -1;
    fixed (char *p = source) {
        retVal = GetByteCount(p, index, count, encoder);
    }
    Debug.Assert(retVal!=-1, "[UTF8Encoding.GetByteCount]retVal!=-1");
    return retVal;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetByteCount1"]/*' />
|
|
// Number of bytes needed to encode the whole string.
// Throws ArgumentNullException when chars is null.
public unsafe override int GetByteCount(String chars) {
    if (chars == null) {
        throw new ArgumentNullException("chars");
    }

    int total = -1;
    fixed (char *text = chars) {
        total = GetByteCount(text, 0, chars.Length, null);
    }

    Debug.Assert(total!=-1, "[UTF8Encoding.GetByteCount]retVal!=-1");
    return total;
}
|
|
|
|
// Core byte counter working on a raw char pointer.
//
//   chars   - pinned character data (must not be null).
//   index   - first char to examine.
//   count   - number of chars to examine.
//   encoder - optional encoder; if it holds a stored surrogate the scan
//             starts in the "expecting a low surrogate" state, and its
//             mustFlush flag decides whether a trailing high surrogate
//             is counted.
//
// Throws ArgumentException for unpaired surrogates when isThrowException is
// set, and ArgumentOutOfRangeException when the running total goes negative.
internal unsafe int GetByteCount(char *chars, int index, int count, UTF8Encoder encoder) {

    Debug.Assert(chars!=null, "[UTF8Encoding.GetByteCount]chars!=null");

    int limit = index + count;
    int total = 0;

    // True while a high surrogate has been seen and its partner is awaited.
    bool pendingHigh = (encoder != null && encoder.storedSurrogate);

    while (index < limit && total >= 0) {
        char ch = chars[index++];

        if (pendingHigh) {
            if (CharacterInfo.IsLowSurrogate(ch)) {
                // Completed pair: one 4-byte UTF-8 sequence.
                pendingHigh = false;
                total += 4;
                continue;
            }

            // The pending high surrogate has no partner.
            if (isThrowException) {
                throw new ArgumentException("Argument_InvalidHighSurrogate");
            }
            // Count the lone high surrogate as a 3-byte sequence.
            total += 3;
            if (!CharacterInfo.IsHighSurrogate(ch)) {
                // Ordinary char: step back so it is counted on the next pass.
                pendingHigh = false;
                index--;
            }
            // Otherwise ch is itself a high surrogate and stays pending.
            continue;
        }

        if (ch < 0x0080) {
            total++;
        }
        else if (ch < 0x0800) {
            total += 2;
        }
        else if (CharacterInfo.IsHighSurrogate(ch)) {
            // Start of a potential surrogate pair; counted once resolved.
            pendingHigh = true;
        }
        else if (CharacterInfo.IsLowSurrogate(ch) && isThrowException) {
            // Low surrogate with no preceding high surrogate.
            throw new ArgumentException("Argument_InvalidLowSurrogate");
        }
        else {
            total += 3;
        }
    }

    // A negative total means the running sum wrapped around.
    if (total < 0)
        throw new ArgumentOutOfRangeException("count", "ArgumentOutOfRange_GetByteCountOverflow");

    // A high surrogate left over at the end is only counted when there is
    // no encoder to carry it forward, or the encoder is flushing.
    if (pendingHigh && (encoder == null || encoder.mustFlush)) {
        if (isThrowException) {
            throw new ArgumentException("Argument_InvalidHighSurrogate");
        }
        total += 3;
    }
    return total;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetBytes"]/*' />
|
|
// Encodes charCount chars from chars[charIndex] into bytes starting at
// bytes[byteIndex] and returns the number of bytes written.
// Runs the encoder-aware overload with no carried-over encoder state.
public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex) {
    int written = GetBytes(chars, charIndex, charCount, bytes, byteIndex, null);
    return written;
}
|
|
|
|
// Writes ch as a 3-byte UTF-8 sequence (1110vvvv 10vvvvvv 10vvvvvv) at
// bytes[byteIndex], advancing byteIndex past each byte written.
private void EncodeThreeBytes(int ch, byte[] bytes, ref int byteIndex) {
    byte lead   = (byte)(0xE0 | ((ch >> 12) & 0x0F));
    byte middle = (byte)(0x80 | ((ch >> 6) & 0x3F));
    byte last   = (byte)(0x80 | (ch & 0x3F));

    bytes[byteIndex++] = lead;
    bytes[byteIndex++] = middle;
    bytes[byteIndex++] = last;
}
|
|
|
|
// Validates the arguments, pins the char array and encodes
// chars[charIndex .. charIndex+charCount) into bytes at byteIndex.
// Returns the number of bytes written.
//
//   encoder - optional encoder whose stored high surrogate and mustFlush
//             flag take part in the conversion; null for stateless encoding.
//
// Throws ArgumentNullException when chars or bytes is null and
// ArgumentOutOfRangeException when an index or count is out of range.
private unsafe int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, UTF8Encoder encoder) {
    if (chars == null || bytes == null) {
        throw new ArgumentNullException((chars == null ? "chars" : "bytes"),
                                        "ArgumentNull_Array");
    }
    if (charIndex < 0 || charCount < 0) {
        throw new ArgumentOutOfRangeException((charIndex<0 ? "charIndex" : "charCount"),
                                              "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (chars.Length - charIndex < charCount) {
        throw new ArgumentOutOfRangeException("chars",
                                              "ArgumentOutOfRange_IndexCountBuffer");
    }
    if (byteIndex < 0 || byteIndex > bytes.Length) {
        throw new ArgumentOutOfRangeException("byteIndex",
                                              "ArgumentOutOfRange_Index");
    }

    char[] source = chars;
    if (source.Length == 0) {
        // Nothing to encode unless the encoder still holds a high surrogate
        // from an earlier call; that one must still be flushed or re-stored.
        if (encoder == null || !encoder.storedSurrogate) {
            return 0;
        }
        // Fixing a zero-length array yields a null pointer, so pin a
        // one-element stand-in. charIndex and charCount are both 0 here
        // (enforced by the range checks above), so it is never read.
        source = new char[1];
    }

    int retVal = -1;
    fixed (char *p = source) {
        retVal = GetBytes(p, charIndex, charCount, bytes, byteIndex, encoder);
    }
    Debug.Assert(retVal!=-1, "[UTF8Encoding.GetBytes]retVal!=-1");
    return retVal;
}
|
|
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetBytes2"]/*' />
|
|
// Encodes the whole string into a newly allocated, exactly sized byte array.
// Throws ArgumentNullException when s is null.
public override byte[] GetBytes(String s) {
    if (s == null) {
        throw new ArgumentNullException("s",
                                        "ArgumentNull_String");
    }

    // First pass sizes the buffer, second pass fills it.
    byte[] encoded = new byte[GetByteCount(s)];
    GetBytes(s, 0, s.Length, encoded, 0);
    return encoded;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetBytes1"]/*' />
|
|
// Encodes charCount chars of s starting at charIndex into bytes at byteIndex
// and returns the number of bytes written.
//
// Throws ArgumentNullException when s or bytes is null and
// ArgumentOutOfRangeException when an index or count is out of range.
public unsafe override int GetBytes(String s, int charIndex, int charCount, byte[] bytes, int byteIndex) {
    if (s == null || bytes == null) {
        throw new ArgumentNullException((s == null ? "s" : "bytes"),
                                        "ArgumentNull_String");
    }

    if (charIndex < 0 || charCount < 0) {
        throw new ArgumentOutOfRangeException((charIndex<0 ? "charIndex" : "charCount"),
                                              "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (s.Length - charIndex < charCount) {
        throw new ArgumentOutOfRangeException("s",
                                              "ArgumentOutOfRange_IndexCount");
    }
    if (byteIndex < 0 || byteIndex > bytes.Length) {
        throw new ArgumentOutOfRangeException("byteIndex",
                                              "ArgumentOutOfRange_Index");
    }

    int written = -1;
    fixed (char *text = s) {
        // No encoder: string conversions carry no state between calls.
        written = GetBytes(text, charIndex, charCount, bytes, byteIndex, null);
    }
    Debug.Assert(written!=-1, "[UTF8Encoding.GetByteCount]retVal!=-1");
    return written;
}
|
|
|
|
// Core encoder working on a raw char pointer. Writes the UTF-8 form of
// chars[charIndex .. charIndex+charCount) into bytes at byteIndex and
// returns the number of bytes written.
//
//   encoder - optional encoder. A high surrogate it stored on a previous
//             call is picked up first; a high surrogate left dangling at
//             the end is stored back unless mustFlush is set.
//
// Throws ArgumentException for unpaired surrogates when isThrowException is
// set, and ArgumentException("Argument_ConversionOverflow") when bytes is
// too small (an IndexOutOfRangeException from the writes is translated).
private unsafe int GetBytes(char *chars, int charIndex, int charCount, byte[] bytes, int byteIndex, UTF8Encoder encoder) {
    Debug.Assert(chars!=null, "[UTF8Encoding.GetBytes]chars!=null");

    int charLimit = charIndex + charCount;
    int firstByte = byteIndex;

    // High surrogate awaiting its partner, or -1 when none is pending.
    int pendingHigh = -1;
    if (encoder != null && encoder.storedSurrogate) {
        pendingHigh = encoder.surrogateChar;
        encoder.storedSurrogate = false;
    }

    try {
        while (charIndex < charLimit) {
            char ch = chars[charIndex++];

            if (pendingHigh > 0) {
                // The previous char was a high surrogate; a low one is expected.
                if (CharacterInfo.IsLowSurrogate(ch)) {
                    // Complete pair: (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000.
                    int scalar = (pendingHigh - CharacterInfo.HIGH_SURROGATE_START) << 10;
                    scalar += (ch - CharacterInfo.LOW_SURROGATE_START);
                    scalar += 0x10000;
                    bytes[byteIndex++] = (byte)(0xF0 | ((scalar >> 18) & 0x07));
                    bytes[byteIndex++] = (byte)(0x80 | ((scalar >> 12) & 0x3F));
                    bytes[byteIndex++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
                    bytes[byteIndex++] = (byte)(0x80 | (scalar & 0x3F));
                    pendingHigh = -1;
                    continue;
                }

                // The pending high surrogate has no partner.
                if (isThrowException) {
                    throw new ArgumentException("Argument_InvalidHighSurrogate");
                }
                // Emit the lone high surrogate as a 3-byte sequence.
                EncodeThreeBytes(pendingHigh, bytes, ref byteIndex);
                if (CharacterInfo.IsHighSurrogate(ch)) {
                    // Two high surrogates in a row: the new one becomes pending.
                    pendingHigh = ch;
                }
                else {
                    // Ordinary char: step back so it is encoded on the next pass.
                    pendingHigh = -1;
                    charIndex--;
                }
                continue;
            }

            if (ch < 0x0080) {
                bytes[byteIndex++] = (byte)ch;
            }
            else if (ch < 0x0800) {
                bytes[byteIndex++] = (byte)(0xC0 | ((ch >> 6) & 0x1F));
                bytes[byteIndex++] = (byte)(0x80 | (ch & 0x3F));
            }
            else if (CharacterInfo.IsHighSurrogate(ch)) {
                // Start of a potential surrogate pair.
                pendingHigh = ch;
            }
            else if (CharacterInfo.IsLowSurrogate(ch) && isThrowException) {
                throw new ArgumentException("Argument_InvalidLowSurrogate");
            }
            else {
                // ch >= 0x0800 and not a high surrogate: 3-byte sequence.
                EncodeThreeBytes(ch, bytes, ref byteIndex);
            }
        }

        if (pendingHigh > 0) {
            if (encoder != null && !encoder.mustFlush) {
                // Keep the dangling high surrogate for the next call.
                encoder.surrogateChar = pendingHigh;
                encoder.storedSurrogate = true;
            }
            else {
                if (isThrowException) {
                    throw new ArgumentException("Argument_InvalidHighSurrogate");
                }
                EncodeThreeBytes(pendingHigh, bytes, ref byteIndex);
            }
        }
    }
    catch (IndexOutOfRangeException) {
        throw new ArgumentException("Argument_ConversionOverflow");
    }

    return byteIndex - firstByte;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetCharCount"]/*' />
|
|
// Number of chars produced by decoding count bytes starting at bytes[index].
// Runs the decoder-aware overload with no carried-over decoder state.
public override int GetCharCount(byte[] bytes, int index, int count) {
    int produced = GetCharCount(bytes, index, count, null);
    return produced;
}
|
|
|
|
// Counts the chars that decoding bytes[index .. index+count) produces.
//
//   decoder - optional decoder whose saved state (bits, trailCount,
//             isSurrogate, byteSequence) seeds the scan. The state is only
//             read here, never written back.
//
// Throws ArgumentNullException / ArgumentOutOfRangeException for bad
// arguments, and ArgumentException for invalid byte sequences when
// isThrowException is set.
internal virtual int GetCharCount(byte[] bytes, int index, int count, UTF8Decoder decoder) {
    if (bytes == null) {
        throw new ArgumentNullException("bytes",
                                        "ArgumentNull_Array");
    }
    if (index < 0 || count < 0) {
        throw new ArgumentOutOfRangeException((index<0 ? "index" : "count"),
                                              "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (bytes.Length - index < count) {
        throw new ArgumentOutOfRangeException("bytes",
                                              "ArgumentOutOfRange_IndexCountBuffer");
    }
    int charCount = 0;
    // Trail bytes still expected for the current sequence. A negative value
    // marks a rejected sequence whose remaining trail bytes are skipped.
    int trailCount = 0;

    // Length (3 or 4) of the sequence whose lead byte was just read; used
    // for the non-shortest-form check on the 2nd byte and then reset to 0
    // so the check is not repeated.
    int byteSequence = 0;
    bool isSurrogate = false;
    int bits = 0;
    if (decoder != null) {
        trailCount = decoder.trailCount;
        isSurrogate = decoder.isSurrogate;
        byteSequence = decoder.byteSequence;
        bits = decoder.bits;
    }

    int end = index + count;
    while (index < end) {
        byte b = bytes[index++];
        if (trailCount == 0) {
            if ((b & 0x80) == 0) {
                // ASCII.
                charCount++;
            }
            else {
                // Count the leading 1 bits to get the sequence length.
                byte temp = b;
                while ((temp & 0x80) != 0) {
                    temp <<= 1;
                    trailCount++;
                }
                switch (trailCount) {
                    case 1:
                        // 10vvvvvv: a trail byte where a lead byte is expected.
                        trailCount = 0;
                        break;
                    case 2:
                        // 110XXXXx 10xxxxxx: XXXX must not be all zero.
                        if ((b & 0x1e) == 0) {
                            trailCount = 0;
                        }
                        break;
                    case 3:
                        byteSequence = 3;
                        break;
                    case 4:
                        // A 4-byte sequence decodes to a surrogate pair (2 chars).
                        isSurrogate = true;
                        byteSequence = 4;
                        break;
                    default:
                        trailCount = 0;
                        break;
                }
                if (trailCount == 0) {
                    if (isThrowException) {
                        throw new ArgumentException("Argument_InvalidByteSequence");
                    }
                }
                else {
                    // Payload bits of the lead byte.
                    bits = temp >> trailCount;
                    trailCount--;
                }
            }
        }
        else {
            // Expecting a trail byte of the form 10vvvvvv.
            if ((b & 0xC0) != 0x80) {
                if (isThrowException) {
                    throw new ArgumentException("Argument_InvalidByteSequence");
                }
                // Put the byte back; it may start a new sequence.
                index--;
                trailCount = 0;
                isSurrogate = false;
                // Drop the pending non-shortest-form check of the abandoned
                // sequence so it is not applied to the next one.
                byteSequence = 0;
            }
            else {
                switch (byteSequence) {
                    case 3:
                        // 1110XXXX 10Xxxxxx 10xxxxxx: non-shortest form check.
                        if (bits == 0 && (b & 0x20) == 0) {
                            if (isThrowException) {
                                throw new ArgumentException("Argument_InvalidByteSequence");
                            }
                            trailCount = -1;
                        }
                        byteSequence = 0;
                        break;
                    case 4:
                        // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx: non-shortest form check.
                        if (bits == 0) {
                            if ((b & 0x30) == 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                trailCount = -1;
                            }
                        }
                        else if ((bits & 0x04) != 0) {
                            // Top payload bit set: the value must not exceed
                            // U+10FFFF, so XXXXX may only be 10000.
                            if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                trailCount = -1;
                            }
                        }
                        byteSequence = 0;
                        break;
                }

                if (--trailCount == 0) {
                    charCount++;
                    if (isSurrogate) {
                        charCount++;
                        isSurrogate = false;
                    }
                }
            }
        }
    }
    return charCount;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetChars"]/*' />
|
|
// Decodes byteCount bytes from bytes[byteIndex] into chars starting at
// chars[charIndex] and returns the number of chars written.
// Runs the decoder-aware overload with no carried-over decoder state.
public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                             char[] chars, int charIndex) {
    int produced = GetChars(bytes, byteIndex, byteCount, chars, charIndex, null);
    return produced;
}
|
|
|
|
|
|
// Decodes bytes[byteIndex .. byteIndex+byteCount) into chars at charIndex
// and returns the number of chars written.
//
//   decoder - optional decoder. Its saved state (bits, trailCount,
//             isSurrogate, byteSequence) seeds the conversion and is
//             updated on normal completion, so a sequence split across
//             calls can be resumed.
//
// Throws ArgumentNullException / ArgumentOutOfRangeException for bad
// arguments, ArgumentException for invalid byte sequences when
// isThrowException is set, and ArgumentException("Argument_ConversionOverflow")
// when an IndexOutOfRangeException is raised while converting.
internal virtual int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, UTF8Decoder decoder) {
    if (bytes == null || chars == null) {
        throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
                                        "ArgumentNull_Array");
    }
    if (byteIndex < 0 || byteCount < 0) {
        throw new ArgumentOutOfRangeException((byteIndex<0 ? "byteIndex" : "byteCount"),
                                              "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (bytes.Length - byteIndex < byteCount) {
        throw new ArgumentOutOfRangeException("bytes",
                                              "ArgumentOutOfRange_IndexCountBuffer");
    }
    if (charIndex < 0 || charIndex > chars.Length) {
        throw new ArgumentOutOfRangeException("charIndex",
                                              "ArgumentOutOfRange_Index");
    }
    // Payload bits accumulated for the sequence being decoded.
    int bits = 0;
    // Trail bytes still expected for the current sequence. A negative value
    // marks a rejected sequence whose remaining trail bytes are skipped.
    int trailCount = 0;
    bool isSurrogate = false;

    // Length (3 or 4) of the sequence whose lead byte was just read; used
    // for the non-shortest-form check on the 2nd byte and then reset to 0
    // so the check is not repeated.
    int byteSequence = 0;
    if (decoder != null) {
        bits = decoder.bits;
        trailCount = decoder.trailCount;
        isSurrogate = decoder.isSurrogate;
        byteSequence = decoder.byteSequence;
    }
    int byteEnd = byteIndex + byteCount;
    int charStart = charIndex;
    try {
        while (byteIndex < byteEnd) {
            byte b = bytes[byteIndex++];
            if (trailCount == 0) {
                // Not inside a multi-byte sequence.
                if ((b & 0x80) == 0) {
                    // ASCII: 0vvvvvvv.
                    chars[charIndex++] = (char)b;
                }
                else {
                    // Count the leading 1 bits to get the sequence length.
                    byte temp = (byte)b;
                    while ((temp & 0x80) != 0) {
                        temp <<= 1;
                        trailCount++;
                    }
                    switch (trailCount) {
                        case 1:
                            // 10vvvvvv: a trail byte where a lead byte is expected.
                            trailCount = 0;
                            break;
                        case 2:
                            // 110XXXXx 10xxxxxx: XXXX must not be all zero.
                            if ((b & 0x1e) == 0) {
                                trailCount = 0;
                            }
                            break;
                        case 3:
                            byteSequence = 3;
                            break;
                        case 4:
                            // 4-byte sequence: decodes to a surrogate pair.
                            byteSequence = 4;
                            break;
                        default:
                            trailCount = 0;
                            break;
                    }
                    if (trailCount == 0) {
                        if (isThrowException) {
                            throw new ArgumentException("Argument_InvalidByteSequence");
                        }
                    }
                    else {
                        isSurrogate = (trailCount == 4);
                        // Payload bits of the lead byte.
                        bits = temp >> trailCount;
                        trailCount--;
                    }
                }
            }
            else {
                // Expecting a trail byte of the form 10vvvvvv.
                if ((b & 0xC0) != 0x80) {
                    if (isThrowException) {
                        throw new ArgumentException("Argument_InvalidByteSequence");
                    }
                    // Not a trail byte, but it may be ASCII or a valid lead
                    // byte of another sequence: put it back and re-examine it.
                    byteIndex--;
                    bits = 0;
                    trailCount = 0;
                    // Drop the pending non-shortest-form check of the abandoned
                    // sequence so it is not applied to the next one.
                    byteSequence = 0;
                }
                else {
                    switch (byteSequence) {
                        case 3:
                            // 1110XXXX 10Xxxxxx 10xxxxxx: non-shortest form check.
                            if (bits == 0 && (b & 0x20) == 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                trailCount = -1;
                            }
                            // Reset byteSequence: the check is done for this sequence.
                            byteSequence = 0;
                            break;
                        case 4:
                            // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx: non-shortest form check.
                            if (bits == 0) {
                                if ((b & 0x30) == 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    trailCount = -1;
                                }
                            }
                            else if ((bits & 0x04) != 0) {
                                // Top payload bit set: the value must not exceed
                                // U+10FFFF, so XXXXX may only be 10000.
                                if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    trailCount = -1;
                                }
                            }
                            byteSequence = 0;
                            break;
                    }
                    if (--trailCount >= 0) {
                        bits = bits << 6 | (b & 0x3F);
                        if (trailCount == 0) {
                            if (!isSurrogate) {
                                chars[charIndex++] = (char)bits;
                            }
                            else {
                                // bits >= 0x10000: emit a surrogate pair.
                                // 0xD7C0 + (bits >> 10) equals 0xD800 + ((bits - 0x10000) >> 10).
                                chars[charIndex++] = (char)(0xD7C0 + (bits >> 10));
                                chars[charIndex++] = (char)(CharacterInfo.LOW_SURROGATE_START + (bits & 0x3FF));
                            }
                        }
                    }
                }
            }
        }
    }
    catch (IndexOutOfRangeException) {
        throw new ArgumentException("Argument_ConversionOverflow");
    }
    if (decoder != null) {
        decoder.bits = bits;
        decoder.trailCount = trailCount;
        decoder.isSurrogate = isSurrogate;
        decoder.byteSequence = byteSequence;
    }
    return charIndex - charStart;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetDecoder"]/*' />
|
|
// Returns a new stateful decoder bound to this encoding instance.
public override Decoder GetDecoder() {
    UTF8Decoder decoder = new UTF8Decoder(this);
    return decoder;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetEncoder"]/*' />
|
|
// Returns a new stateful encoder bound to this encoding instance.
public override Encoder GetEncoder() {
    UTF8Encoder encoder = new UTF8Encoder(this);
    return encoder;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetMaxByteCount"]/*' />
|
|
// Upper bound on the bytes produced for charCount chars, computed as
// four bytes per char.
// Throws ArgumentOutOfRangeException when charCount is negative or the
// bound does not fit in an int.
public override int GetMaxByteCount(int charCount) {
    if (charCount < 0) {
        throw new ArgumentOutOfRangeException("charCount",
                                              "ArgumentOutOfRange_NeedNonNegNum");
    }

    // Computed in 64 bits so the multiplication itself cannot overflow.
    long worstCase = 4L * charCount;
    if (worstCase > int.MaxValue) {
        throw new ArgumentOutOfRangeException("charCount", "ArgumentOutOfRange_GetByteCountOverflow");
    }
    return (int)worstCase;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetMaxCharCount"]/*' />
|
|
// Upper bound on the chars produced for byteCount bytes: one char per byte.
// Throws ArgumentOutOfRangeException when byteCount is negative.
public override int GetMaxCharCount(int byteCount) {
    if (byteCount >= 0) {
        return byteCount;
    }
    throw new ArgumentOutOfRangeException("byteCount",
                                          "ArgumentOutOfRange_NeedNonNegNum");
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetPreamble"]/*' />
|
|
// Returns the UTF-8 identifier bytes (EF BB BF) when this instance was
// configured to emit it, otherwise the shared empty byte array.
public override byte[] GetPreamble()
{
    if (!emitUTF8Identifier) {
        return Encoding.emptyByteArray;
    }

    // A fresh array on every call, so a caller modifying the result
    // cannot affect what other callers receive.
    byte[] identifier = new byte[3];
    identifier[0] = 0xEF;
    identifier[1] = 0xBB;
    identifier[2] = 0xBF;
    return identifier;
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.Equals"]/*' />
|
|
// Two UTF8Encoding instances are equal when they agree on both
// configuration flags: whether the UTF-8 identifier is emitted and whether
// invalid input throws. Both flags are compared because GetHashCode() mixes
// in both; comparing only one would let equal objects hash differently.
// Returns false for null or for any object that is not a UTF8Encoding.
public override bool Equals(Object value) {
    UTF8Encoding that = value as UTF8Encoding;
    if (that != null) {
        return (emitUTF8Identifier == that.emitUTF8Identifier) &&
               (isThrowException == that.isThrowException);
    }
    return (false);
}
|
|
|
|
//| <include path='docs/doc[@for="UTF8Encoding.GetHashCode"]/*' />
|
|
// Hash built from the code page number plus one for each configuration
// flag that is set. The distribution is poor, but an encoding is unlikely
// to be used as a hashtable key.
public override int GetHashCode() {
    int hash = UTF8_CODEPAGE;
    if (isThrowException) {
        hash += 1;
    }
    if (emitUTF8Identifier) {
        hash += 1;
    }
    return hash;
}
|
|
|
|
// Stateful encoder: remembers a high surrogate that ended one block so it
// can be paired with a low surrogate at the start of the next block.
internal class UTF8Encoder : Encoder
{
    // Encoding that performs the actual conversion.
    private UTF8Encoding encoding;

    // High surrogate carried over from the previous call. Only meaningful
    // while storedSurrogate is true.
    internal int surrogateChar;
    internal bool storedSurrogate;

    // Copy of the flush argument of the current call. When true, a high
    // surrogate left dangling at the end of the input is resolved now
    // (the encoding throws or encodes it) instead of being stored for the
    // next call.
    internal bool mustFlush;

    public UTF8Encoder(UTF8Encoding encoding) {
        storedSurrogate = false;
        surrogateChar = 0;
        this.encoding = encoding;
    }

    // Counts the bytes GetBytes would produce for the same arguments.
    public override int GetByteCount(char[] chars, int index, int count, bool flush) {
        mustFlush = flush;
        int needed = encoding.GetByteCount(chars, index, count, this);
        return needed;
    }

    // Encodes a block of chars, taking the stored surrogate state into account.
    public override int GetBytes(char[] chars, int charIndex, int charCount,
                                 byte[] bytes, int byteIndex, bool flush) {
        mustFlush = flush;
        int written = encoding.GetBytes(chars, charIndex, charCount, bytes, byteIndex, this);
        return written;
    }
}
|
|
|
|
// Stateful decoder: keeps the progress of a multi-byte sequence that was
// split across two calls.
internal class UTF8Decoder : Decoder
{
    // Encoding that performs the actual conversion.
    private UTF8Encoding encoding;

    // Payload bits accumulated so far for the sequence in progress.
    internal int bits;

    // Trail bytes still expected for the sequence in progress.
    internal int trailCount;

    // True while the sequence in progress is a 4-byte sequence, i.e. one
    // that decodes to a surrogate pair. GetCharCount() does not accumulate
    // bits the way GetChars() does, so it relies on this flag to know that
    // the sequence yields two chars.
    internal bool isSurrogate;

    // Length (3 or 4) of the sequence whose lead byte has been read and
    // whose second byte still awaits the non-shortest-form check; 0 otherwise.
    internal int byteSequence;

    public UTF8Decoder(UTF8Encoding encoding) {
        this.encoding = encoding;
    }

    // Counts the chars GetChars would produce, taking the saved state into account.
    public override int GetCharCount(byte[] bytes, int index, int count) {
        int produced = encoding.GetCharCount(bytes, index, count, this);
        return produced;
    }

    // Decodes a block of bytes, resuming any sequence left unfinished by
    // the previous call.
    public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                 char[] chars, int charIndex) {
        int produced = encoding.GetChars(bytes, byteIndex, byteCount, chars,
                                         charIndex, this);
        return produced;
    }
}
|
|
}
|
|
}
|