// Source: singrdk/base/Applications/Runtime/Full/System/Text/UTF8Encoding.cs
// ==++==
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// ==--==
namespace System.Text
{
// Imports are scoped to the System.Text namespace.
using System;
using System.Diagnostics;
using System.Globalization;
// Encodes text into and out of UTF-8. UTF-8 is a way of writing
// Unicode characters with a variable number of bytes per character,
// optimized for the 128 ASCII characters (U+0000 through U+007F), each of
// which takes a single byte. It's an efficient way of encoding US English
// in an internationalizable way.
//
// The UTF-8 byte order mark is simply the Unicode byte order mark
// (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF). The byte order mark is
// used mostly to distinguish UTF-8 text from other encodings, and doesn't
// switch the byte orderings.
//| <include path='docs/doc[@for="UTF8Encoding"]/*' />
public class UTF8Encoding : Encoding {
//
// bytes   bits   UTF-8 representation
// -----   ----   -----------------------------------
//   1       7    0vvvvvvv
//   2      11    110vvvvv 10vvvvvv
//   3      16    1110vvvv 10vvvvvv 10vvvvvv
//   4      21    11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
// -----   ----   -----------------------------------
//
// Surrogate:
//   Real Unicode value = (HighSurrogate - 0xD800) * 0x400 + (LowSurrogate - 0xDC00) + 0x10000
//
// Code page number handed to the Encoding base constructor.
private const int UTF8_CODEPAGE=65001;
// Yes, the idea of emitting U+FEFF as a UTF-8 identifier has made it into
// the standard.
// When true, GetPreamble() returns the three-byte UTF-8 byte order mark
// (0xEF 0xBB 0xBF); otherwise it returns an empty array.
private bool emitUTF8Identifier;
// When true, malformed surrogates (while encoding) and malformed byte
// sequences (while decoding) raise ArgumentException instead of being
// handled leniently.
private bool isThrowException = false;
//| <include path='docs/doc[@for="UTF8Encoding.UTF8Encoding"]/*' />
// Creates a UTF-8 encoding that emits no byte order mark and does not
// throw on invalid input.
public UTF8Encoding()
    : this(false) {
}
//| <include path='docs/doc[@for="UTF8Encoding.UTF8Encoding1"]/*' />
// Creates a UTF-8 encoding that optionally emits the byte order mark from
// GetPreamble() and does not throw on invalid input.
public UTF8Encoding(bool encoderShouldEmitUTF8Identifier)
    : this(encoderShouldEmitUTF8Identifier, false) {
}
//| <include path='docs/doc[@for="UTF8Encoding.UTF8Encoding2"]/*' />
// Creates a UTF-8 encoding.
//   encoderShouldEmitUTF8Identifier - whether GetPreamble() returns the BOM.
//   throwOnInvalidBytes             - whether malformed input raises
//                                     ArgumentException.
public UTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
    : base(UTF8_CODEPAGE) {
    isThrowException = throwOnInvalidBytes;
    emitUTF8Identifier = encoderShouldEmitUTF8Identifier;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetByteCount"]/*' />
// Returns the number of bytes needed to encode chars[index .. index+count).
// No encoder is involved, so no surrogate is carried in from a previous
// call and a trailing high surrogate is always accounted for.
public override int GetByteCount(char[] chars, int index, int count) {
    UTF8Encoder noEncoder = null;
    return GetByteCount(chars, index, count, noEncoder);
}
// Validates the array arguments, pins the array and delegates to the
// pointer-based GetByteCount overload.
//   encoder - optional encoder whose stored surrogate / flush flag are
//             taken into account; may be null.
// Throws ArgumentNullException when chars is null and
// ArgumentOutOfRangeException when index/count are negative or do not fit
// inside chars.
internal unsafe int GetByteCount(char[] chars, int index, int count, UTF8Encoder encoder) {
    if (chars == null) {
        throw new ArgumentNullException("chars", "ArgumentNull_Array");
    }
    if (index < 0) {
        throw new ArgumentOutOfRangeException("index", "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (count < 0) {
        throw new ArgumentOutOfRangeException("count", "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (count > chars.Length - index) {
        throw new ArgumentOutOfRangeException("chars", "ArgumentOutOfRange_IndexCountBuffer");
    }
    // An empty array is answered directly, before it is pinned.
    if (chars.Length == 0) {
        return 0;
    }
    int total = -1;
    fixed (char* first = chars) {
        total = GetByteCount(first, index, count, encoder);
    }
    Debug.Assert(total!=-1, "[UTF8Encoding.GetByteCount]retVal!=-1");
    return total;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetByteCount1"]/*' />
// Returns the number of bytes needed to encode the whole string.
// Throws ArgumentNullException when chars is null.
public unsafe override int GetByteCount(String chars) {
    if (chars == null) {
        throw new ArgumentNullException("chars");
    }
    int total = -1;
    fixed (char* first = chars) {
        total = GetByteCount(first, 0, chars.Length, null);
    }
    Debug.Assert(total!=-1, "[UTF8Encoding.GetByteCount]retVal!=-1");
    return total;
}
// Core byte-count routine: walks chars[index .. index+count) and returns the
// number of UTF-8 bytes GetBytes would write for the same input.
//   chars   - pointer to the first char of the buffer (must not be null).
//   encoder - optional encoder; when it holds a stored high surrogate the
//             scan starts in "expecting a low surrogate" state, and its
//             mustFlush flag decides whether a trailing high surrogate is
//             counted. May be null, which behaves like a flushing call.
// Throws ArgumentException for unpaired surrogates when isThrowException is
// set, and ArgumentOutOfRangeException when the count overflows an int.
internal unsafe int GetByteCount(char *chars, int index, int count, UTF8Encoder encoder) {
    Debug.Assert(chars!=null, "[UTF8Encoding.GetByteCount]chars!=null");
    int end = index + count;
    int byteCount = 0;
    bool inSurrogate;
    if (encoder == null || !encoder.storedSurrogate) {
        inSurrogate = false;
    }
    else {
        inSurrogate = true;
    }
    // byteCount going negative means the running total wrapped around.
    while (index < end && byteCount >= 0) {
        char ch = chars[index++];
        if (inSurrogate) {
            //
            // In previous char, we encounter a high surrogate, so we are expecting a low surrogate here.
            //
            if (CharacterInfo.IsLowSurrogate(ch)) {
                inSurrogate = false;
                //
                // One surrogate pair will be translated into 4 bytes UTF8.
                //
                byteCount += 4;
            }
            else if (CharacterInfo.IsHighSurrogate(ch)) {
                // We have two high surrogates.
                if (isThrowException) {
                    throw new ArgumentException("Argument_InvalidHighSurrogate");
                }
                // Encode the previous high-surrogate char.
                byteCount += 3;
                // inSurrogate is still true, because this could be the start of another valid surrogate pair.
            }
            else {
                if (isThrowException) {
                    throw new ArgumentException("Argument_InvalidHighSurrogate");
                }
                // Encode the previous high-surrogate char.
                byteCount += 3;
                // Not a surrogate. Put the char back so that we can restart the encoding.
                inSurrogate = false;
                index--;
            }
        }
        else if (ch < 0x0080) {
            byteCount++;
        }
        else if (ch < 0x0800) {
            byteCount += 2;
        }
        else {
            if (CharacterInfo.IsHighSurrogate(ch)) {
                //
                // Found the start of a surrogate. Nothing is counted until we
                // know what follows it.
                //
                inSurrogate = true;
            }
            else if (CharacterInfo.IsLowSurrogate(ch) && isThrowException) {
                //
                // Found a low surrogate without encountering a high surrogate first.
                //
                throw new ArgumentException("Argument_InvalidLowSurrogate");
            }
            else {
                // Any other char >= 0x0800 (including a lone low surrogate in
                // lenient mode) takes three bytes.
                byteCount += 3;
            }
        }
    }
    // Check for overflows.
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException("count", "ArgumentOutOfRange_GetByteCountOverflow");
    if (inSurrogate) {
        // A high surrogate is still pending. It is only counted when there
        // is no encoder to carry it over, or the encoder is flushing.
        if (encoder == null || encoder.mustFlush) {
            if (isThrowException) {
                throw new ArgumentException("Argument_InvalidHighSurrogate");
            }
            byteCount += 3;
            // This last addition happens after the check above, so it needs
            // its own overflow check.
            if (byteCount < 0)
                throw new ArgumentOutOfRangeException("count", "ArgumentOutOfRange_GetByteCountOverflow");
        }
    }
    return byteCount;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetBytes"]/*' />
// Encodes chars[charIndex .. charIndex+charCount) into bytes starting at
// byteIndex and returns the number of bytes written. No encoder state is
// used.
public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex) {
    UTF8Encoder noEncoder = null;
    return GetBytes(chars, charIndex, charCount, bytes, byteIndex, noEncoder);
}
// Writes ch as a three-byte UTF-8 sequence (1110vvvv 10vvvvvv 10vvvvvv)
// into bytes at byteIndex and advances byteIndex past the written bytes.
private void EncodeThreeBytes(int ch, byte[] bytes, ref int byteIndex) {
    int lead = 0xE0 | ((ch >> 12) & 0x0F);
    int middle = 0x80 | ((ch >> 6) & 0x3F);
    int last = 0x80 | (ch & 0x3F);
    bytes[byteIndex++] = (byte)lead;
    bytes[byteIndex++] = (byte)middle;
    bytes[byteIndex++] = (byte)last;
}
// Validates the array arguments, pins the char array and delegates to the
// pointer-based GetBytes overload.
//   encoder - optional encoder supplying/receiving a pending high
//             surrogate; may be null.
// Throws ArgumentNullException for a null array and
// ArgumentOutOfRangeException for indices/counts outside the arrays.
private unsafe int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, UTF8Encoder encoder) {
    if (chars == null) {
        throw new ArgumentNullException("chars", "ArgumentNull_Array");
    }
    if (bytes == null) {
        throw new ArgumentNullException("bytes", "ArgumentNull_Array");
    }
    if (charIndex < 0) {
        throw new ArgumentOutOfRangeException("charIndex", "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (charCount < 0) {
        throw new ArgumentOutOfRangeException("charCount", "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (charCount > chars.Length - charIndex) {
        throw new ArgumentOutOfRangeException("chars", "ArgumentOutOfRange_IndexCountBuffer");
    }
    if (byteIndex < 0 || byteIndex > bytes.Length) {
        throw new ArgumentOutOfRangeException("byteIndex", "ArgumentOutOfRange_Index");
    }
    // An empty array is answered directly, before it is pinned.
    if (chars.Length == 0) {
        return 0;
    }
    int written = -1;
    fixed (char* first = chars) {
        written = GetBytes(first, charIndex, charCount, bytes, byteIndex, encoder);
    }
    Debug.Assert(written!=-1, "[UTF8Encoding.GetByteCount]retVal!=-1");
    return written;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetBytes2"]/*' />
// Encodes the whole string into a newly allocated, exactly sized byte array.
// Throws ArgumentNullException when s is null.
public override byte[] GetBytes(String s) {
    if (s == null) {
        throw new ArgumentNullException("s", "ArgumentNull_String");
    }
    byte[] encoded = new byte[GetByteCount(s)];
    GetBytes(s, 0, s.Length, encoded, 0);
    return encoded;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetBytes1"]/*' />
// Encodes s[charIndex .. charIndex+charCount) into bytes starting at
// byteIndex and returns the number of bytes written.
// Throws ArgumentNullException for a null argument and
// ArgumentOutOfRangeException for indices/counts outside s or bytes.
public unsafe override int GetBytes(String s, int charIndex, int charCount, byte[] bytes, int byteIndex) {
    if (s == null) {
        throw new ArgumentNullException("s", "ArgumentNull_String");
    }
    if (bytes == null) {
        throw new ArgumentNullException("bytes", "ArgumentNull_String");
    }
    if (charIndex < 0) {
        throw new ArgumentOutOfRangeException("charIndex", "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (charCount < 0) {
        throw new ArgumentOutOfRangeException("charCount", "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (charCount > s.Length - charIndex) {
        throw new ArgumentOutOfRangeException("s", "ArgumentOutOfRange_IndexCount");
    }
    if (byteIndex < 0 || byteIndex > bytes.Length) {
        throw new ArgumentOutOfRangeException("byteIndex", "ArgumentOutOfRange_Index");
    }
    int written = -1;
    fixed (char* first = s) {
        written = GetBytes(first, charIndex, charCount, bytes, byteIndex, null);
    }
    Debug.Assert(written!=-1, "[UTF8Encoding.GetByteCount]retVal!=-1");
    return written;
}
// Core encoder: converts chars[charIndex .. charIndex+charCount) to UTF-8,
// writing into bytes from byteIndex, and returns the number of bytes written.
//   encoder - optional encoder. A high surrogate it stored on a previous
//             call is consumed first; a high surrogate left pending at the
//             end is stored back into it unless encoder.mustFlush is set.
// Unpaired surrogates raise ArgumentException when isThrowException is set;
// otherwise they are written as three-byte sequences. Running out of room
// in bytes surfaces as ArgumentException("Argument_ConversionOverflow").
private unsafe int GetBytes(char *chars, int charIndex, int charCount, byte[] bytes, int byteIndex, UTF8Encoder encoder) {
    Debug.Assert(chars!=null, "[UTF8Encoding.GetBytes]chars!=null");
    int charEnd = charIndex + charCount;
    int byteStart = byteIndex;
    // High surrogate waiting for its low half, or -1 when there is none.
    int pendingHigh = -1;
    if (encoder != null && encoder.storedSurrogate) {
        pendingHigh = encoder.surrogateChar;
        encoder.storedSurrogate = false;
    }
    try {
        while (charIndex < charEnd) {
            char ch = chars[charIndex++];
            if (pendingHigh > 0) {
                // The previous char was a high surrogate; a low surrogate is expected.
                if (CharacterInfo.IsLowSurrogate(ch)) {
                    // Complete pair: rebuild the code point and emit four bytes.
                    int scalar = ((pendingHigh - CharacterInfo.HIGH_SURROGATE_START) << 10)
                                 + (ch - CharacterInfo.LOW_SURROGATE_START)
                                 + 0x10000;
                    bytes[byteIndex++] = (byte)(0xF0 | ((scalar >> 18) & 0x07));
                    bytes[byteIndex++] = (byte)(0x80 | ((scalar >> 12) & 0x3F));
                    bytes[byteIndex++] = (byte)(0x80 | ((scalar >> 6) & 0x3F));
                    bytes[byteIndex++] = (byte)(0x80 | (scalar & 0x3F));
                    pendingHigh = -1;
                }
                else {
                    // The pending high surrogate has no partner.
                    if (isThrowException) {
                        throw new ArgumentException("Argument_InvalidHighSurrogate");
                    }
                    // Emit the orphaned high surrogate on its own.
                    EncodeThreeBytes(pendingHigh, bytes, ref byteIndex);
                    if (CharacterInfo.IsHighSurrogate(ch)) {
                        // A second high surrogate may start a new valid pair.
                        pendingHigh = ch;
                    }
                    else {
                        // Ordinary char: push it back and encode it normally.
                        pendingHigh = -1;
                        charIndex--;
                    }
                }
            }
            else if (ch < 0x0080) {
                bytes[byteIndex++] = (byte)ch;
            }
            else if (ch < 0x0800) {
                bytes[byteIndex++] = (byte)(0xC0 | ((ch >> 6) & 0x1F));
                bytes[byteIndex++] = (byte)(0x80 | (ch & 0x3F));
            }
            else if (CharacterInfo.IsHighSurrogate(ch)) {
                // Start of a surrogate pair; wait for the next char.
                pendingHigh = ch;
            }
            else if (isThrowException && CharacterInfo.IsLowSurrogate(ch)) {
                throw new ArgumentException("Argument_InvalidLowSurrogate");
            }
            else {
                // ch >= 0x0800 and not a high surrogate: three bytes.
                EncodeThreeBytes(ch, bytes, ref byteIndex);
            }
        }
        if (pendingHigh > 0) {
            if (encoder == null || encoder.mustFlush) {
                if (isThrowException) {
                    throw new ArgumentException("Argument_InvalidHighSurrogate");
                }
                EncodeThreeBytes(pendingHigh, bytes, ref byteIndex);
            }
            else {
                // Keep the high surrogate for the next call.
                encoder.surrogateChar = pendingHigh;
                encoder.storedSurrogate = true;
            }
        }
    }
    catch (IndexOutOfRangeException) {
        throw new ArgumentException("Argument_ConversionOverflow");
    }
    return byteIndex - byteStart;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetCharCount"]/*' />
// Returns the number of chars produced by decoding
// bytes[index .. index+count) with no decoder state carried in.
public override int GetCharCount(byte[] bytes, int index, int count) {
    UTF8Decoder noDecoder = null;
    return GetCharCount(bytes, index, count, noDecoder);
}
// Counts the chars that GetChars would produce for bytes[index .. index+count).
//   decoder - optional decoder whose saved state (trailCount, isSurrogate,
//             byteSequence, bits) seeds the scan; it is only read here,
//             never written back. May be null.
// Invalid sequences raise ArgumentException when isThrowException is set;
// otherwise they contribute no chars. A completed four-byte sequence counts
// as two chars (a surrogate pair).
// Throws ArgumentNullException / ArgumentOutOfRangeException for bad
// array arguments.
internal virtual int GetCharCount(byte[] bytes, int index, int count, UTF8Decoder decoder) {
    if (bytes == null) {
        throw new ArgumentNullException("bytes",
            "ArgumentNull_Array");
    }
    if (index < 0 || count < 0) {
        throw new ArgumentOutOfRangeException((index<0 ? "index" : "count"),
            "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (bytes.Length - index < count) {
        throw new ArgumentOutOfRangeException("bytes",
            "ArgumentOutOfRange_IndexCountBuffer");
    }
    int charCount = 0;
    // Number of continuation bytes still expected. Negative while the rest
    // of a rejected sequence is being skipped.
    int trailCount = 0;
    // Indicate the current chunk of bytes is a 2-byte, 3-byte or 4-byte UTF8 sequence.
    // This is used to detect non-shortest form.
    // It will be reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
    // we don't check for non-shortest form again.
    int byteSequence = 0;
    bool isSurrogate = false;
    int bits = 0;
    if (decoder != null) {
        trailCount = decoder.trailCount;
        isSurrogate = decoder.isSurrogate;
        byteSequence = decoder.byteSequence;
        bits = decoder.bits;
    }
    int end = index + count;
    while (index < end) {
        byte b = bytes[index++];
        if (trailCount == 0) {
            if ((b & 0x80) == 0) {
                // This is an ASCII.
                charCount++;
            }
            else {
                // Count the leading 1 bits of the lead byte; that is the
                // total length of the sequence.
                byte temp = b;
                while ((temp & 0x80) != 0) {
                    temp <<= 1;
                    trailCount++;
                }
                switch (trailCount) {
                    case 1:
                        // 10xxxxxx is a continuation byte, not a lead byte.
                        trailCount = 0;
                        break;
                    case 2:
                        // Make sure that bit 8 ~ bit 11 is not all zero.
                        // 110XXXXx 10xxxxxx
                        if ((b & 0x1e) == 0) {
                            trailCount = 0;
                        }
                        break;
                    case 3:
                        byteSequence = 3;
                        break;
                    case 4:
                        isSurrogate = true;
                        byteSequence = 4;
                        break;
                    default:
                        // Five or more leading 1 bits: not valid UTF-8.
                        trailCount = 0;
                        break;
                }
                if (trailCount == 0) {
                    if (isThrowException) {
                        throw new ArgumentException("Argument_InvalidByteSequence");
                    }
                }
                else {
                    // Payload bits of the lead byte; only used for the
                    // range checks on the second byte.
                    bits = temp >> trailCount;
                    trailCount--;
                }
            }
        }
        else {
            // We are expecting to see trailing bytes like 10vvvvvv
            if ((b & 0xC0) != 0x80) {
                // If not, this is NOT a valid sequence.
                if (isThrowException) {
                    throw new ArgumentException("Argument_InvalidByteSequence");
                }
                // Put the byte back so it is re-examined as a possible lead byte.
                index--;
                trailCount = 0;
                isSurrogate = false;
                // The abandoned sequence must not leave its pending range
                // check behind; otherwise a following 2-byte sequence would
                // be tested against the 3/4-byte rules and wrongly rejected.
                byteSequence = 0;
            }
            else {
                switch (byteSequence) {
                    case 3:
                        // Check 3-byte sequence for non-shortest form.
                        // 1110XXXX 10Xxxxxx 10xxxxxx
                        if (bits == 0 && (b & 0x20) == 0) {
                            if (isThrowException) {
                                throw new ArgumentException("Argument_InvalidByteSequence");
                            }
                            trailCount = -1;
                        }
                        // We are done checking the non-shortest form, reset byteSequence to 0, so that we don't
                        // do the extra check for the remaining byte of the 3-byte chunk.
                        byteSequence = 0;
                        break;
                    case 4:
                        // Check 4-byte sequence for non-shortest form.
                        // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                        if (bits == 0) {
                            if ((b & 0x30) == 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                trailCount = -1;
                            }
                        }
                        else if ((bits & 0x04) != 0) {
                            // Make sure that the resulting Unicode is within the valid surrogate range.
                            // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                            // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                            // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                            // bit are all zero.
                            // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                            // XXXXX can only be 10000.
                            if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                trailCount = -1;
                            }
                        }
                        byteSequence = 0;
                        break;
                }
                // A rejected sequence has trailCount < 0 here, so it never
                // reaches 0 and produces no chars.
                if (--trailCount == 0) {
                    charCount++;
                    if (isSurrogate) {
                        charCount++;
                        isSurrogate = false;
                    }
                }
            }
        }
    }
    return charCount;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetChars"]/*' />
// Decodes bytes[byteIndex .. byteIndex+byteCount) into chars starting at
// charIndex, with no decoder state, and returns the number of chars written.
public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                             char[] chars, int charIndex) {
    return GetChars(bytes, byteIndex, byteCount, chars, charIndex, null);
}
// Core decoder: converts bytes[byteIndex .. byteIndex+byteCount) to chars,
// writing from charIndex, and returns the number of chars written.
//   decoder - optional decoder; its saved state (bits, trailCount,
//             isSurrogate, byteSequence) seeds the scan and is updated on
//             successful return so a sequence may span calls. May be null.
// Invalid sequences raise ArgumentException when isThrowException is set;
// otherwise they are dropped. Running out of room in chars surfaces as
// ArgumentException("Argument_ConversionOverflow").
// Throws ArgumentNullException / ArgumentOutOfRangeException for bad
// array arguments.
internal virtual int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, UTF8Decoder decoder) {
    if (bytes == null || chars == null) {
        throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
            "ArgumentNull_Array");
    }
    if (byteIndex < 0 || byteCount < 0) {
        throw new ArgumentOutOfRangeException((byteIndex<0 ? "byteIndex" : "byteCount"),
            "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (bytes.Length - byteIndex < byteCount) {
        throw new ArgumentOutOfRangeException("bytes",
            "ArgumentOutOfRange_IndexCountBuffer");
    }
    if (charIndex < 0 || charIndex > chars.Length) {
        throw new ArgumentOutOfRangeException("charIndex",
            "ArgumentOutOfRange_Index");
    }
    // Code point bits accumulated so far for the sequence in progress.
    int bits = 0;
    // Number of continuation bytes still expected. Negative while the rest
    // of a rejected sequence is being skipped.
    int trailCount = 0;
    bool isSurrogate = false;
    // Indicate the current chunk of bytes is a 2-byte, 3-byte or 4-byte UTF8 sequence.
    // This is used to detect non-shortest form.
    // It will be reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
    // we don't check for non-shortest form again.
    int byteSequence = 0;
    if (decoder != null) {
        bits = decoder.bits;
        trailCount = decoder.trailCount;
        isSurrogate = decoder.isSurrogate;
        byteSequence = decoder.byteSequence;
    }
    int byteEnd = byteIndex + byteCount;
    int charStart = charIndex;
    try {
        while (byteIndex < byteEnd) {
            byte b = bytes[byteIndex++];
            if (trailCount == 0) {
                //
                // We are not at a trailing byte.
                //
                if ((b & 0x80) == 0) {
                    // This is the ASCII case.
                    //   1       7    0vvvvvvv
                    chars[charIndex++] = (char)b;
                }
                else {
                    // Check if this is a valid starting byte: count its
                    // leading 1 bits to get the sequence length.
                    byte temp = (byte)b;
                    while ((temp & 0x80) != 0) {
                        temp <<= 1;
                        trailCount++;
                    }
                    switch (trailCount) {
                        case 1:
                            // 10xxxxxx is a continuation byte, not a lead byte.
                            trailCount = 0;
                            break;
                        case 2:
                            // Make sure that bit 8 ~ bit 11 is not all zero.
                            // 110XXXXx 10xxxxxx
                            if ((b & 0x1e) == 0) {
                                trailCount = 0;
                            }
                            break;
                        case 3:
                            byteSequence = 3;
                            break;
                        case 4:
                            //
                            // This is a surrogate unicode pair
                            //
                            byteSequence = 4;
                            break;
                        default:
                            // Five or more leading 1 bits: not valid UTF-8.
                            trailCount = 0;
                            break;
                    }
                    if (trailCount == 0) {
                        if (isThrowException) {
                            throw new ArgumentException("Argument_InvalidByteSequence");
                        }
                    }
                    else {
                        isSurrogate = (trailCount == 4);
                        bits = temp >> trailCount;
                        trailCount--;
                    }
                }
            }
            else {
                // We are expecting to see bytes like 10vvvvvv
                if ((b & 0xC0) != 0x80) {
                    // If not, this is NOT a valid sequence.
                    if (isThrowException) {
                        throw new ArgumentException("Argument_InvalidByteSequence");
                    }
                    // At this point, we are seeing an invalid trailing byte.
                    // However, this can be a valid starting byte for another UTF8 byte sequence (e.g.
                    // this character could be under 0x7f, or a valid leading byte like 110xxxxx).
                    // So let's put the current byte back, and try to see if this is a valid byte
                    // for another UTF8 byte sequence.
                    byteIndex--;
                    bits = 0;
                    trailCount = 0;
                    // The abandoned sequence must not leave its pending range
                    // check behind; otherwise a following 2-byte sequence would
                    // be tested against the 3/4-byte rules and wrongly rejected.
                    byteSequence = 0;
                }
                else {
                    switch (byteSequence) {
                        case 3:
                            // Check 3-byte sequence for non-shortest form.
                            // 1110XXXX 10Xxxxxx 10xxxxxx
                            if (bits == 0 && (b & 0x20) == 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                trailCount = -1;
                            }
                            // Reset byteSequence to zero since we are done with non-shortest form check.
                            byteSequence = 0;
                            break;
                        case 4:
                            // Check 4-byte sequence for non-shortest form.
                            // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                            if (bits == 0) {
                                if ((b & 0x30) == 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    trailCount = -1;
                                }
                            }
                            else if ((bits & 0x04) != 0) {
                                // Make sure that the resulting Unicode is within the valid surrogate range.
                                // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                // bit are all zero.
                                // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                // XXXXX can only be 10000.
                                if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    trailCount = -1;
                                }
                            }
                            byteSequence = 0;
                            break;
                    }
                    // A rejected sequence has trailCount < 0 here, so its
                    // remaining continuation bytes are consumed silently.
                    if (--trailCount >= 0) {
                        bits = bits << 6 | (b & 0x3F);
                        if (trailCount == 0) {
                            if (!isSurrogate) {
                                chars[charIndex++] = (char)bits;
                            }
                            else {
                                //
                                // bits >= 0x10000, use surrogate.
                                // 0xD7C0 is 0xD800 - (0x10000 >> 10), so the
                                // 0x10000 offset is removed by the addition.
                                //
                                chars[charIndex++] = (char)(0xD7C0 + (bits >> 10));
                                chars[charIndex++] = (char)(CharacterInfo.LOW_SURROGATE_START + (bits & 0x3FF));
                            }
                        }
                    }
                }
            }
        }
    }
    catch (IndexOutOfRangeException) {
        throw new ArgumentException("Argument_ConversionOverflow");
    }
    if (decoder != null) {
        // Save the partial-sequence state for the next call.
        decoder.bits = bits;
        decoder.trailCount = trailCount;
        decoder.isSurrogate = isSurrogate;
        decoder.byteSequence = byteSequence;
    }
    return charIndex - charStart;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetDecoder"]/*' />
// Returns a new decoder bound to this encoding; it keeps partial-sequence
// state between GetChars calls.
public override Decoder GetDecoder() {
    Decoder statefulDecoder = new UTF8Decoder(this);
    return statefulDecoder;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetEncoder"]/*' />
// Returns a new encoder bound to this encoding; it can hold a pending high
// surrogate between GetBytes calls.
public override Encoder GetEncoder() {
    Encoder statefulEncoder = new UTF8Encoder(this);
    return statefulEncoder;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetMaxByteCount"]/*' />
// Returns an upper bound (four bytes per char) on the bytes needed to
// encode charCount chars.
// Throws ArgumentOutOfRangeException when charCount is negative or the
// bound does not fit in an int.
public override int GetMaxByteCount(int charCount) {
    if (charCount < 0) {
        throw new ArgumentOutOfRangeException("charCount", "ArgumentOutOfRange_NeedNonNegNum");
    }
    // Computed in 64 bits so the multiplication itself cannot wrap.
    long worstCase = 4L * charCount;
    if (worstCase > Int32.MaxValue) {
        throw new ArgumentOutOfRangeException("charCount", "ArgumentOutOfRange_GetByteCountOverflow");
    }
    return (int)worstCase;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetMaxCharCount"]/*' />
// Returns an upper bound on the chars produced by decoding byteCount bytes.
// The bound is byteCount + 1: a decoder may already hold the first bytes of
// a four-byte sequence from an earlier call, in which case a single new
// byte completes it and yields two chars (a surrogate pair), while every
// other byte yields at most one char.
// Throws ArgumentOutOfRangeException when byteCount is negative or the
// bound does not fit in an int.
public override int GetMaxCharCount(int byteCount) {
    if (byteCount < 0) {
        throw new ArgumentOutOfRangeException("byteCount",
            "ArgumentOutOfRange_NeedNonNegNum");
    }
    // Computed in 64 bits so the addition itself cannot wrap.
    long charCount = (long)byteCount + 1;
    if (charCount > 0x7fffffff)
        throw new ArgumentOutOfRangeException("byteCount", "ArgumentOutOfRange_GetCharCountOverflow");
    return (int)charCount;
}
//| <include path='docs/doc[@for="UTF8Encoding.GetPreamble"]/*' />
// Returns the UTF-8 byte order mark (EF BB BF) when this encoding was
// created with the identifier enabled, otherwise the shared empty array.
public override byte[] GetPreamble()
{
    if (!emitUTF8Identifier) {
        return Encoding.emptyByteArray;
    }
    // A fresh array per call, so callers cannot modify a shared copy.
    byte[] mark = new byte[3];
    mark[0] = 0xEF;
    mark[1] = 0xBB;
    mark[2] = 0xBF;
    return mark;
}
//| <include path='docs/doc[@for="UTF8Encoding.Equals"]/*' />
// Two UTF8Encoding instances are equal when they agree on both settings:
// whether the byte order mark is emitted and whether invalid input throws.
// Comparing isThrowException as well keeps Equals consistent with
// GetHashCode, which already folds that flag into the hash; otherwise two
// "equal" instances could report different hash codes.
// Returns false for null or for any object that is not a UTF8Encoding.
public override bool Equals(Object value) {
    UTF8Encoding that = value as UTF8Encoding;
    if (that != null) {
        return (emitUTF8Identifier == that.emitUTF8Identifier) &&
               (isThrowException == that.isThrowException);
    }
    return (false);
}
//| <include path='docs/doc[@for="UTF8Encoding.GetHashCode"]/*' />
// Hash derived from the code page plus one for each enabled flag.
// Distribution is poor, but this type is unlikely to be used as a
// hashtable key.
public override int GetHashCode() {
    int hash = UTF8_CODEPAGE;
    if (isThrowException) {
        hash += 1;
    }
    if (emitUTF8Identifier) {
        hash += 1;
    }
    return hash;
}
// Stateful encoder handed out by UTF8Encoding.GetEncoder(). It forwards all
// work to the owning encoding and only carries state between calls.
internal class UTF8Encoder : Encoder
{
    private UTF8Encoding encoding;
    // A high surrogate seen at the end of one call is kept here until the
    // next call, which looks for the matching low surrogate. The value is
    // only meaningful while storedSurrogate is true.
    internal int surrogateChar;
    internal bool storedSurrogate;
    // Set from the flush argument of the most recent call. When true, the
    // encoding treats a dangling high surrogate at the end of the char[]
    // as final instead of storing it for a later call.
    internal bool mustFlush;
    public UTF8Encoder(UTF8Encoding encoding) {
        this.encoding = encoding;
        storedSurrogate = false;
        surrogateChar = 0;
    }
    // Records the flush flag, then asks the encoding for the byte count
    // with this encoder's state taken into account.
    public override int GetByteCount(char[] chars, int index, int count, bool flush) {
        mustFlush = flush;
        int needed = encoding.GetByteCount(chars, index, count, this);
        return needed;
    }
    // Records the flush flag, then asks the encoding to encode with this
    // encoder's state taken into account.
    public override int GetBytes(char[] chars, int charIndex, int charCount,
                                 byte[] bytes, int byteIndex, bool flush) {
        mustFlush = flush;
        int written = encoding.GetBytes(chars, charIndex, charCount, bytes, byteIndex, this);
        return written;
    }
}
// Stateful decoder handed out by UTF8Encoding.GetDecoder(). It forwards all
// work to the owning encoding and holds the state of a byte sequence that
// is split across calls.
internal class UTF8Decoder : Decoder
{
    private UTF8Encoding encoding;
    // Code point bits gathered so far for the sequence in progress.
    internal int bits;
    // Continuation bytes still expected for the sequence in progress.
    internal int trailCount;
    // True while the sequence in progress is a 4-byte one (a surrogate
    // pair). GetCharCount(), unlike GetChars(), does not accumulate the
    // bits, so it needs this flag to know the sequence yields two chars.
    internal bool isSurrogate;
    // 3 or 4 while the second byte of a 3- or 4-byte sequence is still to
    // be range-checked; 0 otherwise.
    internal int byteSequence;
    public UTF8Decoder(UTF8Encoding encoding) {
        this.encoding = encoding;
    }
    //| <include path='docs/doc[@for="UTF8Encoding.UTF8Decoder.GetCharCount"]/*' />
    // Counts the chars the given bytes would decode to, starting from this
    // decoder's saved state.
    public override int GetCharCount(byte[] bytes, int index, int count) {
        int produced = encoding.GetCharCount(bytes, index, count, this);
        return produced;
    }
    // Decodes the given bytes, starting from this decoder's saved state.
    public override int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                 char[] chars, int charIndex) {
        int written = encoding.GetChars(bytes, byteIndex, byteCount, chars, charIndex, this);
        return written;
    }
}
}
}