2008-03-05 09:52:00 -05:00
//------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
//------------------------------------------------------------------------------
// This RegexCharClass class provides the "set of Unicode chars" functionality
// used by the regexp engine.
//
// RegexCharClass supports a "string representation" of a character class.
// The string representation is NOT human-readable. It is a sequence of
// strictly increasing Unicode characters that begin ranges of characters
// that are alternately included in and excluded from the class.
//
// Membership of a character in the class can be determined by binary
// searching the string representation and determining if the including
// range is at an even or odd index.
//
// The RegexCharClass class itself is a builder class. One can add char ranges
// or sets or invert the class; then, the class can be converted to its
// string representation via RegexCharClass.ToSet().
//
#define ECMA
2008-11-17 18:29:00 -05:00
namespace System.Text.RegularExpressions
{
2008-03-05 09:52:00 -05:00
using System.Collections ;
using System.Globalization ;
using System.Diagnostics ;
internal sealed class RegexCharClass {
// Lowest and highest char values; Lastchar closes an open-ended final range.
internal const char Nullchar = '\0';
internal const char Lastchar = '\uFFFF';

// Set strings: a lone '\0' opens a range that runs to Lastchar (every char);
// the empty string contains no ranges.
internal const String Any = "\0";
internal const String Empty = "";

// Delimiter that brackets a group of category codes inside a category string.
internal const char GroupChar = (char)0;

internal static readonly RegexCharClass AnyClass = new RegexCharClass("\0");
internal static readonly RegexCharClass EmptyClass = new RegexCharClass(String.Empty);

// Category strings for \w and \W; assigned by the static constructor.
internal static readonly String Word;
internal static readonly String NotWord;

// Pseudo-category codes that CharInCategory resolves through Char.IsWhiteSpace
// instead of a UnicodeCategory comparison.
internal const short SpaceConst = 100;
internal const short NotSpaceConst = -100;
internal static readonly String Space = ((char)SpaceConst).ToString();
internal static readonly String NotSpace = NegateCategory(Space);

// ECMA-mode sets in string representation. The "Not" variants prepend '\0',
// which flips every included/excluded range.
internal const String ECMASpace = "\u0009\u000E\u0020\u0021";
internal const String NotECMASpace = "\0\u0009\u000E\u0020\u0021";
internal const String ECMAWord = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
internal const String NotECMAWord = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
internal const String ECMADigit = "\u0030\u003A";
internal const String NotECMADigit = "\0\u0030\u003A";

// Maps a category name ("Lu", "P", ...) to its category string.
internal static Hashtable _definedCategories;

// Builder state.
internal ArrayList _rangelist;        // SingleRange entries accumulated so far
internal StringBuilder _categories;   // category codes accumulated so far
internal bool _canonical;             // false when _rangelist may need Canonicalize()
internal bool _negate;                // when true, ToSet() prefixes the set with two Nullchars
2008-11-17 18:29:00 -05:00
2008-03-05 09:52:00 -05:00
// Populates _definedCategories with every two-letter Unicode general category
// and the one-letter groups (C, L, M, N, P, S, Z), then builds Word / NotWord.
//
// A category is stored as a one-char string whose char value is
// (int)UnicodeCategory + 1. UnicodeCategory is zero based, so adding one keeps
// every code strictly positive; that lets a negated code mean "not in this
// category" and lets 0 (GroupChar) delimit a group. The one is subtracted again
// when the code is tested.
static RegexCharClass() {
    _definedCategories = new Hashtable(31);

    // Others
    DefineCategoryGroup("C",
        new string[] { "Cc", "Cf", "Cn", "Co", "Cs" },
        new UnicodeCategory[] {
            UnicodeCategory.Control,
            UnicodeCategory.Format,
            UnicodeCategory.OtherNotAssigned,
            UnicodeCategory.PrivateUse,
            UnicodeCategory.Surrogate });

    // Letters
    DefineCategoryGroup("L",
        new string[] { "Ll", "Lm", "Lo", "Lt", "Lu" },
        new UnicodeCategory[] {
            UnicodeCategory.LowercaseLetter,
            UnicodeCategory.ModifierLetter,
            UnicodeCategory.OtherLetter,
            UnicodeCategory.TitlecaseLetter,
            UnicodeCategory.UppercaseLetter });

    // Marks
    DefineCategoryGroup("M",
        new string[] { "Mc", "Me", "Mn" },
        new UnicodeCategory[] {
            UnicodeCategory.SpacingCombiningMark,
            UnicodeCategory.EnclosingMark,
            UnicodeCategory.NonSpacingMark });

    // Numbers
    DefineCategoryGroup("N",
        new string[] { "Nd", "Nl", "No" },
        new UnicodeCategory[] {
            UnicodeCategory.DecimalDigitNumber,
            UnicodeCategory.LetterNumber,
            UnicodeCategory.OtherNumber });

    // Punctuation. Pi is the INITIAL quote category and Pf the FINAL one;
    // the two were previously assigned the wrong way round.
    DefineCategoryGroup("P",
        new string[] { "Pc", "Pd", "Pe", "Po", "Ps", "Pi", "Pf" },
        new UnicodeCategory[] {
            UnicodeCategory.ConnectorPunctuation,
            UnicodeCategory.DashPunctuation,
            UnicodeCategory.ClosePunctuation,
            UnicodeCategory.OtherPunctuation,
            UnicodeCategory.OpenPunctuation,
            UnicodeCategory.InitialQuotePunctuation,
            UnicodeCategory.FinalQuotePunctuation });

    // Symbols
    DefineCategoryGroup("S",
        new string[] { "Sc", "Sk", "Sm", "So" },
        new UnicodeCategory[] {
            UnicodeCategory.CurrencySymbol,
            UnicodeCategory.ModifierSymbol,
            UnicodeCategory.MathSymbol,
            UnicodeCategory.OtherSymbol });

    // Separators
    DefineCategoryGroup("Z",
        new string[] { "Zl", "Zp", "Zs" },
        new UnicodeCategory[] {
            UnicodeCategory.LineSeparator,
            UnicodeCategory.ParagraphSeparator,
            UnicodeCategory.SpaceSeparator });

    // Word characters: one group made of Ll, Lo, Lt, Lu, Nd and Pc.
    StringBuilder word = new StringBuilder(11);
    word.Append(GroupChar);
    word.Append(CategoryToChar(UnicodeCategory.LowercaseLetter));
    word.Append(CategoryToChar(UnicodeCategory.OtherLetter));
    word.Append(CategoryToChar(UnicodeCategory.TitlecaseLetter));
    word.Append(CategoryToChar(UnicodeCategory.UppercaseLetter));
    word.Append(CategoryToChar(UnicodeCategory.DecimalDigitNumber));
    word.Append(CategoryToChar(UnicodeCategory.ConnectorPunctuation));
    word.Append(GroupChar);

    Word = word.ToString();
    NotWord = NegateCategory(Word);

#if DBG
    // make sure the _propTable is correctly ordered
    int len = _propTable.GetLength(0);
    for (int i = 0; i < len - 1; i++)
        Debug.Assert(String.Compare(_propTable[i, 0], _propTable[i + 1, 0], false, CultureInfo.InvariantCulture) < 0, "RegexCharClass _propTable is out of order at (" + _propTable[i, 0] + ", " + _propTable[i + 1, 0] + ")");
#endif
}

// Encodes a UnicodeCategory as the (value + 1) char stored in category strings.
private static char CategoryToChar(UnicodeCategory category) {
    return (char)((int)category + 1);
}

// Registers each names[k] -> one-char string for categories[k], and groupName ->
// GroupChar + all member codes (in the order given) + GroupChar.
private static void DefineCategoryGroup(string groupName, string[] names, UnicodeCategory[] categories) {
    StringBuilder group = new StringBuilder(names.Length + 2);
    group.Append(GroupChar);
    for (int k = 0; k < names.Length; k++) {
        char encoded = CategoryToChar(categories[k]);
        _definedCategories[names[k]] = encoded.ToString();
        group.Append(encoded);
    }
    group.Append(GroupChar);
    _definedCategories[groupName] = group.ToString();
}
// RegexCharClass()
//
// Creates an empty character class.
// Starts with no ranges and no categories; an empty range list is canonical.
internal RegexCharClass() {
    _categories = new StringBuilder();
    _rangelist = new ArrayList(6);
    _canonical = true;
}
// RegexCharClass()
//
// Creates a character class out of a string representation.
// The range list is presized to one slot per pair of boundary characters
// (rounded up), then filled by AddSet.
internal RegexCharClass(String set) {
    int expectedRanges = (set.Length + 1) / 2;

    _categories = new StringBuilder();
    _canonical = true;
    _rangelist = new ArrayList(expectedRanges);

    AddSet(set);
}
// RegexCharClass()
//
// Creates a character class with a single range.
// The range [first, last] is stored as given; a single range is treated as canonical.
internal RegexCharClass(char first, char last) {
    SingleRange only = new SingleRange(first, last);

    _categories = new StringBuilder();
    _canonical = true;
    _rangelist = new ArrayList(1);
    _rangelist.Add(only);
}
// Builds a new class containing only the named category or property
// (see AddCategoryFromName for how the arguments are interpreted).
internal static RegexCharClass CreateFromCategory(string categoryName, bool invert, bool caseInsensitive, string pattern) {
    RegexCharClass result = new RegexCharClass();
    result.AddCategoryFromName(categoryName, invert, caseInsensitive, pattern);
    return result;
}
// AddCharClass()
//
// Adds the ranges and categories of another regex char class to this one.
// Appends every range and every category code of cc to this class.
//
// This class stops being canonical when cc itself is not canonical, or when
// cc's FIRST range starts at or before the end of this class's last range
// (the appended ranges could then overlap or be out of order). Ranges are
// copied rather than shared, because AddLowercase and Canonicalize mutate
// SingleRange instances in place and must not alter cc's ranges.
internal void AddCharClass(RegexCharClass cc) {
    // Snapshot the count so that adding a class to itself terminates.
    int incoming = cc.RangeCount();

    if (!cc._canonical) {
        _canonical = false;
    }
    else if (_canonical && RangeCount() > 0 && incoming > 0 &&
             cc.Range(0)._first <= Range(RangeCount() - 1)._last) {
        _canonical = false;
    }

    for (int i = 0; i < incoming; i++) {
        SingleRange source = cc.Range(i);
        _rangelist.Add(new SingleRange(source._first, source._last));
    }

    _categories.Append(cc._categories.ToString());
}
// AddSet()
//
// Adds a set (specified by its string representation) to the class.
// Characters are consumed in pairs (start, end + 1); an unpaired trailing
// character opens a range that extends to Lastchar. The class is marked
// non-canonical when the set begins at or before the end of the last range
// already present.
internal void AddSet(String set) {
    int length = set.Length;

    if (_canonical && length > 0 && _rangelist.Count > 0) {
        SingleRange tail = (SingleRange)_rangelist[_rangelist.Count - 1];
        if (set[0] <= tail._last)
            _canonical = false;
    }

    int pos = 0;
    while (pos + 1 < length) {
        char rangeStart = set[pos];
        char rangeEnd = (char)(set[pos + 1] - 1);
        _rangelist.Add(new SingleRange(rangeStart, rangeEnd));
        pos += 2;
    }

    if (pos < length)
        _rangelist.Add(new SingleRange(set[pos], Lastchar));
}
// AddRange()
//
// Adds a single range of characters to the class.
// NOTE(review): the canonical check runs AFTER the new range has been appended,
// so it compares 'first' with the 'last' of the range just added. For any
// range with first <= last this clears _canonical, forcing Canonicalize() on
// the next ToSet(). Behaviour kept as-is; confirm before tightening.
internal void AddRange(char first, char last) {
    SingleRange appended = new SingleRange(first, last);
    _rangelist.Add(appended);

    if (!_canonical || _rangelist.Count <= 0)
        return;

    SingleRange tail = (SingleRange)_rangelist[_rangelist.Count - 1];
    if (first <= tail._last)
        _canonical = false;
}
// The category codes accumulated so far, as a string.
// The _negate flag is not applied to the returned value.
internal string Category {
    get {
        string accumulated = _categories.ToString();
        return accumulated;
    }
}
// Write-only switch; when set, ToSet() prefixes the set string with two Nullchars.
internal bool Negate {
    set {
        _negate = value;
    }
}
// Adds a named Unicode category (looked up in _definedCategories) to the
// category string. A name that is not a defined category is treated as a
// property name and added as a set via SetFromProperty.
//
// When caseInsensitive is true, "Lu" and "Lt" are replaced by "Ll".
// When invert is true, the category string is negated before being appended.
internal void AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern) {
    object entry = _definedCategories[categoryName];

    if (entry == null) {
        AddSet(SetFromProperty(categoryName, invert, pattern));
        return;
    }

    string codes = (string)entry;

    bool isUpperOrTitle = categoryName.Equals("Lu") || categoryName.Equals("Lt");
    if (caseInsensitive && isUpperOrTitle)
        codes = (string)_definedCategories["Ll"];

    if (invert)
        codes = NegateCategory(codes); // negate the category

    _categories.Append(codes);
}
// Appends an already-encoded category string verbatim.
internal void AddCategory(string category) {
    StringBuilder target = _categories;
    target.Append(category);
}
2008-11-17 18:29:00 -05:00
////////////////////////////////////////////////////////////////////////////
// Let U be the set of Unicode character values and let L be the lowercase
// function, mapping from U to U. To perform case insensitive matching of
// character sets, we need to be able to map an interval I in U, say
//
//     I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
//
// to a set A such that A contains L(I) and A is contained in the union of
// I and L(I).
//
// The table below partitions U into intervals on which L is non-decreasing.
// Thus, for any interval J = [a, b] contained in one of these intervals,
// L(J) is contained in [L(a), L(b)].
//
// It is also true that for any such J, [L(a), L(b)] is contained in the
// union of J and L(J). This does not follow from L being non-decreasing on
// these intervals. It follows from the nature of L on each interval.
// On each interval, L has one of the following forms:
//
//     (1) L(ch) = constant            (LowercaseSet)
//     (2) L(ch) = ch + offset         (LowercaseAdd)
//     (3) L(ch) = ch | 1              (LowercaseBor)
//     (4) L(ch) = ch + (ch & 1)       (LowercaseBad)
//
// It is easy to verify that for any of these forms [L(a), L(b)] is
// contained in the union of [a, b] and L([a, b]).
////////////////////////////////////////////////////////////////////////////
2008-03-05 09:52:00 -05:00
// Operation codes stored in LC._lcOp; they select how AddLowercaseImpl maps
// the endpoints of a range to their lowercase counterparts.
internal const int LowercaseSet = 0 ; // Replace the char with the entry's data value.
internal const int LowercaseAdd = 1 ; // Add the entry's data value (an offset) to the char.
internal const int LowercaseBor = 2 ; // Bitwise or the char with 1.
internal const int LowercaseBad = 3 ; // Add (char & 1) to the char.
// Lower case mapping descriptor.
// One row of the lowercase mapping table: the inclusive interval
// [_chMin, _chMax], the Lowercase* operation to apply on it, and that
// operation's argument (unused by the Bor/Bad operations).
private sealed class LC {
    internal char _chMin;
    internal char _chMax;
    internal int _lcOp;
    internal int _data;

    internal LC(char chMin, char chMax, int lcOp, int data) {
        this._chMin = chMin;
        this._chMax = chMax;
        this._lcOp = lcOp;
        this._data = data;
    }
}
// Lowercase mapping intervals. Rows are listed in increasing, non-overlapping
// order; AddLowercaseImpl binary-searches this table on _chMax and then walks
// forward on _chMin, so that order must be preserved when editing.
private static readonly LC[] _lcTable = new LC[]
{
    new LC('\u0041', '\u005A', LowercaseAdd, 32),
    new LC('\u00C0', '\u00DE', LowercaseAdd, 32),
    new LC('\u0100', '\u012E', LowercaseBor, 0),
    new LC('\u0130', '\u0130', LowercaseSet, 0x0069),
    new LC('\u0132', '\u0136', LowercaseBor, 0),
    new LC('\u0139', '\u0147', LowercaseBad, 0),
    new LC('\u014A', '\u0176', LowercaseBor, 0),
    new LC('\u0178', '\u0178', LowercaseSet, 0x00FF),
    new LC('\u0179', '\u017D', LowercaseBad, 0),
    new LC('\u0181', '\u0181', LowercaseSet, 0x0253),
    new LC('\u0182', '\u0184', LowercaseBor, 0),
    new LC('\u0186', '\u0186', LowercaseSet, 0x0254),
    new LC('\u0187', '\u0187', LowercaseSet, 0x0188),
    new LC('\u0189', '\u018A', LowercaseAdd, 205),
    new LC('\u018B', '\u018B', LowercaseSet, 0x018C),
    new LC('\u018E', '\u018F', LowercaseAdd, 202),
    new LC('\u0190', '\u0190', LowercaseSet, 0x025B),
    new LC('\u0191', '\u0191', LowercaseSet, 0x0192),
    new LC('\u0193', '\u0193', LowercaseSet, 0x0260),
    new LC('\u0194', '\u0194', LowercaseSet, 0x0263),
    new LC('\u0196', '\u0196', LowercaseSet, 0x0269),
    new LC('\u0197', '\u0197', LowercaseSet, 0x0268),
    new LC('\u0198', '\u0198', LowercaseSet, 0x0199),
    new LC('\u019C', '\u019C', LowercaseSet, 0x026F),
    new LC('\u019D', '\u019D', LowercaseSet, 0x0272),
    new LC('\u01A0', '\u01A4', LowercaseBor, 0),
    new LC('\u01A7', '\u01A7', LowercaseSet, 0x01A8),
    new LC('\u01A9', '\u01A9', LowercaseSet, 0x0283),
    new LC('\u01AC', '\u01AC', LowercaseSet, 0x01AD),
    new LC('\u01AE', '\u01AE', LowercaseSet, 0x0288),
    new LC('\u01AF', '\u01AF', LowercaseSet, 0x01B0),
    new LC('\u01B1', '\u01B2', LowercaseAdd, 217),
    new LC('\u01B3', '\u01B5', LowercaseBad, 0),
    new LC('\u01B7', '\u01B7', LowercaseSet, 0x0292),
    new LC('\u01B8', '\u01B8', LowercaseSet, 0x01B9),
    new LC('\u01BC', '\u01BC', LowercaseSet, 0x01BD),
    new LC('\u01C4', '\u01C5', LowercaseSet, 0x01C6),
    new LC('\u01C7', '\u01C8', LowercaseSet, 0x01C9),
    new LC('\u01CA', '\u01CB', LowercaseSet, 0x01CC),
    new LC('\u01CD', '\u01DB', LowercaseBad, 0),
    new LC('\u01DE', '\u01EE', LowercaseBor, 0),
    new LC('\u01F1', '\u01F2', LowercaseSet, 0x01F3),
    new LC('\u01F4', '\u01F4', LowercaseSet, 0x01F5),
    new LC('\u01FA', '\u0216', LowercaseBor, 0),
    new LC('\u0386', '\u0386', LowercaseSet, 0x03AC),
    new LC('\u0388', '\u038A', LowercaseAdd, 37),
    new LC('\u038C', '\u038C', LowercaseSet, 0x03CC),
    new LC('\u038E', '\u038F', LowercaseAdd, 63),
    new LC('\u0391', '\u03AB', LowercaseAdd, 32),
    new LC('\u03E2', '\u03EE', LowercaseBor, 0),
    new LC('\u0401', '\u040F', LowercaseAdd, 80),
    new LC('\u0410', '\u042F', LowercaseAdd, 32),
    new LC('\u0460', '\u0480', LowercaseBor, 0),
    new LC('\u0490', '\u04BE', LowercaseBor, 0),
    new LC('\u04C1', '\u04C3', LowercaseBad, 0),
    new LC('\u04C7', '\u04C7', LowercaseSet, 0x04C8),
    new LC('\u04CB', '\u04CB', LowercaseSet, 0x04CC),
    new LC('\u04D0', '\u04EA', LowercaseBor, 0),
    new LC('\u04EE', '\u04F4', LowercaseBor, 0),
    new LC('\u04F8', '\u04F8', LowercaseSet, 0x04F9),
    new LC('\u0531', '\u0556', LowercaseAdd, 48),
    new LC('\u10A0', '\u10C5', LowercaseAdd, 48),
    new LC('\u1E00', '\u1EF8', LowercaseBor, 0),
    new LC('\u1F08', '\u1F0F', LowercaseAdd, -8),
    new LC('\u1F18', '\u1F1F', LowercaseAdd, -8),
    new LC('\u1F28', '\u1F2F', LowercaseAdd, -8),
    new LC('\u1F38', '\u1F3F', LowercaseAdd, -8),
    new LC('\u1F48', '\u1F4D', LowercaseAdd, -8),
    new LC('\u1F59', '\u1F59', LowercaseSet, 0x1F51),
    new LC('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53),
    new LC('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55),
    new LC('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57),
    new LC('\u1F68', '\u1F6F', LowercaseAdd, -8),
    new LC('\u1F88', '\u1F8F', LowercaseAdd, -8),
    new LC('\u1F98', '\u1F9F', LowercaseAdd, -8),
    new LC('\u1FA8', '\u1FAF', LowercaseAdd, -8),
    new LC('\u1FB8', '\u1FB9', LowercaseAdd, -8),
    new LC('\u1FBA', '\u1FBB', LowercaseAdd, -74),
    new LC('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3),
    new LC('\u1FC8', '\u1FCB', LowercaseAdd, -86),
    new LC('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3),
    new LC('\u1FD8', '\u1FD9', LowercaseAdd, -8),
    new LC('\u1FDA', '\u1FDB', LowercaseAdd, -100),
    new LC('\u1FE8', '\u1FE9', LowercaseAdd, -8),
    new LC('\u1FEA', '\u1FEB', LowercaseAdd, -112),
    new LC('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5),
    new LC('\u1FF8', '\u1FF9', LowercaseAdd, -128),
    new LC('\u1FFA', '\u1FFB', LowercaseAdd, -126),
    new LC('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3),
    new LC('\u2160', '\u216F', LowercaseAdd, 16),
    new LC('\u24B6', '\u24D0', LowercaseAdd, 26),
    new LC('\uFF21', '\uFF3A', LowercaseAdd, 32),
};
// AddLowercase()
//
// Adds to the class any lowercase versions of characters already
// in the class. Used for case-insensitivity.
// Only the ranges present on entry are visited; ranges appended during the
// pass are not re-processed. A single-character range is lowercased in place
// (the original character is replaced, not kept); wider ranges are handed to
// AddLowercaseImpl, which appends the extra ranges. The class is always left
// non-canonical.
//
// NOTE(review): 'culture' is forwarded to AddLowercaseImpl but is not used for
// the single-character case - Char.ToLower(char) is called without it. Confirm
// against the matcher's lowercasing before changing.
internal void AddLowercase(CultureInfo culture) {
    _canonical = false;

    int originalCount = _rangelist.Count;
    for (int index = 0; index < originalCount; index++) {
        SingleRange current = (SingleRange)_rangelist[index];

        if (current._first != current._last) {
            AddLowercaseImpl(current._first, current._last, culture);
            continue;
        }

        char lowered = Char.ToLower(current._first); //, culture);
        current._last = lowered;
        current._first = lowered;
    }
}
// AddLowercaseImpl()
//
// For a single range that's in the set, adds any additional ranges
// necessary to ensure that lowercase equivalents are also included.
// For a one-character range the lowercase form is added only when it differs
// from the character. Otherwise every _lcTable row that intersects
// [chMin, chMax] is clipped to that interval, its endpoints are mapped with
// the row's operation, and the mapped interval is added unless it already
// lies inside [chMin, chMax].
//
// NOTE(review): 'culture' is not used by this method.
internal void AddLowercaseImpl(char chMin, char chMax, CultureInfo culture) {
    if (chMin == chMax) {
        chMin = Char.ToLower(chMin); //, culture);
        if (chMin != chMax)
            AddRange(chMin, chMin);
        return;
    }

    // Binary search for the first row whose upper bound is not below chMin.
    int row = 0;
    int upper = _lcTable.Length;
    while (row < upper) {
        int middle = (row + upper) / 2;
        if (_lcTable[middle]._chMax < chMin)
            row = middle + 1;
        else
            upper = middle;
    }

    if (row >= _lcTable.Length)
        return;

    while (row < _lcTable.Length) {
        LC entry = _lcTable[row];
        if (entry._chMin > chMax)
            break;

        // Clip the row to the requested interval.
        char lo = entry._chMin;
        if (lo < chMin)
            lo = chMin;

        char hi = entry._chMax;
        if (hi > chMax)
            hi = chMax;

        if (entry._lcOp == LowercaseSet) {
            lo = (char)entry._data;
            hi = (char)entry._data;
        }
        else if (entry._lcOp == LowercaseAdd) {
            lo += (char)entry._data;
            hi += (char)entry._data;
        }
        else if (entry._lcOp == LowercaseBor) {
            lo |= (char)1;
            hi |= (char)1;
        }
        else if (entry._lcOp == LowercaseBad) {
            lo += (char)(lo & 1);
            hi += (char)(hi & 1);
        }

        if (lo < chMin || hi > chMax)
            AddRange(lo, hi);

        row++;
    }
}
// ToSet()
//
// Constructs the string representation of the class.
// Canonicalizes first if needed. Each range contributes its first character
// and, unless the range ends at Lastchar, the character after its last.
// A negated class is prefixed with two Nullchars.
internal String ToSet() {
    if (!_canonical)
        Canonicalize();

    int rangeCount = _rangelist.Count;
    int capacity = _negate ? rangeCount * 2 + 2 : rangeCount * 2;
    StringBuilder text = new StringBuilder(capacity);

    if (_negate) {
        text.Append(Nullchar);
        text.Append(Nullchar);
    }

    foreach (SingleRange range in _rangelist) {
        text.Append(range._first);
        if (range._last != Lastchar)
            text.Append((char)(range._last + 1));
    }

    return text.ToString();
}
// ToSetCi()
//
// Constructs the string representation of the class, first adding
// lowercase equivalents when caseInsensitive is true.
// Note that the lowercase pass mutates this class before it is rendered.
internal String ToSetCi(bool caseInsensitive, CultureInfo culture) {
    if (caseInsensitive) {
        AddLowercase(culture);
    }

    String rendered = ToSet();
    return rendered;
}
// SetSize()
//
// Returns the number of characters included in the set.
// Each (start, end + 1) pair contributes end + 1 - start characters; an
// unpaired trailing start contributes everything up to and including U+FFFF.
internal static int SetSize(String set) {
    int total = 0;
    int pos = 0;

    while (pos + 1 < set.Length) {
        total += set[pos + 1] - set[pos];
        pos += 2;
    }

    if (pos < set.Length)
        total += 0x10000 - set[pos];

    return total;
}
// SetInverse()
//
// Inverts a string representation of a class directly.
// A leading Nullchar toggles every range: it is added when absent and
// stripped when present (the lone "\0" set becomes the empty set).
internal static String SetInverse(String set) {
    bool startsAtNull = set.Length != 0 && set[0] == Nullchar;

    if (!startsAtNull)
        return Any + set;

    return set.Length == 1 ? Empty : set.Substring(1, set.Length - 1);
}
// SetUnion()
//
// Builds the union of two string representations of a class directly.
// Trivial cases (one side empty, one side Any, both equal) return one of the
// arguments unchanged. Otherwise the two boundary strings are merged: the
// side whose next boundary is lower (the 'right' side on ties) opens an
// included range, and the range is extended across the other side for as
// long as the closing boundary falls inside one of its included ranges
// (odd index).
internal static String SetUnion(String setI, String setJ) {
    if (setI.Equals(Empty) || setJ.Equals(Any))
        return setJ;

    if (setJ.Equals(Empty) || setI.Equals(Any))
        return setI;

    if (setI == setJ)
        return setI;

    // 'left' and 'right' (with their cursors) swap roles as the merge proceeds.
    String left = setI;
    String right = setJ;
    int leftPos = 0;
    int rightPos = 0;
    int tempPos;
    String tempSet;

    StringBuilder merged = new StringBuilder(setI.Length + setJ.Length);
    bool openToEnd = false;

    while (!openToEnd) {
        if (rightPos == right.Length) {
            merged.Append(left, leftPos, left.Length - leftPos);
            break;
        }

        if (leftPos == left.Length) {
            merged.Append(right, rightPos, right.Length - rightPos);
            break;
        }

        // Make 'right' the side with the smaller next boundary.
        if (right[rightPos] > left[leftPos]) {
            tempPos = leftPos;
            leftPos = rightPos;
            rightPos = tempPos;

            tempSet = left;
            left = right;
            right = tempSet;
        }

        merged.Append(right[rightPos++]);

        if (rightPos == right.Length)
            break;

        char closing = right[rightPos++];

        while (true) {
            while (leftPos < left.Length && left[leftPos] <= closing)
                leftPos++;

            if ((leftPos & 0x1) == 0) {
                // The other side is excluded here: the range really closes.
                merged.Append(closing);
                break;
            }

            if (leftPos == left.Length) {
                // The other side stays included to the end: leave the range open.
                openToEnd = true;
                break;
            }

            closing = left[leftPos++];

            tempPos = leftPos;
            leftPos = rightPos;
            rightPos = tempPos;

            tempSet = left;
            left = right;
            right = tempSet;
        }
    }

    return merged.ToString();
}
// The union of two category strings is their concatenation.
internal static String CategoryUnion(string catI, string catJ) {
    return String.Concat(catI, catJ);
}
2008-11-17 18:29:00 -05:00
2008-03-05 09:52:00 -05:00
// SetFromChar()
//
// Builds the string representation of a class with a single character.
// Result is "ch, ch + 1", or just "ch" for Lastchar, which has no successor.
internal static String SetFromChar(char ch) {
    if (ch == Lastchar)
        return ch.ToString();

    return new String(new char[] { ch, (char)(ch + 1) });
}
// SetInverseFromChar()
//
// Builds the string representation of a class that omits a single character.
// Result is "\0, ch, ch + 1"; the leading pair is dropped for Nullchar and
// the trailing boundary is dropped for Lastchar.
internal static String SetInverseFromChar(char ch) {
    String below = (ch == Nullchar)
        ? ""
        : new String(new char[] { Nullchar, ch });

    String above = (ch == Lastchar)
        ? ""
        : ((char)(ch + 1)).ToString();

    return below + above;
}
// IsSingleton()
//
// True if the set contains a single character only.
// Only the two-boundary form "ch, ch + 1" is recognised; categories are not
// considered here.
internal static bool IsSingleton(String set) {
    if (set.Length != 2)
        return false;

    return set[0] == set[1] - 1;
}
// SingletonChar()
//
// Returns the single character of a singleton set.
// The first boundary of the set is the character itself; the caller is
// expected to have checked IsSingleton.
internal static char SingletonChar(String set) {
    char only = set[0];
    return only;
}
// Tests membership in the ECMAWord set, with no category string.
internal static bool IsECMAWordChar(char ch) {
    String noCategories = String.Empty;
    return CharInSet(ch, ECMAWord, noCategories);
}
// Tests membership in the Word category string built by the static constructor.
internal static bool IsWordChar(char ch) {
    bool inWord = CharInCategory(ch, Word);
    return inWord;
}
// Membership test that honours negation: a set string beginning with two
// Nullchars inverts the combined set/category result.
internal static bool CharInSet(char ch, String set, String category) {
    bool member = CharInSetInternal(ch, set, category);
    bool negated = set.Length >= 2 && set[0] == 0 && set[1] == 0;

    return negated ? !member : member;
}
// CharInSetInternal()
//
// Determines a character's membership in a character class (via the
// string representation of the class), ignoring any negation prefix.
// Binary-searches for the number of boundaries that are <= ch. An odd count
// means ch lies inside an included range; otherwise the category string
// decides.
internal static bool CharInSetInternal(char ch, string set, String category) {
    int low = 0;
    int high = set.Length;

    while (low != high) {
        int middle = (low + high) / 2;
        if (ch < set[middle])
            high = middle;
        else
            low = middle + 1;
    }

    bool insideRange = (low & 0x1) != 0;
    return insideRange || CharInCategory(ch, category);
}
// Returns true as soon as any entry of the category string accepts ch.
// Entries are read as signed 16-bit codes:
//   0              - start of a group, evaluated as a unit by CharInCategoryGroup
//   SpaceConst     - accepts whitespace (Char.IsWhiteSpace)
//   NotSpaceConst  - accepts non-whitespace
//   other  > 0     - accepts chars whose UnicodeCategory equals code - 1
//   other  < 0     - accepts chars whose UnicodeCategory differs from -code - 1
internal static bool CharInCategory(char ch, string category) {
    if (category.Length == 0)
        return false;

    UnicodeCategory actual = char.GetUnicodeCategory(ch);

    for (int pos = 0; pos < category.Length; pos++) {
        int code = (short)category[pos];
        bool accepted;

        if (code == 0) {
            // zero is our marker for a group of categories - treated as a unit;
            // the helper leaves pos on the group's closing marker
            accepted = CharInCategoryGroup(ch, actual, category, ref pos);
        }
        else if (code == SpaceConst) {
            accepted = Char.IsWhiteSpace(ch);
        }
        else if (code == NotSpaceConst) {
            accepted = !Char.IsWhiteSpace(ch);
        }
        else if (code > 0) {
            accepted = actual == (UnicodeCategory)(code - 1);
        }
        else {
            accepted = actual != (UnicodeCategory)(-code - 1);
        }

        if (accepted)
            return true;
    }

    return false;
}
// CharInCategoryGroup
// This is used for categories which are composed of other categories - L, N, Z, W...
// These groups need special treatment when they are negated
// On entry i indexes the group's opening 0 marker; on return it indexes the
// closing 0 marker. The sign of the first code decides how the whole group is
// read: a positive group accepts a char that is in ANY listed category, a
// non-positive group accepts a char that is in NONE of them.
private static bool CharInCategoryGroup(char ch, UnicodeCategory chcategory, string category, ref int i) {
    i++;
    bool positiveGroup = (short)category[i] > 0;
    bool matched = false;

    for (int code = (short)category[i]; code != 0; code = (short)category[++i]) {
        if (matched)
            continue; // result is settled; just advance to the closing marker

        int decoded = positiveGroup ? code - 1 : -code - 1;
        matched = chcategory == (UnicodeCategory)decoded;
    }

    return positiveGroup ? matched : !matched;
}
// Returns a category string in which every code is arithmetically negated
// (as a signed 16-bit value), turning "in category" entries into "not in
// category" entries and vice versa. GroupChar (0) is left unchanged, so group
// delimiters survive. Returns null for a null argument.
internal static string NegateCategory(string category) {
    if (category == null)
        return null;

    StringBuilder sb = new StringBuilder(category.Length);

    for (int i = 0; i < category.Length; i++) {
        // The narrowing casts rely on wrap-around; make that explicit so the
        // method behaves the same when the assembly is compiled with /checked.
        unchecked {
            short code = (short)category[i];
            sb.Append((char)-code);
        }
    }

    return sb.ToString();
}
// RangeCount()
//
// The number of single ranges that have been accumulated so far.
// Thin accessor over the range list's Count.
private int RangeCount() {
    ArrayList ranges = _rangelist;
    return ranges.Count;
}
// Range(int i)
//
// The ith range.
// Returns the stored SingleRange instance itself (not a copy).
private SingleRange Range(int i) {
    object entry = _rangelist[i];
    return (SingleRange)entry;
}
// SingleRangeComparer
//
// For sorting ranges; compare based on the first char in the range.
// Orders SingleRange objects by _first only; ranges with equal _first
// compare as equal regardless of _last.
private sealed class SingleRangeComparer : IComparer {
    public int Compare(Object x, Object y) {
        char left = ((SingleRange)x)._first;
        char right = ((SingleRange)y)._first;

        if (left < right)
            return -1;

        if (left > right)
            return 1;

        return 0;
    }
}
// SingleRange
//
// A first/last pair representing a single range of characters.
// Both bounds are inclusive. Instances are mutable reference objects: the
// fields are rewritten in place by AddLowercase and Canonicalize.
private sealed class SingleRange {
    internal char _first;
    internal char _last;

    internal SingleRange(char first, char last) {
        this._first = first;
        this._last = last;
    }
}
// Canonicalize()
//
// Logic to reduce a character class to a unique, sorted form.
// Sorts the ranges by first character, then compacts the list in place:
// 'write' is the slot of the range being grown and 'read' scans ahead,
// absorbing every range that overlaps or abuts it. Whatever is left past the
// last written slot is removed.
private void Canonicalize() {
    _canonical = true;
    _rangelist.Sort(0, _rangelist.Count, new SingleRangeComparer());

    if (_rangelist.Count <= 1)
        return;

    //
    // Find and eliminate overlapping or abutting ranges
    //
    bool finished = false;
    int read = 1;
    int write = 0;

    while (true) {
        char runLast = ((SingleRange)_rangelist[write])._last;

        while (true) {
            // Nothing left to scan, or the run already reaches the top of
            // the char range and swallows everything after it.
            if (read == _rangelist.Count || runLast == Lastchar) {
                finished = true;
                break;
            }

            SingleRange candidate = (SingleRange)_rangelist[read];

            // A gap of at least one character starts a new run.
            if (candidate._first > runLast + 1)
                break;

            if (runLast < candidate._last)
                runLast = candidate._last;

            read++;
        }

        ((SingleRange)_rangelist[write])._last = runLast;
        write++;

        if (finished)
            break;

        if (write < read)
            _rangelist[write] = _rangelist[read];

        read++;
    }

    _rangelist.RemoveRange(write, _rangelist.Count - write);
}
2008-11-17 18:29:00 -05:00
// The property table contains all the block definitions defined in the
// XML schema spec (http://www.w3.org/TR/2001/PR-xmlschema-2-20010316/#charcter-classes), Unicode 3.0 spec (www.unicode.org),
// and Perl 5.6 (see Programming Perl, 3rd edition page 167). Three blocks defined by Perl (and here) may
// not be in the Unicode spec: IsHighPrivateUseSurrogates, IsHighSurrogates, and IsLowSurrogates.
//
// In addition, there was some inconsistency in the definition of IsTibetan and IsArabicPresentationForms-B.
2008-03-05 09:52:00 -05:00
// Regex goes with the XML spec on both of these, since it seems to be (oddly enough) more correct than the Unicode spec!
//
// This is what we use:
// IsTibetan: 0x0F00 - 0x0FFF
// IsArabicPresentationForms-B: 0xFE70-0xFEFE
//
2008-11-17 18:29:00 -05:00
// The Unicode spec is inconsistent for IsTibetan. Its range is 0x0F00 - 0x0FBF. However, it clearly defines
2008-03-05 09:52:00 -05:00
// Tibetan characters above 0x0FBF. This appears to be an error between the 2.0 and 3.0 spec.
//
2008-11-17 18:29:00 -05:00
// The Unicode spec is also unclear on IsArabicPresentationForms-B, defining it as 0xFE70-0xFEFF.
// There is only one character different here, 0xFEFF, which is a byte-order mark character and
2008-03-05 09:52:00 -05:00
// is labeled in the spec as special. I have excluded it from IsArabicPresentationForms-B and left it in IsSpecials.
// _propTable
//
// Two-column lookup table: column 0 is a property name, column 1 is the set
// string returned for that name by SetFromProperty.
//
// Has to be sorted by the first column: SetFromProperty binary-searches it
// with a culture-sensitive String.Compare, so the rows must appear in the
// order that comparison produces. This is NOT ordinal order -- "_xmlC" comes
// before the "Is..." names, and "IsLatinExtendedAdditional" sits between
// "IsLatinExtended-A" and "IsLatinExtended-B". Keep new rows consistent with
// the existing ordering.
//
// For the "Is<Block>" rows each pair of characters is the first character of
// the block followed by the first character past its end (for example
// IsArabicPresentationForms-B, documented above as 0xFE70-0xFEFE, is stored
// as "\uFE70\uFEFF"). IsSpecials holds two such pairs.
private static readonly String [ , ] _propTable = {
{ "_xmlC" , /* Name Char */ "\u002D\u002F\u0030\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00AA\u00AB\u00B2\u00B4\u00B5\u00B6\u00B9\u00BB\u00BC\u00BF\u00C0\u00D7\u00D8\u00F7\u00F8\u01AA\u01AB\u01BB\u01BC\u01BE\u01C4\u01F6\u01FA\u0218\u0250\u02A9\u0386\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F3\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0490\u04C0\u04C1\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0561\u0588\u0660\u066A\u06F0\u06FA\u0966\u0970\u09E6\u09F0\u09F4\u09FA\u0A66\u0A70\u0AE6\u0AF0\u0B66\u0B70\u0BE7\u0BF3\u0C66\u0C70\u0CE6\u0CF0\u0D66\u0D70\u0E50\u0E5A\u0ED0\u0EDA\u0F20\u0F34\u10A0\u10C6\u10D0\u10F7\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u2070\u2071\u2074"
+ "\u207A\u207F\u208A\u20A8\u20A9\u2102\u2103\u2107\u2108\u210A\u2114\u2115\u211E\u2120\u2123\u2124\u2125\u2126\u2127\u2128\u2129\u212A\u2132\u2133\u2135\u2153\u2183\u2460\u249C\u24B6\u24EB\u2776\u2794\u3007\u3008\u3021\u302A\u3280\u328A\u3372\u3375\u3376\u3377\u3385\u338A\u338D\u3391\u3399\u339F\u33A9\u33AA\u33AD\u33AE\u33B0\u33B4\u33B9\u33BA\u33BF\u33C0\u33C1\u33C2\u33C3\u33C6\u33C7\u33C8\u33C9\u33D8\u33D9\u33DE" } ,
{ "_xmlD" , "\u0030\u003A\u0660\u066A\u06F0\u06FA\u0966\u0970\u09E6\u09F0\u0A66\u0A70\u0AE6\u0AF0\u0B66\u0B70\u0BE7\u0BF0\u0C66\u0C70\u0CE6\u0CF0\u0D66\u0D70\u0E50\u0E5A\u0ED0\u0EDA\u0F20\u0F2A\u2070\u2071\u2074\u207A\u2080\u208A" } ,
{ "_xmlI" , /* Start Name Char */ "\u003A\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00A8\u00A9\u00AA\u00AB\u00AF\u00B0\u00B4\u00B6\u00B8\u00B9\u00BA\u00BB\u00C0\u00D7\u00D8\u00F7\u00F8\u01F6\u01FA\u0218\u0250\u02A9\u02B0\u02DF\u02E0\u02EA\u0374\u0375\u037A\u037B\u0384\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0588\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u064B\u0671\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06D6\u06E5\u06E7\u0905\u093A\u0958\u0962\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09DC\u09DE\u09DF\u09E2\u09F0\u09F2\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A59\u0A5D\u0A5E\u0A5F\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0AE0\u0AE1\u0B05"
+ "\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B5C\u0B5E\u0B5F\u0B62\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C60\u0C62\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CDE\u0CDF\u0CE0\u0CE2\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D60\u0D62\u0E01\u0E31\u0E32\u0E34\u0E40\u0E47\u0E4F\u0E50\u0E5A\u0E5C\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EB1\u0EB2\u0EB4\u0EBD\u0EBE\u0EC0\u0EC5\u0EDC\u0EDE\u0F18\u0F1A\u0F40\u0F48\u0F49\u0F6A\u10A0\u10C6\u10D0\u10F7\u1100\u115A\u115F\u11A3\u11A8\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FC5\u1FC6\u1FD4\u1FD6\u1FDC\u1FDD\u1FF0"
+ "\u1FF2\u1FF5\u1FF6\u1FFF\u207F\u2080\u20A8\u20A9\u2102\u2103\u2107\u2108\u210A\u2114\u2115\u211E\u2120\u2123\u2124\u2125\u2126\u2127\u2128\u2129\u212A\u2132\u2133\u2139\u24B6\u24EA\u3041\u3095\u309B\u309F\u30A1\u30FB\u30FC\u30FF\u3105\u312D\u3131\u318F\u3192\u31A0\u3260\u327C\u328A\u32B1\u32D0\u32FF\u3300\u3358\u3371\u3377\u337B\u3395\u3399\u339F\u33A9\u33AE\u33B0\u33C2\u33C3\u33C6\u33C7\u33D8\u33D9\u33DE\u4E00\u4E01\u9FA5\u9FA6\uAC00\uAC01\uD7A3\uD7A4\uF900" } ,
{ "_xmlW" , "\u0023\u0025\u0026\u0027\u002A\u002C\u0030\u003A\u003C\u003F\u0040\u005B\u005E\u007B\u007C\u007D\u007E\u007F\u00A2\u00AB\u00AC\u00AD\u00AE\u00B7\u00B8\u00BB\u00BC\u00BF\u00C0\u037E\u037F\u0387\u0388\u055A\u0560\u0589\u058A\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05F3\u05F5\u060C\u060D\u061B\u061C\u061F\u0620\u06D4\u06D5\u093D\u093E\u0970\u0971\u0ABD\u0ABE\u0B3D\u0B3E\u0EAF\u0EB0\u0F04\u0F13\u0F3A\u0F3E\u0F85\u0F86\u10FB\u10FC\u2000\u202F\u2030\u203D\u2045\u2047\u206A\u2070\u207D\u207F\u208D\u208F\u2329\u232B\u3000\u3004\u3005\u3007\u3008\u3012\u3014\u301D\u3030\u3031\u30FB\u30FC\uD800\uD801\uDB7F\uDB81\uDBFF\uDC01\uDFFF\uE001\uF8FF\uF900\uFD3E\uFD40\uFE30\uFE33\uFE35\uFE45\uFE50\uFE53\uFE54\uFE5F\uFE63\uFE64\uFE68\uFE69\uFE6A\uFE6B\uFEFF\uFF00\uFF01\uFF03\uFF05\uFF06\uFF07\uFF0A\uFF0C\uFF10\uFF1A\uFF1C\uFF1F\uFF20\uFF3B\uFF3E\uFF5B\uFF5C\uFF5D\uFF5E\uFF61\uFF66" } ,
{ "IsAlphabeticPresentationForms" , "\uFB00\uFB50" } ,
{ "IsArabic" , "\u0600\u0700" } ,
{ "IsArabicPresentationForms-A" , "\uFB50\uFE00" } ,
{ "IsArabicPresentationForms-B" , "\uFE70\uFEFF" } ,
{ "IsArmenian" , "\u0530\u0590" } ,
{ "IsArrows" , "\u2190\u2200" } ,
{ "IsBasicLatin" , "\u0000\u0080" } ,
{ "IsBengali" , "\u0980\u0A00" } ,
{ "IsBlockElements" , "\u2580\u25A0" } ,
{ "IsBopomofo" , "\u3100\u3130" } ,
{ "IsBopomofoExtended" , "\u31A0\u31C0" } ,
{ "IsBoxDrawing" , "\u2500\u2580" } ,
{ "IsBraillePatterns" , "\u2800\u2900" } ,
{ "IsCherokee" , "\u13A0\u1400" } ,
{ "IsCJKCompatibility" , "\u3300\u3400" } ,
{ "IsCJKCompatibilityForms" , "\uFE30\uFE50" } ,
{ "IsCJKCompatibilityIdeographs" , "\uF900\uFB00" } ,
{ "IsCJKRadicalsSupplement" , "\u2E80\u2F00" } ,
{ "IsCJKSymbolsandPunctuation" , "\u3000\u3040" } ,
{ "IsCJKUnifiedIdeographs" , "\u4E00\uA000" } ,
{ "IsCJKUnifiedIdeographsExtensionA" , "\u3400\u4DB6" } ,
{ "IsCombiningDiacriticalMarks" , "\u0300\u0370" } ,
{ "IsCombiningHalfMarks" , "\uFE20\uFE30" } ,
{ "IsCombiningMarksforSymbols" , "\u20D0\u2100" } ,
{ "IsControlPictures" , "\u2400\u2440" } ,
{ "IsCurrencySymbols" , "\u20A0\u20D0" } ,
{ "IsCyrillic" , "\u0400\u0500" } ,
{ "IsDevanagari" , "\u0900\u0980" } ,
{ "IsDingbats" , "\u2700\u27C0" } ,
{ "IsEnclosedAlphanumerics" , "\u2460\u2500" } ,
{ "IsEnclosedCJKLettersandMonths" , "\u3200\u3300" } ,
{ "IsEthiopic" , "\u1200\u1380" } ,
{ "IsGeneralPunctuation" , "\u2000\u2070" } ,
{ "IsGeometricShapes" , "\u25A0\u2600" } ,
{ "IsGeorgian" , "\u10A0\u1100" } ,
{ "IsGreek" , "\u0370\u0400" } ,
{ "IsGreekExtended" , "\u1F00\u2000" } ,
{ "IsGujarati" , "\u0A80\u0B00" } ,
{ "IsGurmukhi" , "\u0A00\u0A80" } ,
{ "IsHalfwidthandFullwidthForms" , "\uFF00\uFFF0" } ,
{ "IsHangulCompatibilityJamo" , "\u3130\u3190" } ,
{ "IsHangulJamo" , "\u1100\u1200" } ,
{ "IsHangulSyllables" , "\uAC00\uD7A4" } ,
{ "IsHebrew" , "\u0590\u0600" } ,
{ "IsHighPrivateUseSurrogates" , "\uDB80\uDC00" } ,
{ "IsHighSurrogates" , "\uD800\uDB80" } ,
{ "IsHiragana" , "\u3040\u30A0" } ,
{ "IsIdeographicDescriptionCharacters" , "\u2FF0\u3000" } ,
{ "IsIPAExtensions" , "\u0250\u02B0" } ,
{ "IsKanbun" , "\u3190\u31A0" } ,
{ "IsKangxiRadicals" , "\u2F00\u2FE0" } ,
{ "IsKannada" , "\u0C80\u0D00" } ,
{ "IsKatakana" , "\u30A0\u3100" } ,
{ "IsKhmer" , "\u1780\u1800" } ,
{ "IsLao" , "\u0E80\u0F00" } ,
{ "IsLatin-1Supplement" , "\u0080\u0100" } ,
{ "IsLatinExtended-A" , "\u0100\u0180" } ,
{ "IsLatinExtendedAdditional" , "\u1E00\u1F00" } ,
{ "IsLatinExtended-B" , "\u0180\u0250" } ,
{ "IsLetterlikeSymbols" , "\u2100\u2150" } ,
{ "IsLowSurrogates" , "\uDC00\uE000" } ,
{ "IsMalayalam" , "\u0D00\u0D80" } ,
{ "IsMathematicalOperators" , "\u2200\u2300" } ,
{ "IsMiscellaneousSymbols" , "\u2600\u2700" } ,
{ "IsMiscellaneousTechnical" , "\u2300\u2400" } ,
{ "IsMongolian" , "\u1800\u18B0" } ,
{ "IsMyanmar" , "\u1000\u10A0" } ,
{ "IsNumberForms" , "\u2150\u2190" } ,
{ "IsOgham" , "\u1680\u16A0" } ,
{ "IsOpticalCharacterRecognition" , "\u2440\u2460" } ,
{ "IsOriya" , "\u0B00\u0B80" } ,
{ "IsPrivateUse" , "\uE000\uF900" } ,
{ "IsRunic" , "\u16A0\u1700" } ,
{ "IsSinhala" , "\u0D80\u0E00" } ,
{ "IsSmallFormVariants" , "\uFE50\uFE70" } ,
{ "IsSpacingModifierLetters" , "\u02B0\u0300" } ,
{ "IsSpecials" , "\uFEFF\uFF00\uFFF0\uFFFE" } ,
{ "IsSuperscriptsandSubscripts" , "\u2070\u20A0" } ,
{ "IsSyriac" , "\u0700\u0750" } ,
{ "IsTamil" , "\u0B80\u0C00" } ,
{ "IsTelugu" , "\u0C00\u0C80" } ,
{ "IsThaana" , "\u0780\u07C0" } ,
{ "IsThai" , "\u0E00\u0E80" } ,
{ "IsTibetan" , "\u0F00\u1000" } ,
{ "IsUnifiedCanadianAboriginalSyllabics" , "\u1400\u1680" } ,
{ "IsYiRadicals" , "\uA490\uA4D0" } ,
{ "IsYiSyllables" , "\uA000\uA490" } ,
} ;
// SetFromProperty()
//
// Looks up capname in the first column of _propTable by binary search and
// returns the matching set string from the second column, or its inverse
// (via SetInverse) when invert is true.
//
//   capname - property name to look up; matched case-sensitively.
//   invert  - when true, the inverse of the stored set is returned.
//   pattern - currently unused; it is only mentioned in the commented-out
//             exception message below.
//
// Throws ArgumentException when capname is not present in the table.
internal static String SetFromProperty(String capname, bool invert, string pattern) {
    int min = 0;
    int max = _propTable.GetLength(0);

    while (min != max) {
        int mid = (min + max) / 2;

        // The table is a fixed, pre-sorted array, so the comparison used to
        // search it must not vary from machine to machine. The overload
        // without a CultureInfo compares using the current thread culture,
        // which made the outcome of the search depend on the user's locale;
        // pin it to the invariant culture instead.
        int res = String.Compare(capname, _propTable[mid, 0], false, CultureInfo.InvariantCulture);

        if (res < 0) {
            max = mid;
        }
        else if (res > 0) {
            min = mid + 1;
        }
        else {
            String set = _propTable[mid, 1];
            return invert ? SetInverse(set) : set;
        }
    }

    throw new ArgumentException("Unknown property"); //XXX: SR.GetString(SR.MakeException, pattern, SR.GetString(SR.UnknownProperty, capname)), pattern);
}
#if DBG
// SetDescription()
//
// Builds a bracketed, human-readable rendering of a set string.
// The Any set is rendered as "[^]" and the Empty set as "[]". Otherwise a
// leading Nullchar is rendered as '^', and the remaining characters are read
// two at a time: the first of each pair is shown as-is, the second is shown
// minus one, and a final unpaired character is paired with Lastchar.
internal static String SetDescription(String set) {
    if (set.Equals(Any)) {
        return "[^]";
    }
    if (set.Equals(Empty)) {
        return "[]";
    }

    StringBuilder text = new StringBuilder("[");

    int start = 0;
    if (set[0] == Nullchar) {
        text.Append('^');
        start = 1;
    }

    for (int pos = start; pos < set.Length; pos += 2) {
        char low = set[pos];
        char high = (pos + 1 < set.Length) ? (char)(set[pos + 1] - 1) : Lastchar;

        text.Append(CharDescription(low));

        // A one-character range needs nothing more.
        if (high == low) {
            continue;
        }

        // Two adjacent characters are listed side by side without a dash.
        if (low + 1 != high) {
            text.Append('-');
        }
        text.Append(CharDescription(high));
    }

    text.Append(']');
    return text.ToString();
}
// Lower-case hexadecimal digits, indexed by nibble value.
internal static readonly char[] Hex = new char[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };

// CharDescription()
//
// Renders one character readably: a backslash becomes two backslashes,
// characters from ' ' through '~' are returned unchanged, characters below
// 256 become \x followed by two hex digits, and everything else becomes \u
// followed by four hex digits.
internal static String CharDescription(char ch) {
    if (ch == '\\') {
        return "\\\\";
    }

    if (ch >= ' ' && ch <= '~') {
        return ch.ToString();
    }

    bool narrow = ch < 256;
    StringBuilder text = new StringBuilder(narrow ? "\\x" : "\\u");

    // Emit the nibbles from most significant to least significant.
    int topShift = narrow ? 4 : 12;
    for (int shift = topShift; shift >= 0; shift -= 4) {
        text.Append(Hex[(ch >> shift) & 0xF]);
    }

    return text.ToString();
}
#endif
}
}