singrdk/base/Applications/Runtime/Full/System/Text/RegularExpressions/regexcharclass.cs

1202 lines
53 KiB
C#
Raw Normal View History

2008-03-05 09:52:00 -05:00
//------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
//------------------------------------------------------------------------------
// This RegexCharClass class provides the "set of Unicode chars" functionality
// used by the regexp engine.
//
// RegexCharClass supports a "string representation" of a character class.
// The string representation is NOT human-readable. It is a sequence of
// strictly increasing Unicode characters that begin ranges of characters
// that are alternately included in and excluded from the class.
//
// Membership of a character in the class can be determined by binary
// searching the string representation and determining if the including
// range is at an even or odd index.
//
// The RegexCharClass class itself is a builder class. One can add char ranges
// or sets or invert the class; then, the class can be converted to its
// string representation via RegexCharClass.ToSet().
//
#define ECMA
2008-11-17 18:29:00 -05:00
namespace System.Text.RegularExpressions
{
2008-03-05 09:52:00 -05:00
using System.Collections;
using System.Globalization;
using System.Diagnostics;
internal sealed class RegexCharClass {
// Lowest and highest char values; Lastchar is the implicit end of an open-ended final range.
internal const char Nullchar = '\0';
internal const char Lastchar = '\uFFFF';
// Set strings: "\0" opens a single range at 0 that is never closed (every char); "" has no ranges.
internal const String Any = "\0";
internal const String Empty = "";
// Marker char that brackets a group of category codes inside a category string.
internal const char GroupChar = (char) 0;
internal static readonly RegexCharClass AnyClass = new RegexCharClass("\0");
internal static readonly RegexCharClass EmptyClass = new RegexCharClass(String.Empty);
// Category strings for word / non-word characters; both are assigned by the static constructor.
internal static readonly String Word;
internal static readonly String NotWord;
// Special category codes that CharInCategory resolves with Char.IsWhiteSpace instead of a
// UnicodeCategory comparison.
internal const short SpaceConst = 100;
internal const short NotSpaceConst = -100;
internal static readonly String Space = ((char) SpaceConst).ToString();
internal static readonly String NotSpace = NegateCategory(Space);
2008-11-17 18:29:00 -05:00
2008-03-05 09:52:00 -05:00
// ECMA set strings. Each "Not" variant is the same boundary list with a leading '\0',
// which flips which ranges are included.
internal const String ECMASpace = "\u0009\u000E\u0020\u0021";
internal const String NotECMASpace = "\0\u0009\u000E\u0020\u0021";
internal const String ECMAWord = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
internal const String NotECMAWord = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
internal const String ECMADigit = "\u0030\u003A";
internal const String NotECMADigit = "\0\u0030\u003A";
// Category name ("Lu", "L", ...) -> category string; populated by the static constructor.
internal static Hashtable _definedCategories;
// SingleRange objects accumulated so far.
internal ArrayList _rangelist;
// Category codes accumulated so far.
internal StringBuilder _categories;
// False once _rangelist may be unsorted or overlapping; ToSet() canonicalizes in that case.
internal bool _canonical;
// When set, ToSet() prefixes the set string with two Nullchars.
internal bool _negate;
2008-11-17 18:29:00 -05:00
2008-03-05 09:52:00 -05:00
static RegexCharClass() {
    // Type initializer: fills _definedCategories (category name -> category string)
    // and builds the Word / NotWord category strings.
    //
    // Fix: "Pi" is now registered with InitialQuotePunctuation and "Pf" with
    // FinalQuotePunctuation. The two were previously swapped, so \p{Pi} matched
    // final quotes and \p{Pf} matched initial quotes.
    _definedCategories = new Hashtable(31);

    // Scratch buffer reused for every major category. Slot 0 is always GroupChar;
    // the slot after the last member must also be GroupChar so that
    // new String(groups, 0, n) yields a GroupChar-delimited group.
    char[] groups = new char[9];
    StringBuilder word = new StringBuilder(11);

    word.Append(GroupChar);
    groups[0] = GroupChar;

    // We need the UnicodeCategory enum values as a char so we can put them in a string
    // in the hashtable. In order to get there, we first must cast to an int,
    // then cast to a char.
    // Also need to distinguish between positive and negative values. UnicodeCategory is zero
    // based, so we add one to each value and subtract it off later.

    // Others
    groups[1] = (char) ((int) UnicodeCategory.Control + 1);
    _definedCategories["Cc"] = groups[1].ToString();     // Control
    groups[2] = (char) ((int) UnicodeCategory.Format + 1);
    _definedCategories["Cf"] = groups[2].ToString();     // Format
    groups[3] = (char) ((int) UnicodeCategory.OtherNotAssigned + 1);
    _definedCategories["Cn"] = groups[3].ToString();     // Not assigned
    groups[4] = (char) ((int) UnicodeCategory.PrivateUse + 1);
    _definedCategories["Co"] = groups[4].ToString();     // Private use
    groups[5] = (char) ((int) UnicodeCategory.Surrogate + 1);
    _definedCategories["Cs"] = groups[5].ToString();     // Surrogate
    groups[6] = GroupChar;
    _definedCategories["C"] = new String(groups, 0, 7);

    // Letters
    groups[1] = (char) ((int) UnicodeCategory.LowercaseLetter + 1);
    _definedCategories["Ll"] = groups[1].ToString();     // Lowercase
    groups[2] = (char) ((int) UnicodeCategory.ModifierLetter + 1);
    _definedCategories["Lm"] = groups[2].ToString();     // Modifier
    groups[3] = (char) ((int) UnicodeCategory.OtherLetter + 1);
    _definedCategories["Lo"] = groups[3].ToString();     // Other
    groups[4] = (char) ((int) UnicodeCategory.TitlecaseLetter + 1);
    _definedCategories["Lt"] = groups[4].ToString();     // Titlecase
    groups[5] = (char) ((int) UnicodeCategory.UppercaseLetter + 1);
    _definedCategories["Lu"] = groups[5].ToString();     // Uppercase
    // groups[6] still holds the GroupChar written for the "C" group above.
    _definedCategories["L"] = new String(groups, 0, 7);
    // Word characters: Ll, then Lo/Lt/Lu (slots 3..5); Lm is not included.
    word.Append(groups[1]);
    word.Append(new String(groups, 3, 3));

    // Marks (none of these are added to Word)
    groups[1] = (char) ((int) UnicodeCategory.SpacingCombiningMark + 1);
    _definedCategories["Mc"] = groups[1].ToString();     // Spacing combining
    groups[2] = (char) ((int) UnicodeCategory.EnclosingMark + 1);
    _definedCategories["Me"] = groups[2].ToString();     // Enclosing
    groups[3] = (char) ((int) UnicodeCategory.NonSpacingMark + 1);
    _definedCategories["Mn"] = groups[3].ToString();     // Non-spacing
    groups[4] = GroupChar;
    _definedCategories["M"] = new String(groups, 0, 5);

    // Numbers
    groups[1] = (char) ((int) UnicodeCategory.DecimalDigitNumber + 1);
    _definedCategories["Nd"] = groups[1].ToString();     // Decimal digit
    groups[2] = (char) ((int) UnicodeCategory.LetterNumber + 1);
    _definedCategories["Nl"] = groups[2].ToString();     // Letter
    groups[3] = (char) ((int) UnicodeCategory.OtherNumber + 1);
    _definedCategories["No"] = groups[3].ToString();     // Other
    // groups[4] still holds the GroupChar written for the "M" group above.
    _definedCategories["N"] = new String(groups, 0, 5);
    // Only decimal digits count as word characters.
    word.Append(groups[1]);

    // Punctuation
    groups[1] = (char) ((int) UnicodeCategory.ConnectorPunctuation + 1);
    _definedCategories["Pc"] = groups[1].ToString();     // Connector
    groups[2] = (char) ((int) UnicodeCategory.DashPunctuation + 1);
    _definedCategories["Pd"] = groups[2].ToString();     // Dash
    groups[3] = (char) ((int) UnicodeCategory.ClosePunctuation + 1);
    _definedCategories["Pe"] = groups[3].ToString();     // Close
    groups[4] = (char) ((int) UnicodeCategory.OtherPunctuation + 1);
    _definedCategories["Po"] = groups[4].ToString();     // Other
    groups[5] = (char) ((int) UnicodeCategory.OpenPunctuation + 1);
    _definedCategories["Ps"] = groups[5].ToString();     // Open
    groups[6] = (char) ((int) UnicodeCategory.InitialQuotePunctuation + 1);
    _definedCategories["Pi"] = groups[6].ToString();     // Initial quote
    groups[7] = (char) ((int) UnicodeCategory.FinalQuotePunctuation + 1);
    _definedCategories["Pf"] = groups[7].ToString();     // Final quote
    groups[8] = GroupChar;
    _definedCategories["P"] = new String(groups, 0, 9);
    // Connector punctuation (e.g. '_') counts as a word character.
    word.Append(groups[1]);

    // Symbols
    groups[1] = (char) ((int) UnicodeCategory.CurrencySymbol + 1);
    _definedCategories["Sc"] = groups[1].ToString();     // Currency
    groups[2] = (char) ((int) UnicodeCategory.ModifierSymbol + 1);
    _definedCategories["Sk"] = groups[2].ToString();     // Modifier
    groups[3] = (char) ((int) UnicodeCategory.MathSymbol + 1);
    _definedCategories["Sm"] = groups[3].ToString();     // Math
    groups[4] = (char) ((int) UnicodeCategory.OtherSymbol + 1);
    _definedCategories["So"] = groups[4].ToString();     // Other
    groups[5] = GroupChar;
    _definedCategories["S"] = new String(groups, 0, 6);

    // Separators
    groups[1] = (char) ((int) UnicodeCategory.LineSeparator + 1);
    _definedCategories["Zl"] = groups[1].ToString();     // Line
    groups[2] = (char) ((int) UnicodeCategory.ParagraphSeparator + 1);
    _definedCategories["Zp"] = groups[2].ToString();     // Paragraph
    groups[3] = (char) ((int) UnicodeCategory.SpaceSeparator + 1);
    _definedCategories["Zs"] = groups[3].ToString();     // Space
    groups[4] = GroupChar;
    _definedCategories["Z"] = new String(groups, 0, 5);

    // Close the word group and derive its negation.
    word.Append(GroupChar);
    Word = word.ToString();
    NotWord = NegateCategory(Word);

#if DBG
    // make sure the _propTable is correctly ordered
    int len = _propTable.GetLength(0);
    for (int i = 0; i < len - 1; i++)
        Debug.Assert(String.Compare(_propTable[i,0], _propTable[i+1,0], false, CultureInfo.InvariantCulture) < 0, "RegexCharClass _propTable is out of order at (" + _propTable[i,0] +", " + _propTable[i+1,0] + ")");
#endif
}
// RegexCharClass()
//
// Builds a class with no ranges and no categories. A class with
// nothing in it is trivially canonical.
internal RegexCharClass() {
    _categories = new StringBuilder();
    _canonical = true;
    _rangelist = new ArrayList(6);
}
// RegexCharClass(String)
//
// Builds a class from a set string (alternating range boundaries).
// The range list is presized to one slot per boundary pair, then the
// set is loaded through AddSet().
internal RegexCharClass(String set) {
    int expectedRanges = (set.Length + 1) / 2;

    _categories = new StringBuilder();
    _canonical = true;
    _rangelist = new ArrayList(expectedRanges);

    AddSet(set);
}
// RegexCharClass(char, char)
//
// Builds a class that holds exactly one range, [first, last].
internal RegexCharClass(char first, char last) {
    SingleRange only = new SingleRange(first, last);

    _categories = new StringBuilder();
    _canonical = true;
    _rangelist = new ArrayList(1);
    _rangelist.Add(only);
}
// CreateFromCategory()
//
// Factory: returns a fresh class to which the named category has been
// added via AddCategoryFromName() with the same arguments.
internal static RegexCharClass CreateFromCategory(string categoryName, bool invert, bool caseInsensitive, string pattern) {
    RegexCharClass result = new RegexCharClass();

    result.AddCategoryFromName(categoryName, invert, caseInsensitive, pattern);

    return result;
}
// AddCharClass()
//
// Merges another class into this one: appends copies of all of cc's
// ranges and all of its category codes.
//
// This class stays flagged canonical only if cc is itself canonical and
// its FIRST range starts after our last range ends. (The previous check
// compared cc's last range with our last range, which let an unsorted or
// overlapping result stay flagged canonical, so ToSet() skipped
// Canonicalize().)
internal void AddCharClass(RegexCharClass cc) {
    // Snapshot the count so the loop terminates even if cc is this instance.
    int incoming = cc.RangeCount();

    if (!cc._canonical) {
        // Appending unsorted/overlapping ranges makes us non-canonical too.
        _canonical = false;
    }
    else if (_canonical && RangeCount() > 0 && incoming > 0 &&
             cc.Range(0)._first <= Range(RangeCount() - 1)._last) {
        _canonical = false;
    }

    for (int i = 0; i < incoming; i++) {
        // SingleRange is mutable (Canonicalize and AddLowercase write to it),
        // so store a copy rather than sharing the instance with cc.
        SingleRange source = cc.Range(i);
        _rangelist.Add(new SingleRange(source._first, source._last));
    }

    _categories.Append(cc._categories.ToString());
}
// AddSet()
//
// Appends the ranges described by a set string. Boundaries are consumed
// in pairs (inclusive start, exclusive end); an unpaired trailing
// boundary opens a range that runs through Lastchar.
internal void AddSet(String set) {
    // If the incoming set starts at or before the end of our last range,
    // the list can no longer be assumed sorted and disjoint.
    if (_canonical && RangeCount() > 0 && set.Length > 0) {
        char currentEnd = Range(RangeCount() - 1)._last;
        if (set[0] <= currentEnd) {
            _canonical = false;
        }
    }

    int pos = 0;
    while (pos + 1 < set.Length) {
        char start = set[pos];
        char endExclusive = set[pos + 1];
        _rangelist.Add(new SingleRange(start, (char)(endExclusive - 1)));
        pos += 2;
    }

    if (pos < set.Length) {
        _rangelist.Add(new SingleRange(set[pos], Lastchar));
    }
}
// AddRange()
//
// Appends the single range [first, last] to the class.
//
// The canonical flag is cleared when the new range starts at or before
// the end of the current last range, or directly abuts it, because
// Canonicalize() would then have to reorder or merge. The check runs
// BEFORE the range is appended. Previously the range was appended first,
// so "first" was compared with its own "last" and every call cleared the
// flag, forcing a redundant sort/merge on the next ToSet().
internal void AddRange(char first, char last) {
    if (_canonical && _rangelist.Count > 0) {
        SingleRange tail = (SingleRange)_rangelist[_rangelist.Count - 1];

        // tail._last + 1 is evaluated as int, so this is safe at Lastchar.
        if (first <= tail._last + 1) {
            _canonical = false;
        }
    }

    _rangelist.Add(new SingleRange(first, last));
}
// Category
//
// The category codes accumulated so far, as a string. The _negate flag
// is not applied to the returned value.
internal string Category {
    get {
        string accumulated = _categories.ToString();
        return accumulated;
    }
}
// Negate
//
// Write-only flag; when true, ToSet() emits the negation prefix.
internal bool Negate {
    set {
        _negate = value;
    }
}
// AddCategoryFromName()
//
// Adds a named category. Names found in _definedCategories contribute
// category codes; any other name is resolved through SetFromProperty()
// and added as a set of ranges.
internal void AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern) {
    object known = _definedCategories[categoryName];

    if (known == null) {
        AddSet(SetFromProperty(categoryName, invert, pattern));
        return;
    }

    string codes = (string) known;

    // Under case-insensitive matching, Lu and Lt are replaced by Ll.
    if (caseInsensitive && (categoryName.Equals("Lu") || categoryName.Equals("Lt"))) {
        codes = (string) _definedCategories["Ll"];
    }

    if (invert) {
        codes = NegateCategory(codes);
    }

    _categories.Append(codes);
}
// AddCategory()
//
// Appends already-encoded category codes to this class.
internal void AddCategory(string category) {
    StringBuilder target = _categories;
    target.Append(category);
}
2008-11-17 18:29:00 -05:00
////////////////////////////////////////////////////////////////////////////
// Let U be the set of Unicode character values and let L be the lowercase
// function, mapping from U to U. To perform case insensitive matching of
// character sets, we need to be able to map an interval I in U, say
//
// I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
//
// to a set A such that A contains L(I) and A is contained in the union of
// I and L(I).
//
// The table below partitions U into intervals on which L is non-decreasing.
// Thus, for any interval J = [a, b] contained in one of these intervals,
// L(J) is contained in [L(a), L(b)].
//
// It is also true that for any such J, [L(a), L(b)] is contained in the
// union of J and L(J). This does not follow from L being non-decreasing on
// these intervals. It follows from the nature of the L on each interval.
// On each interval, L has one of the following forms:
//
// (1) L(ch) = constant (LowercaseSet)
// (2) L(ch) = ch + offset (LowercaseAdd)
// (3) L(ch) = ch | 1 (LowercaseBor)
// (4) L(ch) = ch + (ch & 1) (LowercaseBad)
//
// It is easy to verify that for any of these forms [L(a), L(b)] is
// contained in the union of [a, b] and L([a, b]).
//////////////////////////////////////////////////////////////////////////
2008-03-05 09:52:00 -05:00
// Operation codes stored in LC._lcOp; AddLowercaseImpl() switches on them to
// map the ends of a range to lowercase.
internal const int LowercaseSet = 0; // Set to arg.
internal const int LowercaseAdd = 1; // Add arg.
internal const int LowercaseBor = 2; // Bitwise or with 1.
internal const int LowercaseBad = 3; // Bitwise and with 1 and add original.
// LC
//
// One row of the lowercase mapping table: the interval [_chMin, _chMax],
// the operation to apply on it (one of the Lowercase* constants), and
// that operation's argument.
private sealed class LC {
    internal char _chMin;
    internal char _chMax;
    internal int _lcOp;
    internal int _data;

    internal LC(char chMin, char chMax, int lcOp, int data) {
        this._chMin = chMin;
        this._chMax = chMax;
        this._lcOp = lcOp;
        this._data = data;
    }
}
// Lowercase mapping table. Each row gives an interval of chars, the
// operation that maps it to lowercase, and the operation's argument.
// AddLowercaseImpl() binary-searches this table on _chMax, so rows must
// stay sorted by interval and must not overlap.
private static readonly LC[] _lcTable = new LC[]
{
new LC('\u0041', '\u005A', LowercaseAdd, 32),
new LC('\u00C0', '\u00DE', LowercaseAdd, 32),
new LC('\u0100', '\u012E', LowercaseBor, 0),
new LC('\u0130', '\u0130', LowercaseSet, 0x0069),
new LC('\u0132', '\u0136', LowercaseBor, 0),
new LC('\u0139', '\u0147', LowercaseBad, 0),
new LC('\u014A', '\u0176', LowercaseBor, 0),
new LC('\u0178', '\u0178', LowercaseSet, 0x00FF),
new LC('\u0179', '\u017D', LowercaseBad, 0),
new LC('\u0181', '\u0181', LowercaseSet, 0x0253),
new LC('\u0182', '\u0184', LowercaseBor, 0),
new LC('\u0186', '\u0186', LowercaseSet, 0x0254),
new LC('\u0187', '\u0187', LowercaseSet, 0x0188),
new LC('\u0189', '\u018A', LowercaseAdd, 205),
new LC('\u018B', '\u018B', LowercaseSet, 0x018C),
new LC('\u018E', '\u018F', LowercaseAdd, 202),
new LC('\u0190', '\u0190', LowercaseSet, 0x025B),
new LC('\u0191', '\u0191', LowercaseSet, 0x0192),
new LC('\u0193', '\u0193', LowercaseSet, 0x0260),
new LC('\u0194', '\u0194', LowercaseSet, 0x0263),
new LC('\u0196', '\u0196', LowercaseSet, 0x0269),
new LC('\u0197', '\u0197', LowercaseSet, 0x0268),
new LC('\u0198', '\u0198', LowercaseSet, 0x0199),
new LC('\u019C', '\u019C', LowercaseSet, 0x026F),
new LC('\u019D', '\u019D', LowercaseSet, 0x0272),
new LC('\u01A0', '\u01A4', LowercaseBor, 0),
new LC('\u01A7', '\u01A7', LowercaseSet, 0x01A8),
new LC('\u01A9', '\u01A9', LowercaseSet, 0x0283),
new LC('\u01AC', '\u01AC', LowercaseSet, 0x01AD),
new LC('\u01AE', '\u01AE', LowercaseSet, 0x0288),
new LC('\u01AF', '\u01AF', LowercaseSet, 0x01B0),
new LC('\u01B1', '\u01B2', LowercaseAdd, 217),
new LC('\u01B3', '\u01B5', LowercaseBad, 0),
new LC('\u01B7', '\u01B7', LowercaseSet, 0x0292),
new LC('\u01B8', '\u01B8', LowercaseSet, 0x01B9),
new LC('\u01BC', '\u01BC', LowercaseSet, 0x01BD),
new LC('\u01C4', '\u01C5', LowercaseSet, 0x01C6),
new LC('\u01C7', '\u01C8', LowercaseSet, 0x01C9),
new LC('\u01CA', '\u01CB', LowercaseSet, 0x01CC),
new LC('\u01CD', '\u01DB', LowercaseBad, 0),
new LC('\u01DE', '\u01EE', LowercaseBor, 0),
new LC('\u01F1', '\u01F2', LowercaseSet, 0x01F3),
new LC('\u01F4', '\u01F4', LowercaseSet, 0x01F5),
new LC('\u01FA', '\u0216', LowercaseBor, 0),
new LC('\u0386', '\u0386', LowercaseSet, 0x03AC),
new LC('\u0388', '\u038A', LowercaseAdd, 37),
new LC('\u038C', '\u038C', LowercaseSet, 0x03CC),
new LC('\u038E', '\u038F', LowercaseAdd, 63),
new LC('\u0391', '\u03AB', LowercaseAdd, 32),
new LC('\u03E2', '\u03EE', LowercaseBor, 0),
new LC('\u0401', '\u040F', LowercaseAdd, 80),
new LC('\u0410', '\u042F', LowercaseAdd, 32),
new LC('\u0460', '\u0480', LowercaseBor, 0),
new LC('\u0490', '\u04BE', LowercaseBor, 0),
new LC('\u04C1', '\u04C3', LowercaseBad, 0),
new LC('\u04C7', '\u04C7', LowercaseSet, 0x04C8),
new LC('\u04CB', '\u04CB', LowercaseSet, 0x04CC),
new LC('\u04D0', '\u04EA', LowercaseBor, 0),
new LC('\u04EE', '\u04F4', LowercaseBor, 0),
new LC('\u04F8', '\u04F8', LowercaseSet, 0x04F9),
new LC('\u0531', '\u0556', LowercaseAdd, 48),
new LC('\u10A0', '\u10C5', LowercaseAdd, 48),
new LC('\u1E00', '\u1EF8', LowercaseBor, 0),
new LC('\u1F08', '\u1F0F', LowercaseAdd, -8),
new LC('\u1F18', '\u1F1F', LowercaseAdd, -8),
new LC('\u1F28', '\u1F2F', LowercaseAdd, -8),
new LC('\u1F38', '\u1F3F', LowercaseAdd, -8),
new LC('\u1F48', '\u1F4D', LowercaseAdd, -8),
new LC('\u1F59', '\u1F59', LowercaseSet, 0x1F51),
new LC('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53),
new LC('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55),
new LC('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57),
new LC('\u1F68', '\u1F6F', LowercaseAdd, -8),
new LC('\u1F88', '\u1F8F', LowercaseAdd, -8),
new LC('\u1F98', '\u1F9F', LowercaseAdd, -8),
new LC('\u1FA8', '\u1FAF', LowercaseAdd, -8),
new LC('\u1FB8', '\u1FB9', LowercaseAdd, -8),
new LC('\u1FBA', '\u1FBB', LowercaseAdd, -74),
new LC('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3),
new LC('\u1FC8', '\u1FCB', LowercaseAdd, -86),
new LC('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3),
new LC('\u1FD8', '\u1FD9', LowercaseAdd, -8),
new LC('\u1FDA', '\u1FDB', LowercaseAdd, -100),
new LC('\u1FE8', '\u1FE9', LowercaseAdd, -8),
new LC('\u1FEA', '\u1FEB', LowercaseAdd, -112),
new LC('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5),
new LC('\u1FF8', '\u1FF9', LowercaseAdd, -128),
new LC('\u1FFA', '\u1FFB', LowercaseAdd, -126),
new LC('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3),
new LC('\u2160', '\u216F', LowercaseAdd, 16),
new LC('\u24B6', '\u24D0', LowercaseAdd, 26),
new LC('\uFF21', '\uFF3A', LowercaseAdd, 32),
};
// AddLowercase()
//
// Case-insensitivity support. Every range present on entry is visited
// once: a single-character range is rewritten in place to its lowercase
// form, while a wider range gets its lowercase images appended through
// AddLowercaseImpl(). Ranges appended during the walk are not revisited.
// The class is marked non-canonical up front.
internal void AddLowercase(CultureInfo culture) {
    _canonical = false;

    int initialCount = _rangelist.Count;
    for (int index = 0; index < initialCount; index++) {
        SingleRange current = (SingleRange)_rangelist[index];

        if (current._first != current._last) {
            AddLowercaseImpl(current._first, current._last, culture);
            continue;
        }

        char lowered = Char.ToLower(current._first);
        current._last = lowered;
        current._first = lowered;
    }
}
// AddLowercaseImpl()
//
// For one range [chMin, chMax] already in the class, appends whatever
// extra ranges are needed so that the lowercase images of its characters
// are included as well. The images are computed from _lcTable.
internal void AddLowercaseImpl(char chMin, char chMax, CultureInfo culture) {
    // Single character: lowercase it directly.
    if (chMin == chMax) {
        char lowered = Char.ToLower(chMin);
        if (lowered != chMax) {
            AddRange(lowered, lowered);
        }
        return;
    }

    // Binary search for the first table row whose interval ends at or after chMin.
    int low = 0;
    int high = _lcTable.Length;
    while (low < high) {
        int middle = (low + high) / 2;
        if (_lcTable[middle]._chMax < chMin) {
            low = middle + 1;
        }
        else {
            high = middle;
        }
    }

    // Walk every row that intersects [chMin, chMax].
    for (int row = low; row < _lcTable.Length; row++) {
        LC entry = _lcTable[row];
        if (entry._chMin > chMax) {
            break;
        }

        // Clip the row's interval to the requested range.
        char mappedMin = entry._chMin < chMin ? chMin : entry._chMin;
        char mappedMax = entry._chMax > chMax ? chMax : entry._chMax;

        // Map both ends of the clipped interval to lowercase.
        int op = entry._lcOp;
        if (op == LowercaseSet) {
            mappedMin = (char)entry._data;
            mappedMax = (char)entry._data;
        }
        else if (op == LowercaseAdd) {
            mappedMin += (char)entry._data;
            mappedMax += (char)entry._data;
        }
        else if (op == LowercaseBor) {
            mappedMin |= (char)1;
            mappedMax |= (char)1;
        }
        else if (op == LowercaseBad) {
            mappedMin += (char)(mappedMin & 1);
            mappedMax += (char)(mappedMax & 1);
        }

        // Only add the image if it reaches outside the original range.
        bool reachesOutside = mappedMin < chMin || mappedMax > chMax;
        if (reachesOutside) {
            AddRange(mappedMin, mappedMax);
        }
    }
}
// ToSet()
//
// Produces the set string for this class, canonicalizing first if
// needed. Each range contributes its first char and, unless it ends at
// Lastchar, the char just past its end. A negated class is prefixed
// with two Nullchars.
internal String ToSet() {
    if (!_canonical) {
        Canonicalize();
    }

    int boundaryCount = _rangelist.Count * 2;
    StringBuilder result;

    if (!_negate) {
        result = new StringBuilder(boundaryCount);
    }
    else {
        result = new StringBuilder(boundaryCount + 2);
        result.Append(Nullchar);
        result.Append(Nullchar);
    }

    for (int index = 0; index < _rangelist.Count; index++) {
        SingleRange current = (SingleRange)_rangelist[index];

        result.Append(current._first);
        if (current._last != Lastchar) {
            result.Append((char)(current._last + 1));
        }
    }

    return result.ToString();
}
// ToSetCi()
//
// Same as ToSet(), except that lowercase equivalents are added first
// when case-insensitive matching is requested.
internal String ToSetCi(bool caseInsensitive, CultureInfo culture) {
    if (caseInsensitive) {
        AddLowercase(culture);
    }

    String result = ToSet();
    return result;
}
// SetSize()
//
// Counts the characters covered by a set string: the width of every
// closed boundary pair plus, for an unpaired final boundary, everything
// from that char up to 0xFFFF inclusive.
internal static int SetSize(String set) {
    int total = 0;
    int pos = 0;

    while (pos + 1 < set.Length) {
        total += set[pos + 1] - set[pos];
        pos += 2;
    }

    if (pos < set.Length) {
        total += 0x10000 - set[pos];
    }

    return total;
}
// SetInverse()
//
// Complements a set string without building a class: a leading Nullchar
// boundary is removed if present, otherwise one is prepended.
internal static String SetInverse(String set) {
    bool startsAtNull = set.Length != 0 && set[0] == Nullchar;

    if (!startsAtNull) {
        return Any + set;
    }

    return set.Length == 1 ? Empty : set.Substring(1, set.Length - 1);
}
// SetUnion()
//
// Builds the union of two string representations of a class directly,
// by merging their boundary lists in a single pass. Returns one of the
// inputs unchanged when the other is Empty, when either is Any, or when
// both are equal.
//
// Inside the merge, setI/setJ (and their cursors i/j) are swapped freely,
// so after the first swap the names no longer identify the caller's
// arguments.
internal static String SetUnion(String setI, String setJ) {
int i;
int j;
int s;
String swap;
StringBuilder sb;
char chExc;
// Trivial cases: nothing to merge.
if (setI.Equals(Empty) || setJ.Equals(Any))
return setJ;
if (setJ.Equals(Empty) || setI.Equals(Any))
return setI;
if (setI == setJ)
return setI;
i = 0;
j = 0;
sb = new StringBuilder(setI.Length + setJ.Length);
for (;;) {
// One side exhausted: the rest of the other side is copied as-is.
if (j == setJ.Length) {
sb.Append(setI, i, setI.Length - i);
break;
}
if (i == setI.Length) {
sb.Append(setJ, j, setJ.Length - j);
break;
}
// Make setJ the side whose next boundary is the smaller one.
if (setJ[j] > setI[i]) {
s = i;
i = j;
j = s;
swap = setI;
setI = setJ;
setJ = swap;
}
// Emit the start of the merged range.
sb.Append(setJ[j++]);
// No closing boundary: the range runs to the last char, so nothing
// else can add to the union.
if (j == setJ.Length)
break;
// chExc is the candidate (exclusive) end of the merged range.
chExc = setJ[j++];
for (;;) {
// Skip every boundary of the other side that falls at or before chExc.
while (i < setI.Length && setI[i] <= chExc)
i++;
// Even cursor: the other side is outside any range at chExc, so the
// merged range really ends here.
if ((i & 0x1) == 0) {
sb.Append(chExc);
goto OuterContinue;
}
else {
// Odd cursor: the other side is inside a range that covers chExc.
// If that range is open-ended the merged range is too; otherwise
// its end becomes the new candidate.
if (i == setI.Length)
goto OuterBreak;
chExc = setI[i++];
}
// Swap roles and keep extending the merged range from the other side.
s = i;
i = j;
j = s;
swap = setI;
setI = setJ;
setJ = swap;
}
2008-11-17 18:29:00 -05:00
OuterContinue:
2008-03-05 09:52:00 -05:00
;
}
2008-11-17 18:29:00 -05:00
OuterBreak:
2008-03-05 09:52:00 -05:00
;
return sb.ToString();
}
// CategoryUnion()
//
// The union of two category strings is simply their concatenation.
internal static String CategoryUnion(string catI, string catJ) {
    return String.Concat(catI, catJ);
}
2008-11-17 18:29:00 -05:00
2008-03-05 09:52:00 -05:00
// SetFromChar()
//
// Set string for a class holding exactly one character: the character
// itself followed by its successor. For Lastchar there is no successor,
// so the string is the single boundary.
internal static String SetFromChar(char ch) {
    if (ch == Lastchar) {
        return ch.ToString();
    }

    char[] boundaries = new char[] { ch, (char)(ch + 1) };
    return new String(boundaries);
}
// SetInverseFromChar()
//
// Set string for a class holding every character except ch: a range
// from Nullchar up to ch (omitted when ch is Nullchar), then an
// open-ended range starting just after ch (omitted when ch is Lastchar).
internal static String SetInverseFromChar(char ch) {
    StringBuilder pieces = new StringBuilder(3);
    bool isLowest = ch == Nullchar;
    bool isHighest = ch == Lastchar;

    if (!isLowest) {
        pieces.Append(Nullchar);
        pieces.Append(ch);
    }

    if (!isHighest) {
        pieces.Append((char)(ch + 1));
    }

    return pieces.ToString();
}
// IsSingleton()
//
// True when the set string is one closed range exactly one char wide.
internal static bool IsSingleton(String set) {
    if (set.Length != 2) {
        return false;
    }

    return set[1] - set[0] == 1;
}
// SingletonChar()
//
// Returns the first boundary of the set string, which for a singleton
// set is its only member.
internal static char SingletonChar(String set) {
    char only = set[0];
    return only;
}
// IsECMAWordChar()
//
// Tests ch against the ECMAWord set, with no category codes.
internal static bool IsECMAWordChar(char ch) {
    bool isWord = CharInSet(ch, ECMAWord, String.Empty);
    return isWord;
}
// IsWordChar()
//
// Tests ch against the Word category string.
internal static bool IsWordChar(char ch) {
    bool isWord = CharInCategory(ch, Word);
    return isWord;
}
// CharInSet()
//
// Membership test that honours the negation prefix: when the set string
// begins with two zero chars, the result of the raw lookup is inverted.
internal static bool CharInSet(char ch, String set, String category) {
    bool member = CharInSetInternal(ch, set, category);
    bool negated = set.Length >= 2 && set[0] == 0 && set[1] == 0;

    return negated ? !member : member;
}
// CharInSetInternal()
//
// Raw membership test. A binary search counts how many boundaries in
// the set string are <= ch; an odd count means ch lies inside an
// included range. Otherwise the category codes decide.
internal static bool CharInSetInternal(char ch, string set, String category) {
    int low = 0;
    int high = set.Length;

    while (low < high) {
        int middle = (low + high) / 2;
        if (ch < set[middle]) {
            high = middle;
        }
        else {
            low = middle + 1;
        }
    }

    bool insideRange = (low & 0x1) != 0;
    return insideRange || CharInCategory(ch, category);
}
// CharInCategory()
//
// Tests ch against a category string, one code at a time, and returns
// true on the first code that accepts it:
//   0                 starts a group, delegated to CharInCategoryGroup()
//   SpaceConst        accepts whitespace
//   NotSpaceConst     accepts non-whitespace
//   other positive n  accepts UnicodeCategory (n - 1)
//   other negative n  accepts anything except UnicodeCategory (-n - 1)
internal static bool CharInCategory(char ch, string category) {
    if (category.Length == 0) {
        return false;
    }

    UnicodeCategory actual = char.GetUnicodeCategory(ch);

    for (int i = 0; i < category.Length; i++) {
        int code = (short) category[i];

        if (code == 0) {
            // The group helper advances i to the group's closing marker.
            if (CharInCategoryGroup(ch, actual, category, ref i)) {
                return true;
            }
            continue;
        }

        if (code == SpaceConst) {
            if (Char.IsWhiteSpace(ch)) {
                return true;
            }
            continue;
        }

        if (code == NotSpaceConst) {
            if (!Char.IsWhiteSpace(ch)) {
                return true;
            }
            continue;
        }

        if (code > 0) {
            if (actual == (UnicodeCategory) (code - 1)) {
                return true;
            }
        }
        else {
            if (actual != (UnicodeCategory) (-code - 1)) {
                return true;
            }
        }
    }

    return false;
}
// CharInCategoryGroup()
//
// Evaluates a zero-delimited group of category codes as a unit. On
// entry i indexes the opening marker; on return it indexes the closing
// marker. The sign of the first code decides the mode for the whole
// group: a positive group accepts a char that is in ANY listed
// category, a negative group accepts a char that is in NONE of them.
private static bool CharInCategoryGroup(char ch, UnicodeCategory chcategory, string category, ref int i) {
    i++;
    int code = (short) category[i];

    bool positiveGroup = code > 0;
    bool matchedMember = false;

    while (code != 0) {
        // Once a member has matched, the remaining codes are only skipped.
        if (!matchedMember) {
            int member = positiveGroup ? code - 1 : -code - 1;
            if (chcategory == (UnicodeCategory) member) {
                matchedMember = true;
            }
        }

        i++;
        code = (short) category[i];
    }

    return positiveGroup ? matchedMember : !matchedMember;
}
// NegateCategory()
//
// Returns a category string in which every code is replaced by its
// arithmetic negation (as a 16-bit value). Null in, null out.
internal static string NegateCategory(string category) {
    if (category == null) {
        return null;
    }

    StringBuilder negated = new StringBuilder();
    foreach (char c in category) {
        short code = (short) c;
        negated.Append( (char) -code);
    }

    return negated.ToString();
}
// RangeCount()
//
// How many ranges the class currently holds.
private int RangeCount() {
    int count = _rangelist.Count;
    return count;
}
// Range(int i)
//
// Typed access to the range stored at position i.
private SingleRange Range(int i) {
    object entry = _rangelist[i];
    return (SingleRange)entry;
}
// SingleRangeComparer
//
// Orders ranges by their first character only; yields -1, 0 or 1.
private sealed class SingleRangeComparer : IComparer {
    public int Compare(Object x, Object y) {
        char left = ((SingleRange)x)._first;
        char right = ((SingleRange)y)._first;

        if (left < right) {
            return -1;
        }
        if (left > right) {
            return 1;
        }
        return 0;
    }
}
// SingleRange
//
// A mutable pair of characters marking the first and last character of
// one range.
private sealed class SingleRange {
    internal char _first;
    internal char _last;

    internal SingleRange(char first, char last) {
        this._first = first;
        this._last = last;
    }
}
// Canonicalize()
//
// Logic to reduce a character class to a unique, sorted form: the range
// list is sorted by first character, then overlapping or abutting ranges
// are merged in place and the leftover tail of the list is removed.
private void Canonicalize() {
SingleRange CurrentRange;
int i;
int j;
char last;
bool Done;
_canonical = true;
_rangelist.Sort(0, _rangelist.Count, new SingleRangeComparer());
//
// Find and eliminate overlapping or abutting ranges
//
if (_rangelist.Count > 1) {
Done = false;
2008-11-17 18:29:00 -05:00
// i is the read cursor; j is the slot of the merged range being built.
for (i = 1, j = 0;; i++) {
// Extend the range in slot j for as long as the next range starts at or
// before last + 1 (i.e. overlaps or abuts it).
for (last = ((SingleRange)_rangelist[j])._last;; i++) {
2008-03-05 09:52:00 -05:00
// Stop when the input is exhausted, or when the merged range already
// reaches Lastchar and so covers everything that follows.
if (i == _rangelist.Count || last == Lastchar) {
Done = true;
break;
}
// Gap before the next range: the merged range in slot j is complete.
if ((CurrentRange = (SingleRange)_rangelist[i])._first > last + 1)
break;
if (last < CurrentRange._last)
last = CurrentRange._last;
}
((SingleRange)_rangelist[j])._last = last;
j++;
if (Done)
break;
// Range i starts the next merged range; move it down into slot j.
if (j < i)
_rangelist[j] = _rangelist[i];
}
// Drop the slots left over after compaction.
_rangelist.RemoveRange(j, _rangelist.Count - j);
}
}
2008-11-17 18:29:00 -05:00
// The property table contains all the block definitions defined in the
// XML schema spec (http://www.w3.org/TR/2001/PR-xmlschema-2-20010316/#charcter-classes), Unicode 3.0 spec (www.unicode.org),
// and Perl 5.6 (see Programming Perl, 3rd edition page 167). Three blocks defined by Perl (and here) may
// not be in the Unicode spec: IsHighPrivateUseSurrogates, IsHighSurrogates, and IsLowSurrogates.
//
// In addition, there was some inconsistency in the definition of IsTibetan and IsArabicPresentationForms-B.
// Regex goes with the XML spec on both of these, since it seems to be (oddly enough) more correct than the Unicode spec!
//
// This is what we use:
// IsTibetan: 0x0F00 - 0x0FFF
// IsArabicPresentationForms-B: 0xFE70 - 0xFEFE
//
// The Unicode spec is inconsistent for IsTibetan. Its range is 0x0F00 - 0x0FBF. However, it clearly defines
// Tibetan characters above 0x0FBF. This appears to be an error between the 2.0 and 3.0 spec.
//
// The Unicode spec is also unclear on IsArabicPresentationForms-B, defining it as 0xFE70 - 0xFEFF.
// There is only one character different here, 0xFEFF, which is a byte-order mark character and
// is labeled in the spec as special. I have excluded it from IsArabicPresentationForms-B and left it in IsSpecial.
//
// The table has to be sorted by the first column.
// Each row is { name, set }. 'name' is the property name that
// SetFromProperty() looks up; 'set' is the class in string representation:
// strictly increasing chars that alternately begin an included and an
// excluded range, so "\u0000\u0080" covers U+0000 through U+007F.
private static readonly String[,] _propTable = {
// "_xml*" rows: multi-range sets (the inline notes name the class where one is given).
{"_xmlC", /* Name Char */ "\u002D\u002F\u0030\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00AA\u00AB\u00B2\u00B4\u00B5\u00B6\u00B9\u00BB\u00BC\u00BF\u00C0\u00D7\u00D8\u00F7\u00F8\u01AA\u01AB\u01BB\u01BC\u01BE\u01C4\u01F6\u01FA\u0218\u0250\u02A9\u0386\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F3\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0490\u04C0\u04C1\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0561\u0588\u0660\u066A\u06F0\u06FA\u0966\u0970\u09E6\u09F0\u09F4\u09FA\u0A66\u0A70\u0AE6\u0AF0\u0B66\u0B70\u0BE7\u0BF3\u0C66\u0C70\u0CE6\u0CF0\u0D66\u0D70\u0E50\u0E5A\u0ED0\u0EDA\u0F20\u0F34\u10A0\u10C6\u10D0\u10F7\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u2070\u2071\u2074"
+"\u207A\u207F\u208A\u20A8\u20A9\u2102\u2103\u2107\u2108\u210A\u2114\u2115\u211E\u2120\u2123\u2124\u2125\u2126\u2127\u2128\u2129\u212A\u2132\u2133\u2135\u2153\u2183\u2460\u249C\u24B6\u24EB\u2776\u2794\u3007\u3008\u3021\u302A\u3280\u328A\u3372\u3375\u3376\u3377\u3385\u338A\u338D\u3391\u3399\u339F\u33A9\u33AA\u33AD\u33AE\u33B0\u33B4\u33B9\u33BA\u33BF\u33C0\u33C1\u33C2\u33C3\u33C6\u33C7\u33C8\u33C9\u33D8\u33D9\u33DE"},
{"_xmlD", "\u0030\u003A\u0660\u066A\u06F0\u06FA\u0966\u0970\u09E6\u09F0\u0A66\u0A70\u0AE6\u0AF0\u0B66\u0B70\u0BE7\u0BF0\u0C66\u0C70\u0CE6\u0CF0\u0D66\u0D70\u0E50\u0E5A\u0ED0\u0EDA\u0F20\u0F2A\u2070\u2071\u2074\u207A\u2080\u208A"},
{"_xmlI", /* Start Name Char */ "\u003A\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00A8\u00A9\u00AA\u00AB\u00AF\u00B0\u00B4\u00B6\u00B8\u00B9\u00BA\u00BB\u00C0\u00D7\u00D8\u00F7\u00F8\u01F6\u01FA\u0218\u0250\u02A9\u02B0\u02DF\u02E0\u02EA\u0374\u0375\u037A\u037B\u0384\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0588\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u064B\u0671\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06D6\u06E5\u06E7\u0905\u093A\u0958\u0962\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09DC\u09DE\u09DF\u09E2\u09F0\u09F2\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A59\u0A5D\u0A5E\u0A5F\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0AE0\u0AE1\u0B05"
+"\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B5C\u0B5E\u0B5F\u0B62\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C60\u0C62\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CDE\u0CDF\u0CE0\u0CE2\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D60\u0D62\u0E01\u0E31\u0E32\u0E34\u0E40\u0E47\u0E4F\u0E50\u0E5A\u0E5C\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EB1\u0EB2\u0EB4\u0EBD\u0EBE\u0EC0\u0EC5\u0EDC\u0EDE\u0F18\u0F1A\u0F40\u0F48\u0F49\u0F6A\u10A0\u10C6\u10D0\u10F7\u1100\u115A\u115F\u11A3\u11A8\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FC5\u1FC6\u1FD4\u1FD6\u1FDC\u1FDD\u1FF0"
+"\u1FF2\u1FF5\u1FF6\u1FFF\u207F\u2080\u20A8\u20A9\u2102\u2103\u2107\u2108\u210A\u2114\u2115\u211E\u2120\u2123\u2124\u2125\u2126\u2127\u2128\u2129\u212A\u2132\u2133\u2139\u24B6\u24EA\u3041\u3095\u309B\u309F\u30A1\u30FB\u30FC\u30FF\u3105\u312D\u3131\u318F\u3192\u31A0\u3260\u327C\u328A\u32B1\u32D0\u32FF\u3300\u3358\u3371\u3377\u337B\u3395\u3399\u339F\u33A9\u33AE\u33B0\u33C2\u33C3\u33C6\u33C7\u33D8\u33D9\u33DE\u4E00\u4E01\u9FA5\u9FA6\uAC00\uAC01\uD7A3\uD7A4\uF900"},
{"_xmlW", "\u0023\u0025\u0026\u0027\u002A\u002C\u0030\u003A\u003C\u003F\u0040\u005B\u005E\u007B\u007C\u007D\u007E\u007F\u00A2\u00AB\u00AC\u00AD\u00AE\u00B7\u00B8\u00BB\u00BC\u00BF\u00C0\u037E\u037F\u0387\u0388\u055A\u0560\u0589\u058A\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05F3\u05F5\u060C\u060D\u061B\u061C\u061F\u0620\u06D4\u06D5\u093D\u093E\u0970\u0971\u0ABD\u0ABE\u0B3D\u0B3E\u0EAF\u0EB0\u0F04\u0F13\u0F3A\u0F3E\u0F85\u0F86\u10FB\u10FC\u2000\u202F\u2030\u203D\u2045\u2047\u206A\u2070\u207D\u207F\u208D\u208F\u2329\u232B\u3000\u3004\u3005\u3007\u3008\u3012\u3014\u301D\u3030\u3031\u30FB\u30FC\uD800\uD801\uDB7F\uDB81\uDBFF\uDC01\uDFFF\uE001\uF8FF\uF900\uFD3E\uFD40\uFE30\uFE33\uFE35\uFE45\uFE50\uFE53\uFE54\uFE5F\uFE63\uFE64\uFE68\uFE69\uFE6A\uFE6B\uFEFF\uFF00\uFF01\uFF03\uFF05\uFF06\uFF07\uFF0A\uFF0C\uFF10\uFF1A\uFF1C\uFF1F\uFF20\uFF3B\uFF3E\uFF5B\uFF5C\uFF5D\uFF5E\uFF61\uFF66"},
// "Is*" rows: named blocks, each written as first char, then last char + 1.
// IsSpecials is the only row below with two ranges.
{"IsAlphabeticPresentationForms", "\uFB00\uFB50"},
{"IsArabic", "\u0600\u0700"},
{"IsArabicPresentationForms-A", "\uFB50\uFE00"},
{"IsArabicPresentationForms-B", "\uFE70\uFEFF"},
{"IsArmenian", "\u0530\u0590"},
{"IsArrows", "\u2190\u2200"},
{"IsBasicLatin", "\u0000\u0080"},
{"IsBengali", "\u0980\u0A00"},
{"IsBlockElements", "\u2580\u25A0"},
{"IsBopomofo", "\u3100\u3130"},
{"IsBopomofoExtended", "\u31A0\u31C0"},
{"IsBoxDrawing", "\u2500\u2580"},
{"IsBraillePatterns", "\u2800\u2900"},
{"IsCherokee", "\u13A0\u1400"},
{"IsCJKCompatibility", "\u3300\u3400"},
{"IsCJKCompatibilityForms", "\uFE30\uFE50"},
{"IsCJKCompatibilityIdeographs", "\uF900\uFB00"},
{"IsCJKRadicalsSupplement", "\u2E80\u2F00"},
{"IsCJKSymbolsandPunctuation", "\u3000\u3040"},
{"IsCJKUnifiedIdeographs", "\u4E00\uA000"},
{"IsCJKUnifiedIdeographsExtensionA", "\u3400\u4DB6"},
{"IsCombiningDiacriticalMarks", "\u0300\u0370"},
{"IsCombiningHalfMarks", "\uFE20\uFE30"},
{"IsCombiningMarksforSymbols", "\u20D0\u2100"},
{"IsControlPictures", "\u2400\u2440"},
{"IsCurrencySymbols", "\u20A0\u20D0"},
{"IsCyrillic", "\u0400\u0500"},
{"IsDevanagari", "\u0900\u0980"},
{"IsDingbats", "\u2700\u27C0"},
{"IsEnclosedAlphanumerics", "\u2460\u2500"},
{"IsEnclosedCJKLettersandMonths", "\u3200\u3300"},
{"IsEthiopic", "\u1200\u1380"},
{"IsGeneralPunctuation", "\u2000\u2070"},
{"IsGeometricShapes", "\u25A0\u2600"},
{"IsGeorgian", "\u10A0\u1100"},
{"IsGreek", "\u0370\u0400"},
{"IsGreekExtended", "\u1F00\u2000"},
{"IsGujarati", "\u0A80\u0B00"},
{"IsGurmukhi", "\u0A00\u0A80"},
{"IsHalfwidthandFullwidthForms", "\uFF00\uFFF0"},
{"IsHangulCompatibilityJamo", "\u3130\u3190"},
{"IsHangulJamo", "\u1100\u1200"},
{"IsHangulSyllables", "\uAC00\uD7A4"},
{"IsHebrew", "\u0590\u0600"},
{"IsHighPrivateUseSurrogates", "\uDB80\uDC00"},
{"IsHighSurrogates", "\uD800\uDB80"},
{"IsHiragana", "\u3040\u30A0"},
{"IsIdeographicDescriptionCharacters", "\u2FF0\u3000"},
{"IsIPAExtensions", "\u0250\u02B0"},
{"IsKanbun", "\u3190\u31A0"},
{"IsKangxiRadicals", "\u2F00\u2FE0"},
{"IsKannada", "\u0C80\u0D00"},
{"IsKatakana", "\u30A0\u3100"},
{"IsKhmer", "\u1780\u1800"},
{"IsLao", "\u0E80\u0F00"},
{"IsLatin-1Supplement", "\u0080\u0100"},
{"IsLatinExtended-A", "\u0100\u0180"},
{"IsLatinExtendedAdditional", "\u1E00\u1F00"},
{"IsLatinExtended-B", "\u0180\u0250"},
{"IsLetterlikeSymbols", "\u2100\u2150"},
{"IsLowSurrogates", "\uDC00\uE000"},
{"IsMalayalam", "\u0D00\u0D80"},
{"IsMathematicalOperators", "\u2200\u2300"},
{"IsMiscellaneousSymbols", "\u2600\u2700"},
{"IsMiscellaneousTechnical", "\u2300\u2400"},
{"IsMongolian", "\u1800\u18B0"},
{"IsMyanmar", "\u1000\u10A0"},
{"IsNumberForms", "\u2150\u2190"},
{"IsOgham", "\u1680\u16A0"},
{"IsOpticalCharacterRecognition", "\u2440\u2460"},
{"IsOriya", "\u0B00\u0B80"},
{"IsPrivateUse", "\uE000\uF900"},
{"IsRunic", "\u16A0\u1700"},
{"IsSinhala", "\u0D80\u0E00"},
{"IsSmallFormVariants", "\uFE50\uFE70"},
{"IsSpacingModifierLetters", "\u02B0\u0300"},
{"IsSpecials", "\uFEFF\uFF00\uFFF0\uFFFE"},
{"IsSuperscriptsandSubscripts", "\u2070\u20A0"},
{"IsSyriac", "\u0700\u0750"},
{"IsTamil", "\u0B80\u0C00"},
{"IsTelugu", "\u0C00\u0C80"},
{"IsThaana", "\u0780\u07C0"},
{"IsThai", "\u0E00\u0E80"},
{"IsTibetan", "\u0F00\u1000"},
{"IsUnifiedCanadianAboriginalSyllabics","\u1400\u1680"},
{"IsYiRadicals", "\uA490\uA4D0"},
{"IsYiSyllables", "\uA000\uA490"},
};
// SetFromProperty()
//
// Looks up a named Unicode block / XML character class in _propTable and
// returns its set string, or the result of SetInverse() on that string
// when 'invert' is true.
//
//   capname - property name as spelled in the first column of _propTable
//   invert  - true to return SetInverse(set) instead of the set itself
//   pattern - the pattern being parsed; currently unused because the
//             resource-based exception message is stubbed out (see XXX)
//
// Throws ArgumentException when no row of _propTable carries the name.
internal static String SetFromProperty(String capname, bool invert, string pattern) {
    int rows = _propTable.GetLength(0);
    int row = -1;

    // Fast path: binary search over the name column.
    int min = 0;
    int max = rows;
    while (min != max) {
        int mid = (min + max) / 2;
        int res = String.Compare(capname, _propTable[mid,0], false);//, CultureInfo.InvariantCulture);
        if (res < 0)
            max = mid;
        else if (res > 0)
            min = mid + 1;
        else {
            row = mid;
            break;
        }
    }

    // Fallback: the table is hand-sorted in a culture-style word order
    // ("_xmlC" precedes "Is...", "IsCherokee" precedes "IsCJK..."), which
    // an ordinal or differently-cultured String.Compare does not reproduce.
    // If the search above missed for that reason, an exact-match scan still
    // finds the row. Calling Equals on the table entry keeps a null
    // 'capname' on the "unknown property" path.
    if (row < 0) {
        for (int i = 0; i < rows; i++) {
            if (_propTable[i,0].Equals(capname)) {
                row = i;
                break;
            }
        }
    }

    if (row < 0)
        throw new ArgumentException("Unknown property");//XXX: SR.GetString(SR.MakeException, pattern, SR.GetString(SR.UnknownProperty, capname)), pattern);

    String set = _propTable[row,1];
    return invert ? SetInverse(set) : set;
}
#if DBG
// SetDescription()
//
// Renders a set string as bracketed, human-readable text for debugging.
// The "Any" set prints as "[^]" and the empty set as "[]". A set whose
// first char is Nullchar is printed with a leading '^' and its ranges
// are read starting from the second char.
internal static String SetDescription(String set) {
    if (set.Equals(Any))
        return "[^]";
    if (set.Equals(Empty))
        return "[]";

    StringBuilder text = new StringBuilder("[");
    bool negated = (set[0] == Nullchar);
    if (negated)
        text.Append('^');

    // Chars are consumed in pairs: range start, then the char just past
    // the range end. A missing second char means the range runs to Lastchar.
    for (int pos = negated ? 1 : 0; pos < set.Length; pos += 2) {
        char first = set[pos];
        char last = (pos + 1 < set.Length) ? (char)(set[pos + 1] - 1) : Lastchar;

        text.Append(CharDescription(first));
        if (last == first)
            continue;

        // Two adjacent chars are printed back to back, without a dash.
        if (first + 1 != last)
            text.Append('-');
        text.Append(CharDescription(last));
    }

    text.Append(']');
    return text.ToString();
}
// Lower-case hex digits, indexed by nibble value.
internal static readonly char [] Hex = new char [] {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
// CharDescription()
//
// Renders one character as human-readable text for debugging:
//   backslash         -> two backslashes
//   ' ' through '~'   -> the character itself
//   other chars < 256 -> \x followed by two hex digits
//   everything else   -> \u followed by four hex digits
internal static String CharDescription(char ch) {
    if (ch == '\\')
        return "\\\\";
    if (ch >= ' ' && ch <= '~')
        return ch.ToString();

    bool narrow = ch < 256;
    StringBuilder text = new StringBuilder(narrow ? "\\x" : "\\u");

    // Emit nibbles from most significant to least significant.
    for (int bits = (narrow ? 8 : 16) - 4; bits >= 0; bits -= 4)
        text.Append(Hex[(ch >> bits) & 0xF]);

    return text.ToString();
}
#endif
}
}