//------------------------------------------------------------------------------ // // // Copyright (c) Microsoft Corporation. All rights reserved. // // //------------------------------------------------------------------------------ // This RegexParser class is internal to the Regex package. // It builds a tree of RegexNodes from a regular expression // // Implementation notes: // // It would be nice to get rid of the comment modes, since the // ScanBlank() calls are just kind of duct-taped in. #define ECMA namespace System.Text.RegularExpressions { using System.Collections; using System.Globalization; internal sealed class RegexParser { internal RegexNode _stack; internal RegexNode _group; internal RegexNode _alternation; internal RegexNode _concatenation; internal RegexNode _unit; internal String _pattern; internal int _currentPos; internal CultureInfo _culture; internal int _autocap; internal int _capcount; internal int _captop; internal int _capsize; internal Hashtable _caps; internal Hashtable _capnames; internal Object[] _capnumlist; internal ArrayList _capnamelist; internal RegexOptions _options; internal ArrayList _optionsStack; internal bool _ignoreNextParen = false; internal const int infinite = RegexNode.infinite; // This static call constructs a RegexTree from a regular expression // pattern string and an option string. // // The method creates, drives, and drops a parser instance. internal static RegexTree Parse(String re, RegexOptions op) { RegexParser p; RegexNode root; String[] capnamelist; p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture); p._options = op; p.SetPattern(re); p.CountCaptures(); p.Reset(op); root = p.ScanRegex(); if (p._capnamelist == null) capnamelist = null; else capnamelist = (String[])p._capnamelist.ToArray(typeof(String)); return new RegexTree(root, p._caps, p._capnumlist, p._captop, p._capnames, capnamelist, op); } // This static call constructs a flat concatenation node given // a replacement pattern. internal static RegexReplacement ParseReplacement(String rep, Hashtable caps, int capsize, Hashtable capnames, RegexOptions op) { RegexParser p; RegexNode root; p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture); p._options = op; p.NoteCaptures(caps, capsize, capnames); p.SetPattern(rep); root = p.ScanReplacement(); return new RegexReplacement(rep, root, caps); } // Escapes all metacharacters (including |,(,),[,{,|,^,$,*,+,?,\, spaces and #) internal static String Escape(String input) { for (int i = 0; i < input.Length; i++) { if (IsMetachar(input[i])) { StringBuilder sb = new StringBuilder(); char ch = input[i]; int lastpos; sb.Append(input, 0, i); do { sb.Append('\\'); switch (ch) { case '\n': ch = 'n'; break; case '\r': ch = 'r'; break; case '\t': ch = 't'; break; case '\f': ch = 'f'; break; } sb.Append(ch); i++; lastpos = i; while (i < input.Length) { ch = input[i]; if (IsMetachar(ch)) break; i++; } sb.Append(input, lastpos, i - lastpos); } while (i < input.Length); return sb.ToString(); } } return input; } // Escapes all metacharacters (including (,),[,],{,},|,^,$,*,+,?,\, spaces and #) internal static String Unescape(String input) { for (int i = 0; i < input.Length; i++) { if (input[i] == '\\') { StringBuilder sb = new StringBuilder(); RegexParser p = new RegexParser(CultureInfo.InvariantCulture); int lastpos; p.SetPattern(input); sb.Append(input, 0, i); do { i++; p.Textto(i); if (i < input.Length) sb.Append(p.ScanCharEscape()); i = p.Textpos(); lastpos = i; while (i < input.Length && input[i] != '\\') i++; sb.Append(input, lastpos, i - lastpos); } while (i < input.Length); return sb.ToString(); } } return input; } // Private constructor. private RegexParser(CultureInfo culture) { _culture = culture; _optionsStack = new ArrayList(); _caps = new Hashtable(); } // Drops a string into the pattern buffer. internal void SetPattern(String Re) { if (Re == null) Re = String.Empty; _pattern = Re; _currentPos = 0; } // Resets parsing to the beginning of the pattern. internal void Reset(RegexOptions topopts) { _currentPos = 0; _autocap = 1; _ignoreNextParen = false; if (_optionsStack.Count > 0) _optionsStack.RemoveRange(0, _optionsStack.Count - 1); _options = topopts; _stack = null; } // The main parsing function. internal RegexNode ScanRegex() { char ch = '@'; // nonspecial ch, means at beginning bool isQuantifier = false; StartGroup(new RegexNode(RegexNode.Capture, _options, 0, -1)); while (CharsRight() > 0) { bool wasPrevQuantifier = isQuantifier; isQuantifier = false; ScanBlank(); int startpos = Textpos(); if (UseOptionX()) while (CharsRight() > 0 && (!IsStopperX(ch = RightChar()) || ch == '{' && !IsTrueQuantifier())) RightNext(); else while (CharsRight() > 0 && (!IsSpecial(ch = RightChar()) || ch == '{' && !IsTrueQuantifier())) RightNext(); int endpos = Textpos(); ScanBlank(); if (CharsRight() == 0) ch = '!'; // nonspecial, means at end else if (IsSpecial(ch = RightChar())) { isQuantifier = IsQuantifier(ch); RightNext(); } else ch = ' '; // nonspecial, means at ordinary char if (startpos < endpos) { int cchUnquantified = endpos - startpos - (isQuantifier ? 1 : 0); wasPrevQuantifier = false; if (cchUnquantified > 0) AddConcatenate(startpos, cchUnquantified, false); if (isQuantifier) AddUnitOne(CharAt(endpos - 1)); } switch (ch) { case '!': goto BreakOuterScan; case ' ': goto ContinueOuterScan; case '[': AddUnitSet(ScanCharClass(UseOptionI())); if (CharsRight() == 0 || RightCharNext() != ']') throw MakeException("Unterminated Bracket");// XXX: SR.GetString(SR.UnterminatedBracket)); break; case '(': { RegexNode grouper; PushOptions(); if (null == (grouper = ScanGroupOpen())) { PopKeepOptions(); } else { PushGroup(); StartGroup(grouper); } } continue; case '|': AddAlternate(); goto ContinueOuterScan; case ')': if (EmptyStack()) throw MakeException("Too many parens");//XXX: R.GetString(SR.TooManyParens)); AddGroup(); PopGroup(); PopOptions(); if (Unit() == null) goto ContinueOuterScan; break; case '\\': AddUnitNode(ScanBackslash()); break; case '^': AddUnitType(UseOptionM() ? RegexNode.Bol : RegexNode.Beginning); break; case '$': AddUnitType(UseOptionM() ? RegexNode.Eol : RegexNode.EndZ); break; case '.': if (UseOptionS()) AddUnitSet(RegexCharClass.AnyClass); else AddUnitNotone('\n'); break; case '{': case '*': case '+': case '?': if (Unit() == null) throw MakeException(wasPrevQuantifier ? "Nested Quantify": "Quantify After Nothing"); //XXX throw MakeException(wasPrevQuantifier ? // SR.GetString(SR.NestedQuantify, ch.ToString()) : // SR.GetString(SR.QuantifyAfterNothing)); LeftNext(); break; default: throw MakeException("Internal Error"); // XXX: SR.GetString(SR.InternalError)); } ScanBlank(); if (CharsRight() == 0 || !(isQuantifier = IsTrueQuantifier())) { AddConcatenate(); goto ContinueOuterScan; } ch = RightCharNext(); // Handle quantifiers while (Unit() != null) { int min; int max; bool lazy; switch (ch) { case '*': min = 0; max = infinite; break; case '?': min = 0; max = 1; break; case '+': min = 1; max = infinite; break; case '{': { startpos = Textpos(); max = min = ScanDecimal(); if (startpos < Textpos()) { if (CharsRight() > 0 && RightChar() == ',') { RightNext(); if (CharsRight() == 0 || RightChar() == '}') max = infinite; else max = ScanDecimal(); } } if (startpos == Textpos() || CharsRight() == 0 || RightCharNext() != '}') { AddConcatenate(); Textto(startpos - 1); goto ContinueOuterScan; } } break; default: throw MakeException("Internal Error");//XXX: SR.GetString(SR.InternalError)); } ScanBlank(); if (CharsRight() == 0 || RightChar() != '?') lazy = false; else { RightNext(); lazy = true; } if (min > max) throw MakeException("Illegal Range"); //SR.GetString(SR.IllegalRange)); AddConcatenate(lazy, min, max); } ContinueOuterScan: ; } BreakOuterScan: ; if (!EmptyStack()) throw MakeException("Not enough Parens"); // XXX: SR.GetString(SR.NotEnoughParens)); AddGroup(); return Unit(); } // Simple parsing for replacement patterns internal RegexNode ScanReplacement() { int c; int startpos; _concatenation = new RegexNode(RegexNode.Concatenate, _options); for (;;) { c = CharsRight(); if (c == 0) break; startpos = Textpos(); while (c > 0 && RightChar() != '$') { RightNext(); c--; } AddConcatenate(startpos, Textpos() - startpos, true); if (c > 0) { if (RightCharNext() == '$') AddUnitNode(ScanDollar()); AddConcatenate(); } } return _concatenation; } // Scans contents of [] (not including []'s), and converts to a // RegexCharClass. internal RegexCharClass ScanCharClass(bool caseInsensitive) { return ScanCharClass(caseInsensitive, false); } // Scans contents of [] (not including []'s), and converts to a // RegexCharClass. internal RegexCharClass ScanCharClass(bool caseInsensitive, bool scanOnly) { char ch = '\0'; bool inRange; bool firstChar; char chPrev = '\0'; RegexCharClass cc; cc = scanOnly ? null : new RegexCharClass(); if (CharsRight() > 0 && RightChar() == '^') { RightNext(); firstChar = false; if (!scanOnly) cc.Negate = true; } inRange = false; for (firstChar = true; CharsRight() > 0; firstChar = false) { switch (ch = RightCharNext()) { case ']': if (!firstChar) { LeftNext(); goto BreakScan; } break; case '\\': if (CharsRight() > 0) { switch (ch = RightCharNext()) { case 'd': if (!scanOnly) { if (inRange) throw MakeException("Bad class in char range");//XXX: SR.GetString(SR.BadClassInCharRange, ch.ToString())); if (UseOptionE()) cc.AddSet(RegexCharClass.ECMADigit); else cc.AddCategoryFromName("Nd", false, false, _pattern); } continue; case 'D': if (!scanOnly) { if (inRange) throw MakeException("Bad class in char range"); //XXX: SR.GetString(SR.BadClassInCharRange, ch.ToString())); if (UseOptionE()) cc.AddSet(RegexCharClass.NotECMADigit); else cc.AddCategoryFromName("Nd", true, false, _pattern); } continue; case 's': if (!scanOnly) { if (inRange) throw MakeException("Bad class in char range");//XXX: SR.GetString(SR.BadClassInCharRange, ch.ToString())); if (UseOptionE()) cc.AddSet(RegexCharClass.ECMASpace); else cc.AddCategory(RegexCharClass.Space); } continue; case 'S': if (!scanOnly) { if (inRange) throw MakeException("Bad class in char range");//XXX: SR.GetString(SR.BadClassInCharRange, ch.ToString())); if (UseOptionE()) cc.AddSet(RegexCharClass.NotECMASpace); else cc.AddCategory(RegexCharClass.NotSpace); } continue; case 'w': if (!scanOnly) { if (inRange) throw MakeException("Bad class in char range");//XXX: SR.GetString(SR.BadClassInCharRange, ch.ToString())); if (UseOptionE()) cc.AddSet(RegexCharClass.ECMAWord); else cc.AddCategory(RegexCharClass.Word); } continue; case 'W': if (!scanOnly) { if (inRange) throw MakeException("Bad class in char range");//XXX: SR.GetString(SR.BadClassInCharRange, ch.ToString())); if (UseOptionE()) cc.AddSet(RegexCharClass.NotECMAWord); else cc.AddCategory(RegexCharClass.NotWord); } continue; case 'p': case 'P': if (!scanOnly) { if (inRange) throw MakeException("Bad class in char range"); //XXX: SR.GetString(SR.BadClassInCharRange, ch.ToString())); cc.AddCategoryFromName(ParseProperty(), (ch != 'p'), caseInsensitive, _pattern); } else ParseProperty(); continue; default: LeftNext(); ch = ScanCharEscape(); break; } } break; case '[': if (CharsRight() > 0 && RightChar() == ':' && !inRange) { String name; int savePos = Textpos(); RightNext(); name = ScanCapname(); if (CharsRight() < 2 || RightCharNext() != ':' || RightCharNext() != ']') Textto(savePos); // else lookup name (not yet implemented) } break; } if (inRange) { inRange = false; if (!scanOnly) { if (chPrev > ch) throw MakeException("Reversed char range");//XXX: SR.GetString(SR.ReversedCharRange)); cc.AddRange(chPrev, ch); } } else if (CharsRight() >= 2 && RightChar() == '-' && RightChar(1) != ']') { chPrev = ch; inRange = true; RightNext(); } else { if (!scanOnly) cc.AddRange(ch, ch); } } BreakScan: ; return cc; } // Scans chars following a '(' (not counting the '('), and returns // a RegexNode for the type of group scanned, or null if the group // simply changed options (?cimsx-cimsx) or was a comment (#...). internal RegexNode ScanGroupOpen() { char ch = '\0'; int NodeType; char close = '>'; // just return a RegexNode if we have: // 1. "(" followed by nothing // 2. "(x" where x != ? // 3. "(?)" if (CharsRight() == 0 || RightChar() != '?' || (RightChar() == '?' && RightChar(1) == ')')) { if (UseOptionN() || _ignoreNextParen) { _ignoreNextParen = false; return new RegexNode(RegexNode.Group, _options); } else return new RegexNode(RegexNode.Capture, _options, _autocap++, -1); } RightNext(); for (;;) { if (CharsRight() == 0) break; switch (ch = RightCharNext()) { case ':': NodeType = RegexNode.Group; break; case '=': _options &= ~(RegexOptions.RightToLeft); NodeType = RegexNode.Require; break; case '!': _options &= ~(RegexOptions.RightToLeft); NodeType = RegexNode.Prevent; break; case '>': NodeType = RegexNode.Greedy; break; case '\'': close = '\''; goto case '<'; // fall through case '<': if (CharsRight() == 0) goto BreakRecognize; switch (ch = RightCharNext()) { case '=': if (close == '\'') goto BreakRecognize; _options |= RegexOptions.RightToLeft; NodeType = RegexNode.Require; break; case '!': if (close == '\'') goto BreakRecognize; _options |= RegexOptions.RightToLeft; NodeType = RegexNode.Prevent; break; default: LeftNext(); int capnum = -1; int uncapnum = -1; bool proceed = false; // grab part before - if (ch >= '0' && ch <= '9') { capnum = ScanDecimal(); if (!IsCaptureSlot(capnum)) capnum = -1; // check if we have bogus characters after the number if (CharsRight() > 0 && !(RightChar() == close || RightChar() == '-')) throw MakeException("Invalid group name");//XXX: SR.GetString(SR.InvalidGroupName)); if (capnum == 0) throw MakeException("Capnum not zero"); //XXX: SR.GetString(SR.CapnumNotZero)); } else if (RegexCharClass.IsWordChar(ch)) { String capname = ScanCapname(); if (IsCaptureName(capname)) capnum = CaptureSlotFromName(capname); // check if we have bogus character after the name if (CharsRight() > 0 && !(RightChar() == close || RightChar() == '-')) throw MakeException("Invalid group name"); //XXX: SR.GetString(SR.InvalidGroupName)); } else if (ch == '-') { proceed = true; } else { // bad group name - starts with something other than a word character and isn't a number throw MakeException("Invalid group name"); //XXX: SR.GetString(SR.InvalidGroupName)); } // grab part after - if any if ((capnum != -1 || proceed == true) && CharsRight() > 0 && RightChar() == '-') { RightNext(); ch = RightChar(); if (ch >= '0' && ch <= '9') { uncapnum = ScanDecimal(); if (!IsCaptureSlot(uncapnum)) throw MakeException("Undefined backreference");//XXX: SR.GetString(SR.UndefinedBackref, uncapnum)); // check if we have bogus characters after the number if (CharsRight() > 0 && RightChar() != close) throw MakeException("Invalid group name");//XXX: SR.GetString(SR.InvalidGroupName)); } else if (RegexCharClass.IsWordChar(ch)) { String uncapname = ScanCapname(); if (IsCaptureName(uncapname)) uncapnum = CaptureSlotFromName(uncapname); else throw MakeException("Undefined name reference"); //XXX: SR.GetString(SR.UndefinedNameRef, uncapname)); // check if we have bogus character after the name if (CharsRight() > 0 && RightChar() != close) throw MakeException("Invalid group name"); //XXX: SR.GetString(SR.InvalidGroupName)); } else { // bad group name - starts with something other than a word character and isn't a number throw MakeException("Invalid group name"); //XXX: SR.GetString(SR.InvalidGroupName)); } } // actually make the node if ((capnum != -1 || uncapnum != -1) && CharsRight() > 0 && RightCharNext() == close) { return new RegexNode(RegexNode.Capture, _options, capnum, uncapnum); } goto BreakRecognize; } break; case '(': // alternation construct (?(...) | ) int parenPos = Textpos(); ch = RightChar(); // check if the alternation condition is a backref if (ch >= '0' && ch <= '9') { int capnum = ScanDecimal(); if (CharsRight() > 0 && RightCharNext() == ')') { if (IsCaptureSlot(capnum)) return new RegexNode(RegexNode.Testref, _options, capnum); else throw MakeException("Undefined reference"); //XXX: SR.GetString(SR.UndefinedReference, capnum.ToString())); } else throw MakeException("Malformed reference"); //XXX: SR.GetString(SR.MalformedReference, capnum.ToString())); } else if (RegexCharClass.IsWordChar(ch)) { String capname = ScanCapname(); if (IsCaptureName(capname) && CharsRight() > 0 && RightCharNext() == ')') return new RegexNode(RegexNode.Testref, _options, CaptureSlotFromName(capname)); } // not a backref NodeType = RegexNode.Testgroup; Textto(parenPos - 1); // jump to the start of the parentheses _ignoreNextParen = true; // but make sure we don't try to capture the insides int charsRight = CharsRight(); if (charsRight >= 3 && RightChar(1) == '?') { char rightchar2 = RightChar(2); // disallow comments in the condition if (rightchar2 == '#') throw MakeException("Alternation cannnot have comment"); //XXX: SR.GetString(SR.AlternationCantHaveComment)); // disallow named capture group (?<..>..) in the condition if (rightchar2 == '\'' ) throw MakeException("Alternation cannot capture"); //XXX: SR.GetString(SR.AlternationCantCapture)); else { if (charsRight >=4 && (rightchar2 == '<' && RightChar(3) != '!' && RightChar(3) != '=')) throw MakeException("Alternation cannot capture"); //XXX: SR.GetString(SR.AlternationCantCapture)); } } break; default: LeftNext(); NodeType = RegexNode.Group; ScanOptions(); if (CharsRight() == 0) goto BreakRecognize; if ((ch = RightCharNext()) == ')') return null; if (ch != ':') goto BreakRecognize; break; } return new RegexNode(NodeType, _options); } BreakRecognize: ; // break Recognize comes here throw MakeException("Unrecognized grouping"); //XXX: SR.GetString(SR.UnrecognizedGrouping)); } // Scans whitespace or x-mode comments. internal void ScanBlank() { if (UseOptionX()) { for (;;) { while (CharsRight() > 0 && IsSpace(RightChar())) RightNext(); if (CharsRight() == 0) break; if (RightChar() == '#') { while (CharsRight() > 0 && RightChar() != '\n') RightNext(); } else if (CharsRight() >= 3 && RightChar(2) == '#' && RightChar(1) == '?' && RightChar() == '(') { while (CharsRight() > 0 && RightChar() != ')') RightNext(); if (CharsRight() == 0) throw MakeException("Unterminated comment"); //SR.GetString(SR.UnterminatedComment)); RightNext(); } else break; } } else { for (;;) { if (CharsRight() < 3 || RightChar(2) != '#' || RightChar(1) != '?' || RightChar() != '(') return; while (CharsRight() > 0 && RightChar() != ')') RightNext(); if (CharsRight() == 0) throw MakeException("Unterminated comment"); //XXX: SR.GetString(SR.UnterminatedComment)); RightNext(); } } } // Scans chars following a '\' (not counting the '\'), and returns // a RegexNode for the type of atom scanned. internal RegexNode ScanBackslash() { char ch; RegexCharClass cc; if (CharsRight() == 0) throw MakeException("IllegalEnd Escape"); //SR.GetString(SR.IllegalEndEscape)); switch (ch = RightChar()) { case 'b': case 'B': case 'A': case 'G': case 'Z': case 'z': RightNext(); return new RegexNode(TypeFromCode(ch), _options); case 'w': RightNext(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMAWord, String.Empty); return new RegexNode(RegexNode.Set, _options, String.Empty, RegexCharClass.Word); case 'W': RightNext(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMAWord, String.Empty); return new RegexNode(RegexNode.Set, _options, String.Empty, RegexCharClass.NotWord); case 's': RightNext(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMASpace, String.Empty); return new RegexNode(RegexNode.Set, _options, String.Empty, RegexCharClass.Space); case 'S': RightNext(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMASpace, String.Empty); return new RegexNode(RegexNode.Set, _options, String.Empty, RegexCharClass.NotSpace); case 'd': RightNext(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMADigit, String.Empty); cc = RegexCharClass.CreateFromCategory("Nd", false, false, _pattern); return new RegexNode(RegexNode.Set, _options, String.Empty, cc.Category); case 'D': RightNext(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMADigit, String.Empty); cc = RegexCharClass.CreateFromCategory("Nd", true, false, _pattern); return new RegexNode(RegexNode.Set, _options, String.Empty, cc.Category); case 'p': case 'P': RightNext(); cc = RegexCharClass.CreateFromCategory(ParseProperty(), (ch != 'p'), UseOptionI(), _pattern); return new RegexNode(RegexNode.Set, _options, cc.ToSetCi(UseOptionI(), _culture), cc.Category); default: return ScanBasicBackslash(); } } // Scans \-style backreferences and character escapes internal RegexNode ScanBasicBackslash() { if (CharsRight() == 0) throw MakeException("Illegal end escape"); //XXX: SR.GetString(SR.IllegalEndEscape)); char ch; bool angled = false; char close = '\0'; int backpos; backpos = Textpos(); ch = RightChar(); // allow \k instead of , which is now deprecated if (ch == 'k') { if (CharsRight() >= 2) { RightNext(); ch = RightCharNext(); if (ch == '<' || ch == '\'') { angled = true; close = (ch == '\'') ? '\'' : '>'; } } if (!angled) throw MakeException("Malformed name reference"); //XXX: SR.GetString(SR.MalformedNameRef)); ch = RightChar(); } // Note angle without \g else if ((ch == '<' || ch == '\'') && CharsRight() > 1) { angled = true; close = (ch == '\'') ? '\'' : '>'; RightNext(); ch = RightChar(); } // Try to parse backreference: <1> or if (angled && ch >= '0' && ch <= '9') { int capnum = ScanDecimal(); if (CharsRight() > 0 && RightCharNext() == close) { if (IsCaptureSlot(capnum)) return new RegexNode(RegexNode.Ref, _options, capnum); else throw MakeException("Undefined back reference"); //XXX: SR.GetString(SR.UndefinedBackref, capnum.ToString())); } } // Try to parse backreference or octal: \1 else if (!angled && ch >= '1' && ch <= '9') { if (UseOptionE()) { int capnum = -1; int newcapnum = (int)(ch - '0'); int pos = Textpos() - 1; while (newcapnum <= _captop) { if (IsCaptureSlot(newcapnum) && (_caps == null || (int)_caps[newcapnum] < pos)) capnum = newcapnum; RightNext(); if (CharsRight() == 0 || (ch = RightChar()) < '0' || ch > '9') break; newcapnum = newcapnum * 10 + (int)(ch - '0'); } if (capnum >= 0) return new RegexNode(RegexNode.Ref, _options, capnum); } else { int capnum = ScanDecimal(); if (IsCaptureSlot(capnum)) return new RegexNode(RegexNode.Ref, _options, capnum); else if (capnum <= 9) throw MakeException("Undefined back reference"); //XXX: SR.GetString(SR.UndefinedBackref, capnum.ToString())); } } else if (angled && RegexCharClass.IsWordChar(ch)) { String capname = ScanCapname(); if (CharsRight() > 0 && RightCharNext() == close) { if (IsCaptureName(capname)) return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname)); else throw MakeException("Undefined name reference"); //XXX: SR.GetString(SR.UndefinedNameRef, capname)); } } // Not backreference: must be char code Textto(backpos); ch = ScanCharEscape(); if (UseOptionI()) ch = Char.ToLower(ch);//, _culture); return new RegexNode(RegexNode.One, _options, ch); } // Scans $ patterns recognized within replacment patterns internal RegexNode ScanDollar() { if (CharsRight() == 0) return new RegexNode(RegexNode.One, _options, '$'); char ch; bool angled; int backpos; backpos = Textpos(); ch = RightChar(); // Note angle if (ch == '{' && CharsRight() > 1) { angled = true; RightNext(); ch = RightChar(); } else { angled = false; } // Try to parse backreference: \1 or \{1} or \{cap} if (ch >= '0' && ch <= '9') { if (!angled && UseOptionE()) { int capnum = -1; int newcapnum = (int)(ch - '0'); int pos = Textpos() - 1; while (newcapnum <= _capsize) { if (IsCaptureSlot(newcapnum)) capnum = newcapnum; RightNext(); if (CharsRight() == 0 || (ch = RightChar()) < '0' || ch > '9') break; newcapnum = newcapnum * 10 + (int)(ch - '0'); } if (capnum >= 0) return new RegexNode(RegexNode.Ref, _options, capnum); } else { int capnum = ScanDecimal(); if (!angled || CharsRight() > 0 && RightCharNext() == '}') { if (IsCaptureSlot(capnum)) return new RegexNode(RegexNode.Ref, _options, capnum); } } } else if (angled && RegexCharClass.IsWordChar(ch)) { String capname = ScanCapname(); if (CharsRight() > 0 && RightCharNext() == '}') { if (IsCaptureName(capname)) return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname)); } } else if (!angled) { int capnum = 1; switch (ch) { case '$': RightNext(); return new RegexNode(RegexNode.One, _options, '$'); case '&': capnum = 0; break; case '`': capnum = RegexReplacement.LeftPortion; break; case '\'': capnum = RegexReplacement.RightPortion; break; case '+': capnum = RegexReplacement.LastGroup; break; case '_': capnum = RegexReplacement.WholeString; break; } if (capnum != 1) { RightNext(); return new RegexNode(RegexNode.Ref, _options, capnum); } } // unrecognized $: literalize Textto(backpos); return new RegexNode(RegexNode.One, _options, '$'); } // Scans a capture name: consumes word chars internal String ScanCapname() { int startpos = Textpos(); while (CharsRight() > 0) { if (!RegexCharClass.IsWordChar(RightCharNext())) { LeftNext(); break; } } return _pattern.Substring(startpos, Textpos() - startpos); } // Scans up to three octal digits (stops before exceeding 0377). internal char ScanOctal() { int d; int i; int c; // Consume octal chars only up to 3 digits and value 0377 c = 3; if (c > CharsRight()) c = CharsRight(); for (i = 0; c > 0 && (uint)(d = RightChar() - '0') <= 7; c -= 1) { RightNext(); i *= 8; i += d; if (UseOptionE() && i >= 0x20) break; } // Octal codes only code from 0-127 i &= 0x7F; return(char)i; } // Scans any number of decimal digits (pegs value at 2^31-1 if too large) internal int ScanDecimal() { int i = 0; int d; while (CharsRight() > 0 && (uint)(d = (char)(RightChar() - '0')) <= 9) { RightNext(); if (i > (infinite / 10) || i == (infinite / 10) && d > (infinite % 10)) i = infinite; i *= 10; i += d; } return i; } // Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) internal char ScanHex(int c) { int i; int d; i = 0; if (CharsRight() >= c) { for (; c > 0 && ((d = HexDigit(RightCharNext())) >= 0); c -= 1) { i *= 0x10; i += d; } } if (c > 0) throw MakeException("Too few hex"); //XXX: SR.GetString(SR.TooFewHex)); return(char)i; } // Returns n <= 0xF for a hex digit. internal static int HexDigit(char ch) { int d; if ((uint)(d = ch - '0') <= 9) return d; if ((uint)(d = ch - 'a') <= 5) return d + 0xa; if ((uint)(d = ch - 'A') <= 5) return d + 0xa; return -1; } // Grabs and converts an ascii control character internal char ScanControl() { char ch; if (CharsRight() <= 0) throw MakeException("Missing control"); //XXX: SR.GetString(SR.MissingControl)); ch = RightCharNext(); // \ca interpreted as \cA if (ch >= 'a' && ch <= 'z') ch = (char)(ch - ('a' - 'A')); if ((ch = (char)(ch - '@')) < ' ') return ch; throw MakeException("Unrecognized Control"); //XXX: SR.GetString(SR.UnrecognizedControl)); } // Returns true for options allowed only at the top level internal bool IsOnlyTopOption(RegexOptions option) { return(option == RegexOptions.RightToLeft || option == RegexOptions.Compiled || option == RegexOptions.CultureInvariant || option == RegexOptions.ECMAScript ); } // Scans cimsx-cimsx option string, stops at the first unrecognized char. internal void ScanOptions() { char ch; bool off; RegexOptions option; for (off = false; CharsRight() > 0; RightNext()) { ch = RightChar(); if (ch == '-') { off = true; } else if (ch == '+') { off = false; } else { option = OptionFromCode(ch); if (option == 0 || IsOnlyTopOption(option)) return; if (off) _options &= ~option; else _options |= option; } } } // Scans \ code for escape codes that map to single unicode chars. internal char ScanCharEscape() { char ch; ch = RightCharNext(); if (ch >= '0' && ch <= '7') { LeftNext(); return ScanOctal(); } switch (ch) { case 'x': return ScanHex(2); case 'u': return ScanHex(4); case 'a': return '\u0007'; case 'b': return '\b'; case 'e': return '\u001B'; case 'f': return '\f'; case 'n': return '\n'; case 'r': return '\r'; case 't': return '\t'; case 'v': return '\u000B'; case 'c': return ScanControl(); default: if ( !UseOptionE() && RegexCharClass.IsWordChar(ch)) throw MakeException("Unrecognized Escape"); //XXX: SR.GetString(SR.UnrecognizedEscape, ch.ToString())); return ch; } } // Scans X for \p{X} or \P{X} internal String ParseProperty() { if (CharsRight() < 3) { throw MakeException("Incomplete Slash P"); //XXX: SR.GetString(SR.IncompleteSlashP)); } char ch = RightCharNext(); if (ch != '{') { throw MakeException("Malformed Slash P"); //XXX: SR.GetString(SR.MalformedSlashP)); } String capname = ScanCapname(); if (CharsRight() == 0 || RightCharNext() != '}') throw MakeException("Incomplete Slash P"); //XXX: SR.GetString(SR.IncompleteSlashP)); return capname; } // Returns ReNode type for zero-length assertions with a \ code. internal int TypeFromCode(char ch) { switch (ch) { case 'b': return UseOptionE() ? RegexNode.ECMABoundary : RegexNode.Boundary; case 'B': return UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.Nonboundary; case 'A': return RegexNode.Beginning; case 'G': return RegexNode.Start; case 'Z': return RegexNode.EndZ; case 'z': return RegexNode.End; default: return RegexNode.Nothing; } } // Returns option bit from single-char (?cimsx) code. internal static RegexOptions OptionFromCode(char ch) { // case-insensitive if (ch >= 'A' && ch <= 'Z') ch += (char)('a' - 'A'); switch (ch) { case 'c': return RegexOptions.Compiled; case 'i': return RegexOptions.IgnoreCase; case 'r': return RegexOptions.RightToLeft; case 'm': return RegexOptions.Multiline; case 'n': return RegexOptions.ExplicitCapture; case 's': return RegexOptions.Singleline; case 'x': return RegexOptions.IgnorePatternWhitespace; #if DBG case 'd': return RegexOptions.Debug; #endif case 'e': return RegexOptions.ECMAScript; default: return 0; } } // a prescanner for deducing the slots used for // captures by doing a partial tokenization of the pattern. internal void CountCaptures() { char ch; NoteCaptureSlot(0, 0); _autocap = 1; while (CharsRight() > 0) { int pos = Textpos(); ch = RightCharNext(); switch (ch) { case '\\': if (CharsRight() > 0) RightNext(); break; case '#': if (UseOptionX()) { LeftNext(); ScanBlank(); } break; case '[': ScanCharClass(false, true); break; case ')': if (!EmptyOptionsStack()) PopOptions(); break; case '(': if (CharsRight() >= 2 && RightChar(1) == '#' && RightChar() == '?') { LeftNext(); ScanBlank(); } else { PushOptions(); if (CharsRight() > 0 && RightChar() == '?') { // we have (?... RightNext(); if (CharsRight() > 1 && (RightChar() == '<' || RightChar() == '\'')) { // named group: (?<... or (?'... RightNext(); ch = RightChar(); if (ch != '0' && RegexCharClass.IsWordChar(ch)) { //if (_ignoreNextParen) // throw MakeException(SR.GetString(SR.AlternationCantCapture)); if (ch >= '1' && ch <= '9') NoteCaptureSlot(ScanDecimal(), pos); else NoteCaptureName(ScanCapname(), pos); } } else { // (?... // get the options if it's an option construct (?cimsx-cimsx...) ScanOptions(); if (CharsRight() > 0) { if (RightChar() == ')') { // (?cimsx-cimsx) RightNext(); PopKeepOptions(); } else if (RightChar() == '(') { // alternation construct: (?(foo)yes|no) // ignore the next paren so we don't capture the condition _ignoreNextParen = true; // break from here so we don't reset _ignoreNextParen break; } } } } else { if (!UseOptionN() && !_ignoreNextParen) NoteCaptureSlot(_autocap++, pos); } } _ignoreNextParen = false; break; } } AssignNameSlots(); } // Notes a used capture slot internal void NoteCaptureSlot(int i, int pos) { if (!_caps.ContainsKey(i)) { // the rhs of the hashtable isn't used in the parser _caps.Add(i, pos); _capcount++; if (_captop <= i) _captop = i + 1; } } // Notes a used capture slot internal void NoteCaptureName(String name, int pos) { if (_capnames == null) { _capnames = new Hashtable(); _capnamelist = new ArrayList(); } if (!_capnames.ContainsKey(name)) { _capnames.Add(name, pos); _capnamelist.Add(name); } } // For when all the used captures are known: note them all at once internal void NoteCaptures(Hashtable caps, int capsize, Hashtable capnames) { _caps = caps; _capsize = capsize; _capnames = capnames; } // Assigns unused slot numbers to the capture names internal void AssignNameSlots() { if (_capnames != null) { for (int i = 0; i < _capnamelist.Count; i++) { while (IsCaptureSlot(_autocap)) _autocap++; string name = (string)_capnamelist[i]; int pos = (int)_capnames[name]; _capnames[name] = _autocap; NoteCaptureSlot(_autocap, pos); _autocap++; } } // if the caps array has at least one gap, construct the list of used slots if (_capcount < _captop) { _capnumlist = new Object[_capcount]; int i = 0; for (IDictionaryEnumerator de = _caps.GetEnumerator(); de.MoveNext(); ) _capnumlist[i++] = de.Key; //XXX: System.Array.Sort(_capnumlist, InvariantComparer.Default); System.Array.Sort(_capnumlist, System.Collections.Comparer.Default); } // merge capsnumlist into capnamelist if (_capnames != null || _capnumlist != null) { ArrayList oldcapnamelist; int next; int k = 0; if (_capnames == null) { oldcapnamelist = null; _capnames = new Hashtable(); _capnamelist = new ArrayList(); next = -1; } else { oldcapnamelist = _capnamelist; _capnamelist = new ArrayList(); next = (int)_capnames[oldcapnamelist[0]]; } for (int i = 0; i < _capcount; i++) { int j = (_capnumlist == null) ? i : (int)_capnumlist[i]; if (next == j) { _capnamelist.Add((String)oldcapnamelist[k++]); next = (k == oldcapnamelist.Count) ? -1 : (int)_capnames[oldcapnamelist[k]]; } else { String str = j.ToString(); //Convert.ToString(j); _capnamelist.Add(str); _capnames[str] = j; } } } } // Looks up the slot number for a given name internal int CaptureSlotFromName(String capname) { return(int)_capnames[capname]; } // True if the capture slot was noted internal bool IsCaptureSlot(int i) { if (_caps != null) return _caps.ContainsKey(i); return(i >= 0 && i < _capsize); } // Looks up the slot number for a given name internal bool IsCaptureName(String capname) { if (_capnames == null) return false; return _capnames.ContainsKey(capname); } // True if N option disabling '(' autocapture is on. internal bool UseOptionN() { return(_options & RegexOptions.ExplicitCapture) != 0; } // True if I option enabling case-insensitivity is on. internal bool UseOptionI() { return(_options & RegexOptions.IgnoreCase) != 0; } // True if M option altering meaning of $ and ^ is on. internal bool UseOptionM() { return(_options & RegexOptions.Multiline) != 0; } // True if S option altering meaning of . is on. internal bool UseOptionS() { return(_options & RegexOptions.Singleline) != 0; } // True if X option enabling whitespace/comment mode is on. internal bool UseOptionX() { return(_options & RegexOptions.IgnorePatternWhitespace) != 0; } // True if E option enabling ECMAScript behavior is on. internal bool UseOptionE() { return(_options & RegexOptions.ECMAScript) != 0; } internal const byte Q = 5; // quantifier internal const byte S = 4; // ordinary stopper internal const byte Z = 3; // ScanBlank stopper internal const byte X = 2; // whitespace internal const byte E = 1; // should be escaped // For categorizing ascii characters. internal static readonly byte[] _category = new byte[] { // 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0,0,0,0,0,0,0,0,0,X,X,0,X,X,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? X,0,0,Z,S,0,0,0,S,S,Q,Q,0,0,S,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Q, // @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,S,S,0,S,0, // ' a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Q,S,0,0,0}; // Returns true for those characters that terminate a string of ordinary chars. internal static bool IsSpecial(char ch) { return(ch <= '|' && _category[ch] >= S); } // Returns true for those characters that terminate a string of ordinary chars. internal static bool IsStopperX(char ch) { return(ch <= '|' && _category[ch] >= X); } // Returns true for those characters that begin a quantifier. internal static bool IsQuantifier(char ch) { return(ch <= '{' && _category[ch] >= Q); } internal bool IsTrueQuantifier() { int nChars = CharsRight(); if (nChars == 0) return false; int startpos = Textpos(); char ch = CharAt(startpos); if (ch != '{') return ch <= '{' && _category[ch] >= Q; int pos = startpos; while (--nChars > 0 && (ch = CharAt(++pos)) >= '0' && ch <= '9') ; if (nChars == 0 || pos - startpos == 1) return false; if (ch == '}') return true; if (ch != ',') return false; while (--nChars > 0 && (ch = CharAt(++pos)) >= '0' && ch <= '9') ; return nChars > 0 && ch == '}'; } // Returns true for whitespace. internal static bool IsSpace(char ch) { return(ch <= ' ' && _category[ch] == X); } // Returns true for chars that should be escaped. internal static bool IsMetachar(char ch) { return(ch <= '|' && _category[ch] >= E); } // Add a string to the last concatenate. internal void AddConcatenate(int pos, int cch, bool isReplacement) { RegexNode node; if (cch == 0) return; if (cch > 1) { String str = _pattern.Substring(pos, cch); if (UseOptionI() && !isReplacement) str = str.ToLower();//_culture); node = new RegexNode(RegexNode.Multi, _options, str); } else { char ch = _pattern[pos]; if (UseOptionI() && !isReplacement) ch = Char.ToLower(ch);//, _culture); node = new RegexNode(RegexNode.One, _options, ch); } _concatenation.AddChild(node); } // Push the parser state (in response to an open paren) internal void PushGroup() { _group._next = _stack; _alternation._next = _group; _concatenation._next = _alternation; _stack = _concatenation; } // Remember the pushed state (in response to a ')') internal void PopGroup() { _concatenation = _stack; _alternation = _concatenation._next; _group = _alternation._next; _stack = _group._next; // The first () inside a Testgroup group goes directly to the group if (_group.Type() == RegexNode.Testgroup && _group.ChildCount() == 0) { if (_unit == null) throw MakeException("Illegal Condition"); // XXX: SR.GetString(SR.IllegalCondition)); _group.AddChild(_unit); _unit = null; } } // True if the group stack is empty. internal bool EmptyStack() { return _stack == null; } // Start a new round for the parser state (in response to an open paren or string start) internal void StartGroup(RegexNode openGroup) { _group = openGroup; _alternation = new RegexNode(RegexNode.Alternate, _options); _concatenation = new RegexNode(RegexNode.Concatenate, _options); } // Finish the current concatenation (in response to a |) internal void AddAlternate() { // The | parts inside a Testgroup group go directly to the group if (_group.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref) { _group.AddChild(_concatenation.ReverseLeft()); } else { _alternation.AddChild(_concatenation.ReverseLeft()); } _concatenation = new RegexNode(RegexNode.Concatenate, _options); } // Finish the current quantifiable (when a quantifier is not found or is not possible) internal void AddConcatenate() { // The first (| inside a Testgroup group goes directly to the group _concatenation.AddChild(_unit); _unit = null; } // Finish the current quantifiable (when a quantifier is found) internal void AddConcatenate(bool lazy, int min, int max) { _concatenation.AddChild(_unit.MakeQuantifier(lazy, min, max)); _unit = null; } // Returns the current unit internal RegexNode Unit() { return _unit; } // Sets the current unit to a single char node internal void AddUnitOne(char ch) { if (UseOptionI()) ch = Char.ToLower(ch);//, _culture); _unit = new RegexNode(RegexNode.One, _options, ch); } // Sets the current unit to a single inverse-char node internal void AddUnitNotone(char ch) { if (UseOptionI()) ch = Char.ToLower(ch);//, _culture); _unit = new RegexNode(RegexNode.Notone, _options, ch); } // Sets the current unit to a single set node internal void AddUnitSet(RegexCharClass cc) { _unit = new RegexNode(RegexNode.Set, _options, cc.ToSetCi(UseOptionI(), _culture), cc.Category); } // Sets the current unit to a subtree internal void AddUnitNode(RegexNode node) { _unit = node; } // Sets the current unit to an assertion of the specified type internal void AddUnitType(int type) { _unit = new RegexNode(type, _options); } // Finish the current group (in response to a ')' or end) internal void AddGroup() { if (_group.Type() == RegexNode.Testgroup || _group.Type() == RegexNode.Testref) { _group.AddChild(_concatenation.ReverseLeft()); if (_group.Type() == RegexNode.Testref && _group.ChildCount() > 2 || _group.ChildCount() > 3) throw MakeException("Too many alternatives"); //SR.GetString(SR.TooManyAlternates)); } else { _alternation.AddChild(_concatenation.ReverseLeft()); _group.AddChild(_alternation); } /* */ _unit = _group; } // Saves options on a stack. internal void PushOptions() { _optionsStack.Add(_options); } // Recalls options from the stack. internal void PopOptions() { _options = (RegexOptions) _optionsStack[_optionsStack.Count - 1]; _optionsStack.RemoveAt(_optionsStack.Count - 1); } // True if options stack is empty. internal bool EmptyOptionsStack() { return(_optionsStack.Count == 0); } // Pops the option stack, but keeps the current options unchanged. internal void PopKeepOptions() { _optionsStack.RemoveAt(_optionsStack.Count - 1); } // Fills in an ArgumentException internal ArgumentException MakeException(String message) { // XXX: return new ArgumentException(SR.GetString(SR.MakeException, _pattern, message), _pattern); return new ArgumentException(message, _pattern); } // Returns the current parsing position. internal int Textpos() { return _currentPos; } // Zaps to a specific parsing position. internal void Textto(int pos) { _currentPos = pos; } // Returns the char at the right of the current parsing position and advances to the right. internal char RightCharNext() { return _pattern[_currentPos++]; } // Returns the char at the right of the current parsing position and advances to the right. internal char RightNext() { return _pattern[_currentPos++]; } // Moves the current parsing position one to the left. internal void LeftNext() { --_currentPos; } // Returns the char left of the current parsing position. internal char CharAt(int i) { return _pattern[i]; } // Returns the char right of the current parsing position. internal char RightChar() { return _pattern[_currentPos]; } // Returns the char i chars right of the current parsing position. internal char RightChar(int i) { return _pattern[_currentPos + i]; } // Number of characters to the right of the current parsing position. internal int CharsRight() { return _pattern.Length - _currentPos; } // Number of characters to the left of the current parsing position. //internal int CharsLeft() { // return _currentPos; //} // Returns the char left of the current parsing position. //internal char LeftChar() { // return _pattern[_currentPos - 1]; //} } }