//------------------------------------------------------------------------------ // // // Copyright (c) Microsoft Corporation. All rights reserved. // // //------------------------------------------------------------------------------ // This RegexInterpreter class is internal to the RegularExpression package. // It executes a block of regular expression codes while consuming // input. // // Implementation notes: // // #define ECMA namespace System.Text.RegularExpressions { using System.Collections; using System.Diagnostics; using System.Globalization; internal sealed class RegexInterpreter : RegexRunner { internal int runoperator; internal int [] runcodes; internal int runcodepos; internal String [] runstrings; internal RegexCode runcode; internal RegexPrefix runfcPrefix; internal RegexPrefix runscPrefix; internal RegexBoyerMoore runbmPrefix; internal int runanchors; internal bool runrtl; internal bool runci; internal CultureInfo runculture; internal const int infinite = RegexNode.infinite; internal RegexInterpreter(RegexCode code, CultureInfo culture) { runcode = code; runcodes = code._codes; runstrings = code._strings; runfcPrefix = code._fcPrefix; runscPrefix = code._scPrefix; runbmPrefix = code._bmPrefix; runanchors = code._anchors; runculture = culture; } protected override void InitTrackCount() { runtrackcount = runcode._trackcount; } private void Advance() { Advance(0); } private void Advance(int i) { runcodepos += (i + 1); SetOperator(runcodes[runcodepos]); } private void Goto(int newpos) { // when branching backward, ensure storage if (newpos < runcodepos) EnsureStorage(); SetOperator(runcodes[newpos]); runcodepos = newpos; } private void Textto(int newpos) { runtextpos = newpos; } private void Trackto(int newpos) { runtrackpos = runtrack.Length - newpos; } private int Textstart() { return runtextstart; } private int Textpos() { return runtextpos; } // push onto the backtracking stack private int Trackpos() { return runtrack.Length - runtrackpos; } private void Track() { runtrack[--runtrackpos] = runcodepos; } private void Track(int I1) { runtrack[--runtrackpos] = I1; runtrack[--runtrackpos] = runcodepos; } private void Track(int I1, int I2) { runtrack[--runtrackpos] = I1; runtrack[--runtrackpos] = I2; runtrack[--runtrackpos] = runcodepos; } private void Track(int I1, int I2, int I3) { runtrack[--runtrackpos] = I1; runtrack[--runtrackpos] = I2; runtrack[--runtrackpos] = I3; runtrack[--runtrackpos] = runcodepos; } private void Track2(int I1) { runtrack[--runtrackpos] = I1; runtrack[--runtrackpos] = -runcodepos; } private void Track2(int I1, int I2) { runtrack[--runtrackpos] = I1; runtrack[--runtrackpos] = I2; runtrack[--runtrackpos] = -runcodepos; } private void Backtrack() { int newpos = runtrack[runtrackpos++]; if (newpos < 0) { newpos = -newpos; SetOperator(runcodes[newpos] | RegexCode.Back2); } else { SetOperator(runcodes[newpos] | RegexCode.Back); } // When branching backward, ensure storage if (newpos < runcodepos) EnsureStorage(); runcodepos = newpos; } private void SetOperator(int op) { runci = (0 != (op & RegexCode.Ci)); runrtl = (0 != (op & RegexCode.Rtl)); runoperator = op & ~(RegexCode.Rtl | RegexCode.Ci); } // pop framesize items from the backtracking stack private void Trackframe(int framesize) { runtrackpos += framesize; } // get the ith element down on the backtracking stack private int Tracked(int i) { return runtrack[runtrackpos - i - 1]; } // Push onto the grouping stack private void Stack(int I1) { runstack[--runstackpos] = I1; } private void Stack(int I1, int I2) { runstack[--runstackpos] = I1; runstack[--runstackpos] = I2; } // pop framesize items from the grouping stack private void Stackframe(int framesize) { runstackpos += framesize; } // get the ith element down on the grouping stack private int Stacked(int i) { return runstack[runstackpos - i - 1]; } private int Operator() { return runoperator; } private int Operand(int i) { return runcodes[runcodepos + i + 1]; } private int Leftchars() { return runtextpos - runtextbeg; } private int Rightchars() { return runtextend - runtextpos; } private int Bump() { return runrtl ? -1 : 1; } private int Forwardchars() { return runrtl ? runtextpos - runtextbeg : runtextend - runtextpos; } private char Forwardcharnext() { char ch = (runrtl ? runtext[--runtextpos] : runtext[runtextpos++]); //return(runci ? Char.ToLower(ch, runculture) : ch); return(runci ? Char.ToLower(ch) : ch); } private bool Stringmatch(String str) { int c; int pos; if (!runrtl) { if (runtextend - runtextpos < (c = str.Length)) return false; pos = runtextpos + c; } else { if (runtextpos - runtextbeg < (c = str.Length)) return false; pos = runtextpos; } if (!runci) { while (c != 0) if (str[--c] != runtext[--pos]) return false; } else { while (c != 0) if (str[--c] != Char.ToLower(runtext[--pos]))//, runculture)) return false; } if (!runrtl) { pos += str.Length; } runtextpos = pos; return true; } private bool Refmatch(int index, int len) { int c; int pos; int cmpos; if (!runrtl) { if (runtextend - runtextpos < len) return false; pos = runtextpos + len; } else { if (runtextpos - runtextbeg < len) return false; pos = runtextpos; } cmpos = index + len; c = len; if (!runci) { while (c-- != 0) if (runtext[--cmpos] != runtext[--pos]) return false; } else { while (c-- != 0) // if (Char.ToLower(runtext[--cmpos], runculture) != Char.ToLower(runtext[--pos], runculture)) if (Char.ToLower(runtext[--cmpos]) != Char.ToLower(runtext[--pos])) return false; } if (!runrtl) { pos += len; } runtextpos = pos; return true; } private void Backwardnext() { runtextpos += runrtl ? 1 : -1; } private char CharAt(int j) { return runtext[j]; } protected override bool FindFirstChar() { int i; String set; if (0 != (runanchors & (RegexFCD.Beginning | RegexFCD.Start | RegexFCD.EndZ | RegexFCD.End))) { if (!runcode._rightToLeft) { if ((0 != (runanchors & RegexFCD.Beginning) && runtextpos > runtextbeg) || (0 != (runanchors & RegexFCD.Start) && runtextpos > runtextstart)) { runtextpos = runtextend; return false; } if (0 != (runanchors & RegexFCD.EndZ) && runtextpos < runtextend - 1) { runtextpos = runtextend - 1; } else if (0 != (runanchors & RegexFCD.End) && runtextpos < runtextend) { runtextpos = runtextend; } } else { if ((0 != (runanchors & RegexFCD.End) && runtextpos < runtextend) || (0 != (runanchors & RegexFCD.EndZ) && (runtextpos < runtextend - 1 || (runtextpos == runtextend - 1 && CharAt(runtextpos) != '\n'))) || (0 != (runanchors & RegexFCD.Start) && runtextpos < runtextstart)) { runtextpos = runtextbeg; return false; } if (0 != (runanchors & RegexFCD.Beginning) && runtextpos > runtextbeg) { runtextpos = runtextbeg; } } if (runbmPrefix != null) { return runbmPrefix.IsMatch(runtext, runtextpos, runtextbeg, runtextend); } } else if (runbmPrefix != null) { runtextpos = runbmPrefix.Scan(runtext, runtextpos, runtextbeg, runtextend); if (runtextpos == -1) { runtextpos = (runcode._rightToLeft ? runtextbeg : runtextend); return false; } return true; } if (runfcPrefix == null) return true; runrtl = runcode._rightToLeft; runci = runfcPrefix.CaseInsensitive; set = runfcPrefix.Prefix; if (RegexCharClass.IsSingleton(set)) { char ch = RegexCharClass.SingletonChar(set); for (i = Forwardchars(); i > 0; i--) { if (ch == Forwardcharnext()) { Backwardnext(); return true; } } } else { for (i = Forwardchars(); i > 0; i--) { if (RegexCharClass.CharInSet(Forwardcharnext(), set, String.Empty)) { Backwardnext(); return true; } } } return false; } protected override void Go() { Goto(0); for (;;) { #if DBG if (runmatch.Debug) { DumpState(); } #endif switch (Operator()) { case RegexCode.Stop: return; case RegexCode.Nothing: break; case RegexCode.Goto: Goto(Operand(0)); continue; case RegexCode.Testref: if (!IsMatched(Operand(0))) break; Advance(1); continue; case RegexCode.Lazybranch: Track(Textpos()); Advance(1); continue; case RegexCode.Lazybranch | RegexCode.Back: Trackframe(1); Textto(Tracked(0)); Goto(Operand(0)); continue; case RegexCode.Setmark: Stack(Textpos()); Track(); Advance(); continue; case RegexCode.Nullmark: Stack(-1); Track(); Advance(); continue; case RegexCode.Setmark | RegexCode.Back: case RegexCode.Nullmark | RegexCode.Back: Stackframe(1); break; case RegexCode.Getmark: Stackframe(1); Track(Stacked(0)); Textto(Stacked(0)); Advance(); continue; case RegexCode.Getmark | RegexCode.Back: Trackframe(1); Stack(Tracked(0)); break; case RegexCode.Capturemark: if (Operand(1) != -1 && !IsMatched(Operand(1))) break; Stackframe(1); if (Operand(1) != -1) TransferCapture(Operand(0), Operand(1), Stacked(0), Textpos()); else Capture(Operand(0), Stacked(0), Textpos()); Track(Stacked(0)); Advance(2); /* */ continue; case RegexCode.Capturemark | RegexCode.Back: Trackframe(1); Stack(Tracked(0)); Uncapture(); if (Operand(0) != -1 && Operand(1) != -1) Uncapture(); break; case RegexCode.Branchmark: { int matched; Stackframe(1); matched = Textpos() - Stacked(0); if (matched != 0) { // Nonempty match -> loop now Track(Stacked(0), Textpos()); // Save old mark, textpos Stack(Textpos()); // Make new mark Goto(Operand(0)); // Loop } else { // Empty match -> straight now Track2(Stacked(0)); // Save old mark Advance(1); // Straight } continue; } case RegexCode.Branchmark | RegexCode.Back: Trackframe(2); Stackframe(1); Textto(Tracked(1)); // Recall position Track2(Tracked(0)); // Save old mark Advance(1); // Straight continue; case RegexCode.Branchmark | RegexCode.Back2: Trackframe(1); Stack(Tracked(0)); // Recall old mark break; // Backtrack case RegexCode.Lazybranchmark: { int matched; Stackframe(1); matched = Textpos() - Stacked(0); if (matched != 0) { // Nonempty match -> next loop Track(Stacked(0), Textpos()); // Save old mark, textpos } else { // Empty match -> no loop Track2(Stacked(0)); // Save old mark } Advance(1); continue; } case RegexCode.Lazybranchmark | RegexCode.Back: { int pos; Trackframe(2); pos = Tracked(1); Track2(Tracked(0)); // Save old mark Stack(pos); // Make new mark Textto(pos); // Recall position Goto(Operand(0)); // Loop continue; } case RegexCode.Lazybranchmark | RegexCode.Back2: Stackframe(1); Trackframe(1); Stack(Tracked(0)); // Recall old mark break; case RegexCode.Setcount: Stack(Textpos(), Operand(0)); Track(); Advance(1); continue; case RegexCode.Nullcount: Stack(-1, Operand(0)); Track(); Advance(1); continue; case RegexCode.Setcount | RegexCode.Back: Stackframe(2); break; case RegexCode.Nullcount | RegexCode.Back: Stackframe(2); break; case RegexCode.Branchcount: // Stack: // 0: Mark // 1: Count { Stackframe(2); int mark = Stacked(0); int count = Stacked(1); int matched = Textpos() - mark; if (count >= Operand(1) || (matched == 0 && count >= 0)) { // Max loops or empty match -> straight now Track2(mark, count); // Save old mark, count Advance(2); // Straight } else { // Nonempty match -> count+loop now Track(mark); // remember mark Stack(Textpos(), count + 1); // Make new mark, incr count Goto(Operand(0)); // Loop } continue; } case RegexCode.Branchcount | RegexCode.Back: // Track: // 0: Previous mark // Stack: // 0: Mark (= current pos, discarded) // 1: Count Trackframe(1); Stackframe(2); if (Stacked(1) > 0) { // Positive -> can go straight Textto(Stacked(0)); // Zap to mark Track2(Tracked(0), Stacked(1) - 1); // Save old mark, old count Advance(2); // Straight continue; } Stack(Tracked(0), Stacked(1) - 1); // recall old mark, old count break; case RegexCode.Branchcount | RegexCode.Back2: // Track: // 0: Previous mark // 1: Previous count Trackframe(2); Stack(Tracked(0), Tracked(1)); // Recall old mark, old count break; // Backtrack case RegexCode.Lazybranchcount: // Stack: // 0: Mark // 1: Count { Stackframe(2); int mark = Stacked(0); int count = Stacked(1); if (count < 0) { // Negative count -> loop now Track2(mark); // Save old mark Stack(Textpos(), count + 1); // Make new mark, incr count Goto(Operand(0)); // Loop } else { // Nonneg count -> straight now Track(mark, count, Textpos()); // Save mark, count, position Advance(2); // Straight } continue; } case RegexCode.Lazybranchcount | RegexCode.Back: // Track: // 0: Mark // 1: Count // 2: Textpos { Trackframe(3); int mark = Tracked(0); int textpos = Tracked(2); if (Tracked(1) <= Operand(1) && textpos != mark) { // Under limit and not empty match -> loop Textto(textpos); // Recall position Stack(textpos, Tracked(1) + 1); // Make new mark, incr count Track2(mark); // Save old mark Goto(Operand(0)); // Loop continue; } else { // Max loops or empty match -> backtrack Stack(Tracked(0), Tracked(1)); // Recall old mark, count break; // backtrack } } case RegexCode.Lazybranchcount | RegexCode.Back2: // Track: // 0: Previous mark // Stack: // 0: Mark (== current pos, discarded) // 1: Count Trackframe(1); Stackframe(2); Stack(Tracked(0), Stacked(1) - 1); // Recall old mark, count break; // Backtrack case RegexCode.Setjump: Stack(Trackpos(), Crawlpos()); Track(); Advance(); continue; case RegexCode.Setjump | RegexCode.Back: Stackframe(2); break; case RegexCode.Backjump: // Stack: // 0: Saved trackpos // 1: Crawlpos Stackframe(2); Trackto(Stacked(0)); while (Crawlpos() != Stacked(1)) Uncapture(); break; case RegexCode.Forejump: // Stack: // 0: Saved trackpos // 1: Crawlpos Stackframe(2); Trackto(Stacked(0)); Track(Stacked(1)); Advance(); continue; case RegexCode.Forejump | RegexCode.Back: // Track: // 0: Crawlpos Trackframe(1); while (Crawlpos() != Tracked(0)) Uncapture(); break; case RegexCode.Bol: if (Leftchars() > 0 && CharAt(Textpos() - 1) != '\n') break; Advance(); continue; case RegexCode.Eol: if (Rightchars() > 0 && CharAt(Textpos()) != '\n') break; Advance(); continue; case RegexCode.Boundary: if (!IsBoundary(Textpos(), runtextbeg, runtextend)) break; Advance(); continue; case RegexCode.Nonboundary: if (IsBoundary(Textpos(), runtextbeg, runtextend)) break; Advance(); continue; case RegexCode.ECMABoundary: if (!IsECMABoundary(Textpos(), runtextbeg, runtextend)) break; Advance(); continue; case RegexCode.NonECMABoundary: if (IsECMABoundary(Textpos(), runtextbeg, runtextend)) break; Advance(); continue; case RegexCode.Beginning: if (Leftchars() > 0) break; Advance(); continue; case RegexCode.Start: if (Textpos() != Textstart()) break; Advance(); continue; case RegexCode.EndZ: if (Rightchars() > 1 || Rightchars() == 1 && CharAt(Textpos()) != '\n') break; Advance(); continue; case RegexCode.End: if (Rightchars() > 0) break; Advance(); continue; case RegexCode.One: if (Forwardchars() < 1 || Forwardcharnext() != (char)Operand(0)) break; Advance(1); continue; case RegexCode.Notone: if (Forwardchars() < 1 || Forwardcharnext() == (char)Operand(0)) break; Advance(1); continue; case RegexCode.Set: if (Forwardchars() < 1 || !RegexCharClass.CharInSet(Forwardcharnext(), runstrings[Operand(0)], runstrings[Operand(1)])) break; Advance(2); continue; case RegexCode.Multi: { if (!Stringmatch(runstrings[Operand(0)])) break; Advance(1); continue; } case RegexCode.Ref: { int capnum = Operand(0); if (IsMatched(capnum)) { if (!Refmatch(MatchIndex(capnum), MatchLength(capnum))) break; } else { if ((runregex.roptions & RegexOptions.ECMAScript) == 0) break; } Advance(1); continue; } case RegexCode.Onerep: { int c = Operand(1); if (Forwardchars() < c) break; char ch = (char)Operand(0); while (c-- > 0) if (Forwardcharnext() != ch) goto BreakBackward; Advance(2); continue; } case RegexCode.Notonerep: { int c = Operand(1); if (Forwardchars() < c) break; char ch = (char)Operand(0); while (c-- > 0) if (Forwardcharnext() == ch) goto BreakBackward; Advance(2); continue; } case RegexCode.Setrep: { int c = Operand(2); if (Forwardchars() < c) break; String set = runstrings[Operand(0)]; String cat = runstrings[Operand(1)]; while (c-- > 0) if (!RegexCharClass.CharInSet(Forwardcharnext(), set, cat)) goto BreakBackward; Advance(3); continue; } case RegexCode.Oneloop: { int c = Operand(1); if (c > Forwardchars()) c = Forwardchars(); char ch = (char)Operand(0); int i; for (i = c; i > 0; i--) { if (Forwardcharnext() != ch) { Backwardnext(); break; } } if (c > i) Track(c - i - 1, Textpos() - Bump()); Advance(2); continue; } case RegexCode.Notoneloop: { int c = Operand(1); if (c > Forwardchars()) c = Forwardchars(); char ch = (char)Operand(0); int i; for (i = c; i > 0; i--) { if (Forwardcharnext() == ch) { Backwardnext(); break; } } if (c > i) Track(c - i - 1, Textpos() - Bump()); Advance(2); continue; } case RegexCode.Setloop: { int c = Operand(2); if (c > Forwardchars()) c = Forwardchars(); String set = runstrings[Operand(0)]; String cat = runstrings[Operand(1)]; int i; for (i = c; i > 0; i--) { if (!RegexCharClass.CharInSet(Forwardcharnext(), set, cat)) { Backwardnext(); break; } } if (c > i) Track(c - i - 1, Textpos() - Bump()); Advance(3); continue; } case RegexCode.Oneloop | RegexCode.Back: case RegexCode.Notoneloop | RegexCode.Back: { Trackframe(2); int i = Tracked(0); int pos = Tracked(1); Textto(pos); if (i > 0) Track(i - 1, pos - Bump()); Advance(2); continue; } case RegexCode.Setloop | RegexCode.Back: { Trackframe(2); int i = Tracked(0); int pos = Tracked(1); Textto(pos); if (i > 0) Track(i - 1, pos - Bump()); Advance(3); continue; } case RegexCode.Onelazy: case RegexCode.Notonelazy: { int c = Operand(1); if (c > Forwardchars()) c = Forwardchars(); if (c > 0) Track(c - 1, Textpos()); Advance(2); continue; } case RegexCode.Setlazy: { int c = Operand(2); if (c > Forwardchars()) c = Forwardchars(); if (c > 0) Track(c - 1, Textpos()); Advance(3); continue; } case RegexCode.Onelazy | RegexCode.Back: { Trackframe(2); int pos = Tracked(1); Textto(pos); if (Forwardcharnext() != (char)Operand(0)) break; int i = Tracked(0); if (i > 0) Track(i - 1, pos + Bump()); Advance(2); continue; } case RegexCode.Notonelazy | RegexCode.Back: { Trackframe(2); int pos = Tracked(1); Textto(pos); if (Forwardcharnext() == (char)Operand(0)) break; int i = Tracked(0); if (i > 0) Track(i - 1, pos + Bump()); Advance(2); continue; } case RegexCode.Setlazy | RegexCode.Back: { Trackframe(2); int pos = Tracked(1); Textto(pos); if (!RegexCharClass.CharInSet(Forwardcharnext(), runstrings[Operand(0)], runstrings[Operand(1)])) break; int i = Tracked(0); if (i > 0) Track(i - 1, pos + Bump()); Advance(3); continue; } default: throw new Exception("Unimplemented State");//XXX: SR.GetString(SR.UnimplementedState)); //NotImplementedException("Unimplemented State");//XXX: SR.GetString(SR.UnimplementedState)); } BreakBackward: ; // "break Backward" comes here: Backtrack(); } } #if DBG public override void DumpState() { base.DumpState(); Debug.WriteLine(" " + runcode.OpcodeDescription(runcodepos) + ((runoperator & RegexCode.Back) != 0 ? " Back" : "") + ((runoperator & RegexCode.Back2) != 0 ? " Back2" : "")); } #endif } }