654 lines
22 KiB
C#
654 lines
22 KiB
C#
//------------------------------------------------------------------------------
|
|
// <copyright company='Microsoft Corporation'>
|
|
//
|
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
//
|
|
// </copyright>
|
|
//------------------------------------------------------------------------------
|
|
|
|
// This RegexFCD class is internal to the Regex package.
|
|
// It builds a bunch of FC information (RegexFC) about
|
|
// the regex for optimization purposes.
|
|
//
|
|
|
|
// Implementation notes:
|
|
//
|
|
// This step is as simple as walking the tree and emitting
|
|
// sequences of codes.
|
|
//
|
|
#define ECMA
|
|
|
|
namespace System.Text.RegularExpressions {
|
|
|
|
using System.Collections;
|
|
using System.Globalization;
|
|
|
|
internal sealed class RegexFCD {
|
|
// Explicit stacks that let RegexFCFromRegexTree walk the tree without recursion.
internal int[]     _intStack;    // child indices saved by PushInt
internal int       _intDepth;    // number of live entries in _intStack
internal RegexFC[] _fcStack;     // RegexFC values saved by PushFC
internal int       _fcDepth;     // number of live entries in _fcStack

// Walk-control flags, raised by EarlyExit() and SkipChild() respectively.
internal bool      _earlyexit;
internal bool      _skipchild;

// Flags OR'ed into a node type to tell CalculateFC whether it is being
// called before or after a child has been visited.
internal const int BeforeChild  = 64;
internal const int AfterChild   = 128;

// where the regex can be pegged (one bit per anchor kind)
internal const int Beginning    = 0x0001;
internal const int Bol          = 0x0002;
internal const int Start        = 0x0004;
internal const int Eol          = 0x0008;
internal const int EndZ         = 0x0010;
internal const int End          = 0x0020;
internal const int Boundary     = 0x0040;
internal const int ECMABoundary = 0x0080;

// Local alias for the loop-count sentinel defined by RegexCode.
internal const int infinite     = RegexCode.infinite;
|
|
|
|
// This is one of the only two functions that should be called from outside.
|
|
// It takes a RegexTree and computes the set of chars that can start it.
|
|
internal static RegexPrefix FirstChars(RegexTree t) {
    // Run the first-character analysis over the whole tree.
    RegexFC firstCharInfo = new RegexFCD().RegexFCFromRegexTree(t);

    // A nullable result yields no first-character set; report that as null.
    if (firstCharInfo._nullable) {
        return null;
    }

    // Case folding uses the invariant culture only when the tree asks for it.
    CultureInfo culture;
    if ((t._options & RegexOptions.CultureInvariant) != 0) {
        culture = CultureInfo.InvariantCulture;
    }
    else {
        culture = CultureInfo.CurrentCulture;
    }

    String firstChars = firstCharInfo.GetFirstChars(culture);
    return new RegexPrefix(firstChars, firstCharInfo.IsCaseInsensitive());
}
|
|
|
|
// This is a related computation: it takes a RegexTree and computes the
|
|
// leading substring if it sees one. It's quite trivial and gives up easily.
|
|
internal static RegexPrefix Prefix(RegexTree tree) {
    // Scans from the root for the first Multi / Oneloop / Onelazy node and
    // turns it into a literal prefix. Returns RegexPrefix.Empty as soon as
    // anything else that is not explicitly skipped is encountered.
    RegexNode node = tree._root;
    RegexNode sequence = null;   // Concatenate node whose children are being visited
    int childIndex = 0;          // next child of 'sequence' to look at

    while (true) {
        switch (node._type) {
            case RegexNode.Concatenate:
                // A non-empty concatenation becomes the sequence to iterate over.
                if (node.ChildCount() > 0) {
                    sequence = node;
                    childIndex = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look inside the group; siblings of the group are no longer visited.
                node = node.Child(0);
                sequence = null;
                continue;

            case RegexNode.Multi:
                // The literal string itself is the prefix.
                return new RegexPrefix(node._str, (node._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                // The prefix is _ch repeated _m times; with _m <= 0 there is none.
                if (node._m <= 0) {
                    return RegexPrefix.Empty;
                }
                return new RegexPrefix(new String(node._ch, node._m),
                                       (node._options & RegexOptions.IgnoreCase) != 0);

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // These node types are stepped over; move on to the next sibling.
                break;

            default:
                return RegexPrefix.Empty;
        }

        // Advance to the next child of the current sequence, if there is one.
        if (sequence == null || childIndex >= sequence.ChildCount()) {
            return RegexPrefix.Empty;
        }

        node = sequence.Child(childIndex);
        childIndex++;
    }
}
|
|
|
|
// This is a related computation: it takes a RegexTree and computes the
|
|
// leading []* construct if it sees one. It's quite trivial and gives up easily.
|
|
internal static RegexPrefix ScanChars(RegexTree tree) {
    // Scans from the root for a leading single-char / not-char / set loop whose
    // _n equals 'infinite' and returns its character set. Returns null when no
    // such loop is found before an unsupported node is reached.
    RegexNode node = tree._root;
    RegexNode sequence = null;   // Concatenate node whose children are being visited
    int childIndex = 0;          // next child of 'sequence' to look at
    String leadingSet = null;
    bool ignoreCase = false;

    while (true) {
        switch (node._type) {
            case RegexNode.Concatenate:
                if (node.ChildCount() > 0) {
                    sequence = node;
                    childIndex = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look inside the group; siblings of the group are no longer visited.
                node = node.Child(0);
                sequence = null;
                continue;

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // Stepped over; move on to the next sibling.
                break;

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                if (node._n != infinite) {
                    return null;
                }
                leadingSet = RegexCharClass.SetFromChar(node._ch);
                ignoreCase = (node._options & RegexOptions.IgnoreCase) != 0;
                break;

            case RegexNode.Notoneloop:
            case RegexNode.Notonelazy:
                if (node._n != infinite) {
                    return null;
                }
                leadingSet = RegexCharClass.SetInverseFromChar(node._ch);
                ignoreCase = (node._options & RegexOptions.IgnoreCase) != 0;
                break;

            case RegexNode.Setloop:
            case RegexNode.Setlazy:
                if (node._n != infinite) {
                    return null;
                }
                // A non-empty _str2 disqualifies the set.
                if (node._str2 != null && node._str2.Length != 0) {
                    return null;
                }
                leadingSet = node._str;
                ignoreCase = (node._options & RegexOptions.IgnoreCase) != 0;
                break;

            default:
                return null;
        }

        // A set was found on this iteration: done.
        if (leadingSet != null) {
            return new RegexPrefix(leadingSet, ignoreCase);
        }

        // Otherwise advance to the next child of the current sequence, if any.
        if (sequence == null || childIndex >= sequence.ChildCount()) {
            return null;
        }

        node = sequence.Child(childIndex);
        childIndex++;
    }
}
|
|
|
|
// Yet another related computation: it takes a RegexTree and computes the
|
|
// leading anchors that it encounters.
|
|
internal static int Anchors(RegexTree tree) {
    // Scans from the root and returns the anchor bit of the first anchor node
    // encountered, or 0 when a non-anchor node (or the end of the sequence)
    // is reached first. Only that first anchor is reported.
    RegexNode node = tree._root;
    RegexNode sequence = null;   // Concatenate node whose children are being visited
    int childIndex = 0;          // next child of 'sequence' to look at

    while (true) {
        switch (node._type) {
            case RegexNode.Concatenate:
                if (node.ChildCount() > 0) {
                    sequence = node;
                    childIndex = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look inside the group; siblings of the group are no longer visited.
                node = node.Child(0);
                sequence = null;
                continue;

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
                return AnchorFromType(node._type);

            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // Stepped over; move on to the next sibling.
                break;

            default:
                return 0;
        }

        if (sequence == null || childIndex >= sequence.ChildCount()) {
            return 0;
        }

        node = sequence.Child(childIndex);
        childIndex++;
    }
}
|
|
|
|
// Convert anchor type to anchor bit.
|
|
internal static int AnchorFromType(int type) {
    // Maps an anchor node type to its anchor bit; any other type maps to 0.
    int bit;

    switch (type) {
        case RegexNode.Bol:
            bit = Bol;
            break;
        case RegexNode.Eol:
            bit = Eol;
            break;
        case RegexNode.Boundary:
            bit = Boundary;
            break;
        case RegexNode.ECMABoundary:
            bit = ECMABoundary;
            break;
        case RegexNode.Beginning:
            bit = Beginning;
            break;
        case RegexNode.Start:
            bit = Start;
            break;
        case RegexNode.EndZ:
            bit = EndZ;
            break;
        case RegexNode.End:
            bit = End;
            break;
        default:
            bit = 0;
            break;
    }

    return bit;
}
|
|
|
|
#if DBG
internal static String AnchorDescription(int anchors) {
    // Debug helper: renders the set anchor bits as a comma-separated list,
    // or "None" when no known bit is set. The two tables are parallel and
    // fix the order in which names appear.
    int[] bits = {
        Beginning, Start, Bol, Boundary, ECMABoundary, Eol, End, EndZ
    };
    String[] labels = {
        ", Beginning", ", Start", ", Bol", ", Boundary",
        ", ECMABoundary", ", Eol", ", End", ", EndZ"
    };

    StringBuilder text = new StringBuilder();

    for (int i = 0; i < bits.Length; i++) {
        if ((anchors & bits[i]) != 0) {
            text.Append(labels[i]);
        }
    }

    if (text.Length < 2) {
        return "None";
    }

    // Drop the leading ", " separator.
    return text.ToString(2, text.Length - 2);
}
#endif
|
|
|
|
// private constructor; can't be created outside
|
|
private RegexFCD() {
    // Both stacks start with room for 32 entries; PushInt/PushFC grow them on demand.
    const int InitialCapacity = 32;

    _intStack = new int[InitialCapacity];
    _fcStack = new RegexFC[InitialCapacity];
}
|
|
|
|
// To avoid recursion, we use a simple integer stack.
|
|
// This is the push.
|
|
internal void PushInt(int I) {
    // Double the backing array when it is full, keeping the live entries.
    if (_intDepth >= _intStack.Length) {
        int[] grown = new int[_intDepth * 2];
        System.Array.Copy(_intStack, grown, _intDepth);
        _intStack = grown;
    }

    _intStack[_intDepth] = I;
    _intDepth++;
}
|
|
|
|
// True if the stack is empty.
|
|
internal bool EmptyInt() {
    // The int stack is empty when no entries are live.
    bool noEntries = (0 == _intDepth);
    return noEntries;
}
|
|
|
|
// This is the pop.
|
|
internal int PopInt() {
    // Step back to the most recently pushed slot and return its value.
    _intDepth--;
    return _intStack[_intDepth];
}
|
|
|
|
// We also use a stack of RegexFC objects.
|
|
// This is the push.
|
|
internal void PushFC(RegexFC fc) {
    // Double the backing array when it is full, keeping the live entries.
    if (_fcDepth >= _fcStack.Length) {
        RegexFC[] grown = new RegexFC[_fcDepth * 2];
        System.Array.Copy(_fcStack, grown, _fcDepth);
        _fcStack = grown;
    }

    _fcStack[_fcDepth] = fc;
    _fcDepth++;
}
|
|
|
|
// True if the stack is empty.
|
|
internal bool EmptyFC() {
    // The RegexFC stack is empty when no entries are live.
    bool noEntries = (0 == _fcDepth);
    return noEntries;
}
|
|
|
|
// This is the pop.
|
|
internal RegexFC PopFC() {
    // Step back to the most recently pushed slot and return its value.
    _fcDepth--;
    return _fcStack[_fcDepth];
}
|
|
|
|
// This is the top.
|
|
internal RegexFC TopFC() {
    // Peek at the most recently pushed entry without removing it.
    int topIndex = _fcDepth - 1;
    return _fcStack[topIndex];
}
|
|
|
|
// The main FC computation. It does a shortcutted depth-first walk
|
|
// through the tree and calls CalculateFC to emit code before
|
|
// and after each child of an interior node, and at each leaf.
|
|
internal RegexFC RegexFCFromRegexTree(RegexTree tree) {
    // Iterative depth-first walk. CalculateFC is invoked once for a node
    // without children, and before/after each visited child of a node with
    // children. The int stack remembers which child index to resume at.
    RegexNode node = tree._root;
    int childIndex = 0;

    while (true) {
        if (node._children != null) {
            // Interior node: visit the next child unless an early exit was requested.
            if (childIndex < node._children.Count && !_earlyexit) {
                CalculateFC(node._type | BeforeChild, node, childIndex);

                if (_skipchild) {
                    // CalculateFC asked to skip this child; move to the next one.
                    childIndex++;
                    _skipchild = false;
                }
                else {
                    // Descend, remembering where to resume in this node.
                    node = (RegexNode)node._children[childIndex];
                    PushInt(childIndex);
                    childIndex = 0;
                }

                continue;
            }
        }
        else {
            // Node without children: a single call with the bare node type.
            CalculateFC(node._type, node, 0);
        }

        // This node is finished; any pending early exit applied only to it.
        _earlyexit = false;

        if (EmptyInt()) {
            break;
        }

        // Resume one level up via _next (presumably the parent node -- confirm in RegexNode).
        childIndex = PopInt();
        node = node._next;

        CalculateFC(node._type | AfterChild, node, childIndex);
        childIndex++;
    }

    // If nothing was pushed, fall back to a nullable FC built from RegexCharClass.Any.
    return EmptyFC() ? new RegexFC(RegexCharClass.Any, true, false) : PopFC();
}
|
|
|
|
// Called in AfterChild to prevent processing of the rest of the children at the current level
|
|
internal void EarlyExit() {
    // Raise the flag that RegexFCFromRegexTree checks before visiting another child.
    this._earlyexit = true;
}
|
|
|
|
// Called in BeforeChild to prevent further processing of the current child
|
|
internal void SkipChild() {
    // Raise the flag that makes RegexFCFromRegexTree step over the current child.
    this._skipchild = true;
}
|
|
|
|
// FC computation and shortcut cases for each node type
|
|
internal void CalculateFC(int NodeType, RegexNode node, int CurIndex) {
    // NodeType is either a bare node type or a node type OR'ed with
    // BeforeChild / AfterChild; CurIndex is the index of the child being
    // visited. Results are accumulated on the RegexFC stack.

    // Node options are consulted only for opcodes not greater than RegexNode.Ref.
    bool optionsApply = NodeType <= RegexNode.Ref;
    bool ci = optionsApply && (node._options & RegexOptions.IgnoreCase) != 0;
    bool rtl = optionsApply && (node._options & RegexOptions.RightToLeft) != 0;

    switch (NodeType) {
        // ---- Visits that require no action ----
        case RegexNode.Concatenate | BeforeChild:
        case RegexNode.Alternate | BeforeChild:
        case RegexNode.Testref | BeforeChild:
        case RegexNode.Loop | BeforeChild:
        case RegexNode.Lazyloop | BeforeChild:
        case RegexNode.Group | BeforeChild:
        case RegexNode.Group | AfterChild:
        case RegexNode.Capture | BeforeChild:
        case RegexNode.Capture | AfterChild:
        case RegexNode.Greedy | BeforeChild:
        case RegexNode.Greedy | AfterChild:
        case RegexNode.Require | AfterChild:
        case RegexNode.Prevent | AfterChild:
            break;

        // ---- Children that are not walked ----
        case RegexNode.Testgroup | BeforeChild:
            // The first child of a Testgroup is skipped.
            if (CurIndex == 0) {
                SkipChild();
            }
            break;

        case RegexNode.Require | BeforeChild:
        case RegexNode.Prevent | BeforeChild:
            // The child is skipped and an empty, nullable FC stands in for it.
            SkipChild();
            PushFC(new RegexFC(true));
            break;

        // ---- Combining child results ----
        case RegexNode.Concatenate | AfterChild:
            if (CurIndex != 0) {
                RegexFC later = PopFC();
                TopFC().AddFC(later, true);
            }

            // Once the accumulated FC is no longer nullable, later children are not visited.
            if (!TopFC()._nullable) {
                EarlyExit();
            }
            break;

        case RegexNode.Testgroup | AfterChild:
            // Children 0 and 1 are not merged; only later ones are folded in.
            if (CurIndex > 1) {
                RegexFC branch = PopFC();
                TopFC().AddFC(branch, false);
            }
            break;

        case RegexNode.Alternate | AfterChild:
        case RegexNode.Testref | AfterChild:
            if (CurIndex != 0) {
                RegexFC branch = PopFC();
                TopFC().AddFC(branch, false);
            }
            break;

        case RegexNode.Loop | AfterChild:
        case RegexNode.Lazyloop | AfterChild:
            // A loop with _m == 0 makes its body's FC nullable.
            if (node._m == 0) {
                TopFC()._nullable = true;
            }
            break;

        // ---- Nodes that push an empty, nullable FC ----
        case RegexNode.Empty:
        case RegexNode.Nothing:
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.Nonboundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
            PushFC(new RegexFC(true));
            break;

        // ---- Single characters ----
        case RegexNode.One:
        case RegexNode.Notone: {
            bool negated = (NodeType == RegexNode.Notone);
            PushFC(new RegexFC(node._ch, negated, false, ci));
            break;
        }

        case RegexNode.Oneloop:
        case RegexNode.Onelazy: {
            bool nullable = (node._m == 0);
            PushFC(new RegexFC(node._ch, false, nullable, ci));
            break;
        }

        case RegexNode.Notoneloop:
        case RegexNode.Notonelazy: {
            bool nullable = (node._m == 0);
            PushFC(new RegexFC(node._ch, true, nullable, ci));
            break;
        }

        // ---- Literal strings ----
        case RegexNode.Multi:
            if (node._str.Length == 0) {
                PushFC(new RegexFC(true));
            }
            else {
                // With RightToLeft the last character is used, otherwise the first.
                int firstIndex = rtl ? node._str.Length - 1 : 0;
                PushFC(new RegexFC(node._str[firstIndex], false, false, ci));
            }
            break;

        // ---- Character sets ----
        case RegexNode.Set: {
            // mark this node as nullable if we have some categories
            bool hasCategories = (node._str2 != null && node._str2.Length != 0);
            PushFC(new RegexFC(node._str, hasCategories, ci));
            break;
        }

        case RegexNode.Setloop:
        case RegexNode.Setlazy:
            // don't need to worry about categories since this is nullable
            PushFC(new RegexFC(node._str, true, ci));
            break;

        case RegexNode.Ref:
            PushFC(new RegexFC(RegexCharClass.Any, true, false));
            break;

        default:
            // XXX: original intent was SR.GetString(SR.UnexpectedOpcode, NodeType.ToString())
            throw new ArgumentException("Unexpected Opcode");
    }
}
|
|
}
|
|
|
|
// Holds the result of a first-character computation: a character class,
// a nullable flag, and whether any contributing node was case-insensitive.
internal sealed class RegexFC {
    internal RegexCharClass _cc;
    internal bool _nullable;
    internal bool _caseInsensitive;

    // Creates an FC with an empty character class.
    internal RegexFC(bool nullable) {
        this._cc = new RegexCharClass();
        this._nullable = nullable;
    }

    // Creates an FC for the single character 'ch', or for every character
    // except 'ch' when 'not' is true.
    internal RegexFC(char ch, bool not, bool nullable, bool caseInsensitive) {
        this._cc = new RegexCharClass();

        if (!not) {
            this._cc.AddRange(ch, ch);
        }
        else {
            // Everything below ch, then everything above it; either side may be empty.
            if (ch != '\0') {
                this._cc.AddRange('\0', (char)(ch - 1));
            }
            if (ch != '\uFFFF') {
                this._cc.AddRange((char)(ch + 1), '\uFFFF');
            }
        }

        this._caseInsensitive = caseInsensitive;
        this._nullable = nullable;
    }

    // Creates an FC from a set string.
    internal RegexFC(String set, bool nullable, bool caseInsensitive) {
        this._cc = new RegexCharClass();
        this._cc.AddSet(set);

        this._nullable = nullable;
        this._caseInsensitive = caseInsensitive;
    }

    // Merges 'fc' into this instance, either as the next element of a
    // concatenation or as an alternative.
    internal void AddFC(RegexFC fc, bool concatenate) {
        if (concatenate) {
            // A non-nullable prefix already fixes the first chars; nothing to add.
            if (!_nullable) {
                return;
            }

            // _nullable is true here, so the result is nullable exactly when fc is.
            _nullable = fc._nullable;
        }
        else {
            // An alternation is nullable if any branch is.
            _nullable = _nullable || fc._nullable;
        }

        if (fc._caseInsensitive) {
            _caseInsensitive = true;
        }

        _cc.AddCharClass(fc._cc);
    }

    // Returns the accumulated class as a set string, via RegexCharClass.ToSetCi.
    internal String GetFirstChars(CultureInfo culture) {
        String firstChars = _cc.ToSetCi(_caseInsensitive, culture);
        return firstChars;
    }

    internal bool IsCaseInsensitive() {
        return this._caseInsensitive;
    }
}
|
|
|
|
// Pairs a string (a literal prefix or a character set, depending on the
// producer) with a case-insensitivity flag.
internal sealed class RegexPrefix {
    internal String _prefix;
    internal bool _caseInsensitive;

    // Shared instance with an empty string and case-sensitive matching.
    internal static RegexPrefix _empty = new RegexPrefix(String.Empty, false);

    internal RegexPrefix(String prefix, bool ci) {
        this._prefix = prefix;
        this._caseInsensitive = ci;
    }

    // The shared empty prefix.
    internal static RegexPrefix Empty {
        get {
            return _empty;
        }
    }

    // The string supplied at construction.
    internal String Prefix {
        get {
            return this._prefix;
        }
    }

    // The case-insensitivity flag supplied at construction.
    internal bool CaseInsensitive {
        get {
            return this._caseInsensitive;
        }
    }
}
|
|
}
|