// singrdk/base/Kernel/System/Text/RegularExpressions/regexfcd.cs
//
// 654 lines
// 22 KiB
// C#

//------------------------------------------------------------------------------
// <copyright company='Microsoft Corporation'>
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// </copyright>
//------------------------------------------------------------------------------
// This RegexFCD class is internal to the Regex package.
// It computes first-character (FC) information, held in RegexFC objects,
// about the regex for optimization purposes.
//
// Implementation notes:
//
// This step is as simple as walking the tree and emitting
// sequences of codes.
//
#define ECMA
namespace System.Text.RegularExpressions {
using System.Collections;
using System.Globalization;
internal sealed class RegexFCD {
// Stack of child indices saved by RegexFCFromRegexTree while it descends
// into the tree, plus the number of entries currently in use.
internal int[] _intStack;
internal int _intDepth;
// Stack of RegexFC results being accumulated by CalculateFC, plus the
// number of entries currently in use.
internal RegexFC[] _fcStack;
internal int _fcDepth;
// Set by EarlyExit(); makes the walker stop visiting the remaining children
// of the current node. The walker clears it again afterwards.
internal bool _earlyexit;
// Set by SkipChild(); makes the walker step over the current child instead
// of descending into it. The walker clears it again afterwards.
internal bool _skipchild;
// Flags OR'ed into a node type when CalculateFC is invoked before or after
// visiting one of an interior node's children.
internal const int BeforeChild = 64;
internal const int AfterChild = 128;
// Anchor bits (see Anchors and AnchorFromType): where the regex can be pegged.
internal const int Beginning = 0x0001;
internal const int Bol = 0x0002;
internal const int Start = 0x0004;
internal const int Eol = 0x0008;
internal const int EndZ = 0x0010;
internal const int End = 0x0020;
internal const int Boundary = 0x0040;
internal const int ECMABoundary = 0x0080;
// Local alias for RegexCode.infinite; ScanChars compares a node's _n against it.
internal const int infinite = RegexCode.infinite;
// One of the only two functions that should be called from outside.
// It takes a RegexTree and computes the set of chars that can start it.
// Returns null when the computed RegexFC is flagged nullable; otherwise
// returns a RegexPrefix holding the first-char set and its case-insensitivity
// flag. The culture used to build the set is the invariant culture when the
// tree carries RegexOptions.CultureInvariant, and the current culture otherwise.
internal static RegexPrefix FirstChars(RegexTree t) {
    RegexFC firstChars = new RegexFCD().RegexFCFromRegexTree(t);
    if (firstChars._nullable) {
        return null;
    }

    CultureInfo culture;
    if ((t._options & RegexOptions.CultureInvariant) != 0) {
        culture = CultureInfo.InvariantCulture;
    }
    else {
        culture = CultureInfo.CurrentCulture;
    }

    String set = firstChars.GetFirstChars(culture);
    return new RegexPrefix(set, firstChars.IsCaseInsensitive());
}
// This is a related computation: it takes a RegexTree and computes the
// leading substring if it sees one. It's quite trivial and gives up easily.
//
// The scan descends through Greedy/Capture nodes, steps through the children
// of the most recent Concatenate node, and skips anchors, Empty, Require and
// Prevent nodes. It stops at the first Multi, Oneloop or Onelazy node; any
// other node type (or running out of children) yields RegexPrefix.Empty.
internal static RegexPrefix Prefix(RegexTree tree) {
    RegexNode node = tree._root;
    RegexNode sequence = null;   // Concatenate node whose children are being stepped through
    int position = 0;            // index of the next child of 'sequence' to look at
    bool literalFound = false;

    while (!literalFound) {
        switch (node._type) {
            case RegexNode.Concatenate:
                if (node.ChildCount() > 0) {
                    sequence = node;
                    position = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Look inside the group; siblings of the group are no longer considered.
                sequence = null;
                node = node.Child(0);
                continue;

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
            case RegexNode.Multi:
                // Candidate for a literal prefix: leave the loop and inspect it below.
                literalFound = true;
                continue;

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // Skipped: move on to the next sibling.
                break;

            default:
                return RegexPrefix.Empty;
        }

        if (sequence != null && position < sequence.ChildCount()) {
            node = sequence.Child(position);
            position++;
        }
        else {
            return RegexPrefix.Empty;
        }
    }

    bool ignoreCase = 0 != (node._options & RegexOptions.IgnoreCase);

    if (node._type == RegexNode.Multi) {
        return new RegexPrefix(node._str, ignoreCase);
    }

    // Oneloop / Onelazy: the prefix is the character repeated _m times,
    // provided _m is positive.
    if (node._m > 0) {
        StringBuilder run = new StringBuilder();
        run.Append(node._ch, node._m);
        return new RegexPrefix(run.ToString(), ignoreCase);
    }

    return RegexPrefix.Empty;
}
// This is a related computation: it takes a RegexTree and computes the
// leading []* construct if it sees one. It's quite trivial and gives up easily.
//
// The scan descends through Greedy/Capture nodes, steps through the children
// of the most recent Concatenate node, and skips anchors, Empty, Require and
// Prevent nodes. A one/notone/set loop (greedy or lazy) whose _n equals
// 'infinite' produces a RegexPrefix holding the corresponding set string;
// a set loop with a non-empty _str2 is rejected. Everything else yields null.
internal static RegexPrefix ScanChars(RegexTree tree) {
    RegexNode node = tree._root;
    RegexNode sequence = null;   // Concatenate node whose children are being stepped through
    int position = 0;            // index of the next child of 'sequence' to look at

    for (;;) {
        String set = null;

        switch (node._type) {
            case RegexNode.Concatenate:
                if (node.ChildCount() > 0) {
                    sequence = node;
                    position = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                sequence = null;
                node = node.Child(0);
                continue;

            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                if (node._n != infinite) {
                    return null;
                }
                set = RegexCharClass.SetFromChar(node._ch);
                break;

            case RegexNode.Notoneloop:
            case RegexNode.Notonelazy:
                if (node._n != infinite) {
                    return null;
                }
                set = RegexCharClass.SetInverseFromChar(node._ch);
                break;

            case RegexNode.Setloop:
            case RegexNode.Setlazy:
                bool hasSecondaryString = node._str2 != null && node._str2.Length != 0;
                if (node._n != infinite || hasSecondaryString) {
                    return null;
                }
                set = node._str;
                break;

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // Skipped: move on to the next sibling.
                break;

            default:
                return null;
        }

        if (set != null) {
            // The case flag is taken from the node that supplied the set.
            return new RegexPrefix(set, 0 != (node._options & RegexOptions.IgnoreCase));
        }

        if (sequence != null && position < sequence.ChildCount()) {
            node = sequence.Child(position);
            position++;
        }
        else {
            return null;
        }
    }
}
// Yet another related computation: it takes a RegexTree and reports the
// leading anchor that it encounters, as one of the anchor bits defined in
// this class (see AnchorFromType), or 0 when no anchor leads the pattern.
//
// The scan descends through Greedy/Capture nodes, steps through the children
// of the most recent Concatenate node, and skips Empty, Require and Prevent
// nodes. It returns at the first anchor node it reaches.
internal static int Anchors(RegexTree tree) {
    RegexNode node = tree._root;
    RegexNode sequence = null;   // Concatenate node whose children are being stepped through
    int position = 0;            // index of the next child of 'sequence' to look at

    for (;;) {
        switch (node._type) {
            case RegexNode.Concatenate:
                if (node.ChildCount() > 0) {
                    sequence = node;
                    position = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                sequence = null;
                node = node.Child(0);
                continue;

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
                return AnchorFromType(node._type);

            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                // Skipped: move on to the next sibling.
                break;

            default:
                return 0;
        }

        if (sequence != null && position < sequence.ChildCount()) {
            node = sequence.Child(position);
            position++;
        }
        else {
            return 0;
        }
    }
}
// Convert anchor type to anchor bit.
// Maps a RegexNode anchor type to the matching anchor constant of this
// class; any other type maps to 0.
internal static int AnchorFromType(int type) {
    int bit;

    switch (type) {
        case RegexNode.Bol:
            bit = Bol;
            break;
        case RegexNode.Eol:
            bit = Eol;
            break;
        case RegexNode.Boundary:
            bit = Boundary;
            break;
        case RegexNode.ECMABoundary:
            bit = ECMABoundary;
            break;
        case RegexNode.Beginning:
            bit = Beginning;
            break;
        case RegexNode.Start:
            bit = Start;
            break;
        case RegexNode.EndZ:
            bit = EndZ;
            break;
        case RegexNode.End:
            bit = End;
            break;
        default:
            bit = 0;
            break;
    }

    return bit;
}
#if DBG
// Debug helper: renders a set of anchor bits as a comma-separated list of
// names, or "None" when no known bit is set.
internal static String AnchorDescription(int anchors) {
    // Parallel tables, in the order the names are emitted.
    int[] bits = new int[] {
        Beginning, Start, Bol, Boundary, ECMABoundary, Eol, End, EndZ
    };
    String[] labels = new String[] {
        ", Beginning", ", Start", ", Bol", ", Boundary",
        ", ECMABoundary", ", Eol", ", End", ", EndZ"
    };

    StringBuilder text = new StringBuilder();
    for (int i = 0; i < bits.Length; i++) {
        if ((anchors & bits[i]) != 0) {
            text.Append(labels[i]);
        }
    }

    if (text.Length < 2) {
        return "None";
    }

    // Drop the leading ", " separator.
    return text.ToString(2, text.Length - 2);
}
#endif
// Private constructor: instances are only created by the static helpers of
// this class. Both work stacks start with room for 32 entries and grow on
// demand in PushInt/PushFC.
private RegexFCD() {
    _intStack = new int[32];
    _fcStack = new RegexFC[32];
}
// To avoid recursion, we use a simple integer stack.
// This is the push: the backing array is replaced by one twice the current
// depth when it is full.
internal void PushInt(int I) {
    int capacity = _intStack.Length;
    if (_intDepth >= capacity) {
        int[] grown = new int[_intDepth * 2];
        System.Array.Copy(_intStack, 0, grown, 0, _intDepth);
        _intStack = grown;
    }

    _intStack[_intDepth++] = I;
}
// True if the integer stack holds no entries.
internal bool EmptyInt() {
    bool isEmpty = (0 == _intDepth);
    return isEmpty;
}
// This is the pop: removes and returns the most recently pushed integer.
// No check is made for an empty stack.
internal int PopInt() {
    _intDepth--;
    return _intStack[_intDepth];
}
// We also use a stack of RegexFC objects.
// This is the push: the backing array is replaced by one twice the current
// depth when it is full.
internal void PushFC(RegexFC fc) {
    int capacity = _fcStack.Length;
    if (_fcDepth >= capacity) {
        RegexFC[] grown = new RegexFC[_fcDepth * 2];
        System.Array.Copy(_fcStack, 0, grown, 0, _fcDepth);
        _fcStack = grown;
    }

    _fcStack[_fcDepth++] = fc;
}
// True if the RegexFC stack holds no entries.
internal bool EmptyFC() {
    bool isEmpty = (0 == _fcDepth);
    return isEmpty;
}
// This is the pop: removes and returns the most recently pushed RegexFC.
// No check is made for an empty stack.
internal RegexFC PopFC() {
    _fcDepth--;
    return _fcStack[_fcDepth];
}
// This is the top: returns the most recently pushed RegexFC without
// removing it. No check is made for an empty stack.
internal RegexFC TopFC() {
    int top = _fcDepth - 1;
    return _fcStack[top];
}
// The main FC computation. It does a shortcutted depth-first walk
// through the tree and calls CalculateFC before and after each child
// of an interior node, and at each leaf. The walk is iterative: the
// index of the child being visited is saved on the integer stack rather
// than on the call stack.
//
// Returns the RegexFC popped from the FC stack when the walk ends, or, if
// that stack is empty, a RegexFC built from RegexCharClass.Any that is
// nullable and not case-insensitive.
internal RegexFC RegexFCFromRegexTree(RegexTree tree) {
RegexNode curNode;
int curChild;
curNode = tree._root;
curChild = 0;
for (;;) {
if (curNode._children == null) {
// Leaf node: compute its FC directly.
CalculateFC(curNode._type, curNode, 0);
}
else if (curChild < curNode._children.Count && !_earlyexit) {
// Interior node with children left to visit and no early exit requested.
CalculateFC(curNode._type | BeforeChild, curNode, curChild);
if (!_skipchild) {
// Descend into the child, remembering which child index we were at.
curNode = (RegexNode)curNode._children[curChild];
PushInt(curChild);
curChild = 0;
}
else {
// CalculateFC asked to skip this child: advance and clear the request.
curChild++;
_skipchild = false;
}
continue;
}
// Reached after a leaf, after the last child of an interior node, or
// when an early exit was requested; the request is cleared here.
_earlyexit = false;
// Nothing saved on the integer stack: the walk is complete.
if (EmptyInt())
break;
curChild = PopInt();
// NOTE(review): _next is used here to get back to the node whose child
// was just finished -- presumably the parent link; confirm in RegexNode.
curNode = curNode._next;
CalculateFC(curNode._type | AfterChild, curNode, curChild);
curChild++;
}
if (EmptyFC())
return new RegexFC(RegexCharClass.Any, true, false);
return PopFC();
}
// Called in AfterChild to prevent processing of the rest of the children
// at the current level. Raises the flag that RegexFCFromRegexTree checks
// before visiting the next child.
internal void EarlyExit() {
    this._earlyexit = true;
}
// Called in BeforeChild to prevent further processing of the current child.
// Raises the flag that makes RegexFCFromRegexTree step over the child
// instead of descending into it.
internal void SkipChild() {
    this._skipchild = true;
}
// FC computation and shortcut cases for each node type.
//
// NodeType is the node's type, OR'ed with BeforeChild or AfterChild when the
// call is made around a child of an interior node. node is the node being
// processed and CurIndex the index of the child concerned (the walker passes
// 0 for leaves). Results are accumulated on the RegexFC stack; SkipChild and
// EarlyExit are used to steer the walker.
//
// Throws ArgumentException for a NodeType that is not handled below; the
// message includes the offending NodeType value.
internal void CalculateFC(int NodeType, RegexNode node, int CurIndex) {
    bool ci = false;
    bool rtl = false;

    // The option flags are only examined for node types up to RegexNode.Ref.
    if (NodeType <= RegexNode.Ref) {
        if ((node._options & RegexOptions.IgnoreCase) != 0)
            ci = true;
        if ((node._options & RegexOptions.RightToLeft) != 0)
            rtl = true;
    }

    switch (NodeType) {
        case RegexNode.Concatenate | BeforeChild:
        case RegexNode.Alternate | BeforeChild:
        case RegexNode.Testref | BeforeChild:
        case RegexNode.Loop | BeforeChild:
        case RegexNode.Lazyloop | BeforeChild:
            break;

        case RegexNode.Testgroup | BeforeChild:
            // The first child of a Testgroup is not walked.
            if (CurIndex == 0)
                SkipChild();
            break;

        case RegexNode.Empty:
            PushFC(new RegexFC(true));
            break;

        case RegexNode.Concatenate | AfterChild:
            // Fold every child after the first into the running result.
            if (CurIndex != 0) {
                RegexFC child = PopFC();
                RegexFC cumul = TopFC();
                cumul.AddFC(child, true);
            }
            // Once the running result is not nullable, the remaining
            // children are not visited.
            if (!TopFC()._nullable)
                EarlyExit();
            break;

        case RegexNode.Testgroup | AfterChild:
            // Child 0 was skipped, so merging starts with the child at index 2.
            if (CurIndex > 1) {
                RegexFC child = PopFC();
                RegexFC cumul = TopFC();
                cumul.AddFC(child, false);
            }
            break;

        case RegexNode.Alternate | AfterChild:
        case RegexNode.Testref | AfterChild:
            if (CurIndex != 0) {
                RegexFC child = PopFC();
                RegexFC cumul = TopFC();
                cumul.AddFC(child, false);
            }
            break;

        case RegexNode.Loop | AfterChild:
        case RegexNode.Lazyloop | AfterChild:
            if (node._m == 0)
                TopFC()._nullable = true;
            break;

        case RegexNode.Group | BeforeChild:
        case RegexNode.Group | AfterChild:
        case RegexNode.Capture | BeforeChild:
        case RegexNode.Capture | AfterChild:
        case RegexNode.Greedy | BeforeChild:
        case RegexNode.Greedy | AfterChild:
            break;

        case RegexNode.Require | BeforeChild:
        case RegexNode.Prevent | BeforeChild:
            // The child is not walked; a nullable RegexFC stands in for it.
            SkipChild();
            PushFC(new RegexFC(true));
            break;

        case RegexNode.Require | AfterChild:
        case RegexNode.Prevent | AfterChild:
            break;

        case RegexNode.One:
        case RegexNode.Notone:
            PushFC(new RegexFC(node._ch, NodeType == RegexNode.Notone, false, ci));
            break;

        case RegexNode.Oneloop:
        case RegexNode.Onelazy:
            PushFC(new RegexFC(node._ch, false, node._m == 0, ci));
            break;

        case RegexNode.Notoneloop:
        case RegexNode.Notonelazy:
            PushFC(new RegexFC(node._ch, true, node._m == 0, ci));
            break;

        case RegexNode.Multi:
            // The relevant character is the first one, or the last one for
            // right-to-left nodes; an empty string is nullable.
            if (node._str.Length == 0)
                PushFC(new RegexFC(true));
            else if (!rtl)
                PushFC(new RegexFC(node._str[0], false, false, ci));
            else
                PushFC(new RegexFC(node._str[node._str.Length - 1], false, false, ci));
            break;

        case RegexNode.Set:
            // mark this node as nullable if we have some categories
            PushFC(new RegexFC(node._str, !(node._str2 == null || node._str2.Length == 0), ci));
            break;

        case RegexNode.Setloop:
        case RegexNode.Setlazy:
            // don't need to worry about categories since this is nullable
            PushFC(new RegexFC(node._str, true, ci));
            break;

        case RegexNode.Ref:
            PushFC(new RegexFC(RegexCharClass.Any, true, false));
            break;

        case RegexNode.Nothing:
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.Nonboundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
            PushFC(new RegexFC(true));
            break;

        default:
            // XXX: intended form is SR.GetString(SR.UnexpectedOpcode, NodeType.ToString());
            // until then, at least report which value was not recognized.
            throw new ArgumentException("Unexpected Opcode: " + NodeType.ToString());
    }
}
}
// Holds the result of a first-char computation for part of a regex: a
// character class, a flag telling whether the part is nullable, and a flag
// telling whether any contributing node was case-insensitive.
internal sealed class RegexFC {
    internal RegexCharClass _cc;
    internal bool _nullable;
    internal bool _caseInsensitive;

    // Creates a RegexFC with an empty character class.
    internal RegexFC(bool nullable) {
        _nullable = nullable;
        _cc = new RegexCharClass();
    }

    // Creates a RegexFC for a single character, or, when 'not' is true, for
    // every character except that one.
    internal RegexFC(char ch, bool not, bool nullable, bool caseInsensitive) {
        _cc = new RegexCharClass();

        if (!not) {
            _cc.AddRange(ch, ch);
        }
        else {
            // Everything below ch, then everything above it; either range is
            // left out when ch sits at that end of the char range.
            if (ch > 0) {
                _cc.AddRange('\0', (char)(ch - 1));
            }
            if (ch < 0xFFFF) {
                _cc.AddRange((char)(ch + 1), '\uFFFF');
            }
        }

        _nullable = nullable;
        _caseInsensitive = caseInsensitive;
    }

    // Creates a RegexFC from a set string.
    internal RegexFC(String set, bool nullable, bool caseInsensitive) {
        _cc = new RegexCharClass();
        _cc.AddSet(set);
        _caseInsensitive = caseInsensitive;
        _nullable = nullable;
    }

    // Merges another RegexFC into this one.
    // With concatenate == true, fc follows this one in sequence: nothing is
    // merged unless this one is nullable, and the result is nullable only if
    // fc is too. With concatenate == false, fc is an alternative: the result
    // is nullable if either side is.
    internal void AddFC(RegexFC fc, bool concatenate) {
        if (concatenate) {
            if (!_nullable) {
                return;
            }
            // _nullable is known to be true here, so it simply takes fc's value.
            _nullable = fc._nullable;
        }
        else {
            _nullable |= fc._nullable;
        }

        _caseInsensitive |= fc._caseInsensitive;
        _cc.AddCharClass(fc._cc);
    }

    // Returns the set string for the accumulated character class, as produced
    // by RegexCharClass.ToSetCi for the given culture.
    internal String GetFirstChars(CultureInfo culture) {
        String set = _cc.ToSetCi(_caseInsensitive, culture);
        return set;
    }

    // True if any contributing node was case-insensitive.
    internal bool IsCaseInsensitive() {
        return _caseInsensitive;
    }
}
// A string paired with a case-insensitivity flag; used by RegexFCD to
// report prefixes and first-char sets.
internal sealed class RegexPrefix {
    // Shared instance with an empty string and case-sensitive matching.
    internal static RegexPrefix _empty = new RegexPrefix(String.Empty, false);

    internal String _prefix;
    internal bool _caseInsensitive;

    internal RegexPrefix(String prefix, bool ci) {
        this._prefix = prefix;
        this._caseInsensitive = ci;
    }

    // The shared empty instance.
    internal static RegexPrefix Empty {
        get {
            return _empty;
        }
    }

    // The stored string.
    internal String Prefix {
        get {
            return this._prefix;
        }
    }

    // Whether the stored string is to be matched case-insensitively.
    internal bool CaseInsensitive {
        get {
            return this._caseInsensitive;
        }
    }
}
}