singrdk/base/Applications/Runtime/Full/System/Text/RegularExpressions/regexnode.cs

667 lines
23 KiB
C#
Raw Normal View History

2008-03-05 09:52:00 -05:00
//------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
//------------------------------------------------------------------------------
// This RegexNode class is internal to the Regex package.
// It is built into a parsed tree for a regular expression.
//
// Implementation notes:
2008-11-17 18:29:00 -05:00
//
2008-03-05 09:52:00 -05:00
// Since the node tree is a temporary data structure only used
// during compilation of the regexp to integer codes, it's
// designed for clarity and convenience rather than
// space efficiency.
//
// RegexNodes are built into a tree, linked by the _children list.
// Each node also has a _parent and _ichild member indicating
// its parent and which child # it is in its parent's list.
//
// RegexNodes come in as many types as there are constructs in
// a regular expression, for example, "concatenate", "alternate",
// "one", "rept", "group". There are also node types for basic
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
//
// Because perl 5 allows "lookback" groups that scan backwards,
// each node also gets a "direction". Normally the value of
// boolean _backward = false.
//
// During parsing, top-level nodes are also stacked onto a parse
// stack (a stack of trees). For this purpose we have a _next
// pointer. [Note that to save a few bytes, we could overload the
// _parent pointer instead.]
//
// On the parse stack, each tree has a "role" - basically, the
// nonterminal in the grammar that the parser has currently
// assigned to the tree. That code is stored in _role.
//
// Finally, some of the different kinds of nodes have data.
// Two integers (for the looping constructs) are stored in
// _operands, and an object (either a string or a set)
// is stored in _data.
//
#define ECMA
2008-11-17 18:29:00 -05:00
namespace System.Text.RegularExpressions
{
2008-03-05 09:52:00 -05:00
using System.Collections;
using System.Diagnostics;
internal sealed class RegexNode {
// RegexNode types
2008-11-17 18:29:00 -05:00
//
2008-03-05 09:52:00 -05:00
// Node type codes.
//
// The following are leaves, and correspond to primitive operations.
// Each leaf constant aliases the RegexCode constant of the same name.
// The trailing comment on each line lists the data the node carries
// (c = char, n = count, set, string, index) followed by a sample pattern.
// static final int Onerep = RegexCode.Onerep; // c,n a {n}
// static final int Notonerep = RegexCode.Notonerep; // c,n .{n}
// static final int Setrep = RegexCode.Setrep; // set,n \d {n}
internal const int Oneloop = RegexCode.Oneloop; // c,n a*
internal const int Notoneloop = RegexCode.Notoneloop; // c,n .*
internal const int Setloop = RegexCode.Setloop; // set,n \d*
internal const int Onelazy = RegexCode.Onelazy; // c,n a*?
internal const int Notonelazy = RegexCode.Notonelazy; // c,n .*?
internal const int Setlazy = RegexCode.Setlazy; // set,n \d*?
internal const int One = RegexCode.One; // char a
internal const int Notone = RegexCode.Notone; // char . [^a]
internal const int Set = RegexCode.Set; // set [a-z] \w \s \d
internal const int Multi = RegexCode.Multi; // string abcdef
internal const int Ref = RegexCode.Ref; // index \1
internal const int Bol = RegexCode.Bol; // ^
internal const int Eol = RegexCode.Eol; // $
internal const int Boundary = RegexCode.Boundary; // \b
internal const int Nonboundary= RegexCode.Nonboundary;// \B
internal const int ECMABoundary = RegexCode.ECMABoundary; // \b
internal const int NonECMABoundary= RegexCode.NonECMABoundary; // \B
internal const int Beginning = RegexCode.Beginning; // \A
internal const int Start = RegexCode.Start; // \G
internal const int EndZ = RegexCode.EndZ; // \Z
internal const int End = RegexCode.End; // \z
// (note: End = 21;)
// Interior nodes do not correspond to primitive operations, but to
// control structures compositing other operations. Their codes are
// declared here, continuing from 22.
// Concat and alternate take n children, and can run forward or backwards.
internal const int Nothing = 22; // []
internal const int Empty = 23; // ()
internal const int Alternate = 24; // a|b
internal const int Concatenate= 25; // ab
internal const int Loop = 26; // m,x * + ? {,}
internal const int Lazyloop = 27; // m,x *? +? ?? {,}?
internal const int Capture = 28; // n ()
internal const int Group = 29; // (?:)
internal const int Require = 30; // (?=) (?<=)
internal const int Prevent = 31; // (?!) (?<!)
internal const int Greedy = 32; // (?>) (?<)
internal const int Testref = 33; // (?(n) | )
internal const int Testgroup = 34; // (?(...) | )
// Repeat count that stands for "no upper bound" (the largest positive int).
// ReduceRep saturates multiplied counts to this value.
internal const int infinite = 0x7FFFFFFF;
// RegexNode data members
2008-11-17 18:29:00 -05:00
//
2008-03-05 09:52:00 -05:00
// Node type: one of the type constants declared in this class.
internal int _type;
// Child nodes (RegexNode instances). Stays null until AddChild is first called.
internal ArrayList _children;
// String payload: the literal text of a Multi node, or the set string of a
// Set/Setloop/Setlazy node.
internal String _str;
// Second string payload of a set node; combined with
// RegexCharClass.CategoryUnion when two sets are merged.
internal String _str2;
// Character payload of One/Notone nodes and their loop/lazy forms.
internal char _ch;
// Two integer operands. For looping nodes they are the minimum (_m) and
// maximum (_n) repeat counts.
internal int _m;
internal int _n;
// Options attached to this node when it was constructed.
internal RegexOptions _options;
// Link to another node. AddChild points it at the node that received this
// node as a child; the file header also describes it as the parse-stack link.
internal RegexNode _next;
// Creates a node of the given type that carries no payload.
internal RegexNode(int type, RegexOptions options) {
    _type = type;
    _options = options;
}

// Creates a node that carries a single character.
internal RegexNode(int type, RegexOptions options, char ch)
    : this(type, options) {
    _ch = ch;
}

// Creates a node that carries a string.
internal RegexNode(int type, RegexOptions options, String str)
    : this(type, options) {
    _str = str;
}

// Creates a node that carries two strings.
internal RegexNode(int type, RegexOptions options, String str, String str2)
    : this(type, options, str) {
    _str2 = str2;
}

// Creates a node that carries one integer operand.
internal RegexNode(int type, RegexOptions options, int m)
    : this(type, options) {
    _m = m;
}

// Creates a node that carries two integer operands.
internal RegexNode(int type, RegexOptions options, int m, int n)
    : this(type, options, m) {
    _n = n;
}

// Creates a node with every member left at its default value.
internal RegexNode() {
}
// UseOptionR:
//
// True when this node's options include RegexOptions.RightToLeft.
internal bool UseOptionR() {
    RegexOptions rightToLeftBit = _options & RegexOptions.RightToLeft;
    return rightToLeftBit != 0;
}
// ReverseLeft:
//
// For a right-to-left Concatenate node that has children, reverses the
// order of the child list in place. Always returns this node.
internal RegexNode ReverseLeft() {
    if (!UseOptionR())
        return this;

    if (_type != Concatenate || _children == null)
        return this;

    _children.Reverse(0, _children.Count);
    return this;
}
// MakeRep:
//
// Turns this node into its repeating form and records the repeat bounds.
// Pass type as Onelazy or Oneloop; the node's type is shifted by the
// distance between that value and One.
internal void MakeRep(int type, int min, int max) {
    int shift = type - One;

    _m = min;
    _n = max;
    _type += shift;
}
// Reduce:
//
// Dispatches on the node type to the matching reduction routine and
// returns the reduced subtree. Types without a reduction are returned
// unchanged.
internal RegexNode Reduce() {
    switch (Type()) {
        case Alternate:
            return ReduceAlternation();

        case Concatenate:
            return ReduceConcatenation();

        case Loop:
        case Lazyloop:
            return ReduceRep();

        case Group:
            return ReduceGroup();

        case Set:
        case Setloop:
            return ReduceSet();
    }

    return this;
}
// StripEnation:
//
// Collapses a concatenation or alternation that does not need its own
// node: with no children it becomes a fresh node of type emptyType
// (carrying this node's options), with exactly one child it becomes
// that child, otherwise it is kept as is.
internal RegexNode StripEnation(int emptyType) {
    int count = ChildCount();

    if (count == 0)
        return new RegexNode(emptyType, _options);

    if (count == 1)
        return Child(0);

    return this;
}
// ReduceGroup:
//
// Noncapturing groups serve no function once the tree is built, so
// descend through first children until a node that is not a Group is
// reached and return that node.
internal RegexNode ReduceGroup() {
    RegexNode node = this;

    while (node.Type() == Group)
        node = node.Child(0);

    return node;
}
// ReduceRep:
//
// Nested repeaters just get multiplied with each other if they're not
// too lumpy.
//
// Starting from this Loop/Lazyloop node, walks down the chain of first
// children for as long as each child is a repeater of a compatible kind,
// folding the accumulated min/max into the child's own counts. Returns
// the innermost node reached, or a Nothing node if the combined minimum
// saturates to 'infinite'.
internal RegexNode ReduceRep() {
    RegexNode u;
    RegexNode child;
    int type;
    int min;
    int max;

    u = this;
    type = Type();
    min = _m;
    max = _n;

    for (;;) {
        if (u.ChildCount() == 0)
            break;

        child = u.Child(0);

        // multiply reps of the same type only: a greedy loop may absorb a
        // greedy single-char loop, a lazy loop a lazy single-char loop
        if (child.Type() != type) {
            int childType = child.Type();

            if (!(childType >= Oneloop && childType <= Setloop && type == Loop ||
                  childType >= Onelazy && childType <= Setlazy && type == Lazyloop))
                break;
        }

        // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
        // [but things like (a {2,})+ are not too lumpy...]
        // The doubling is done in 64-bit arithmetic: in 32 bits, child._m * 2
        // wraps negative once child._m exceeds 0x3FFFFFFF, which would make
        // the comparison false and wrongly let the loops be merged.
        if (u._m == 0 && child._m > 1 || child._n < (long)child._m * 2)
            break;

        u = child;

        // saturating multiply: clamp to 'infinite' instead of overflowing
        if (u._m > 0)
            u._m = min = ((infinite - 1) / u._m < min) ? infinite : u._m * min;
        if (u._n > 0)
            u._n = max = ((infinite - 1) / u._n < max) ? infinite : u._n * max;
    }

    // a minimum of 'infinite' can never be satisfied
    return min == infinite ? new RegexNode(Nothing, _options) : u;
}
// ReduceSet:
//
// Simple optimization. When _str2 is the empty string, a set that is
// empty, a singleton, or an inverse singleton is rewritten in place:
//   - the empty set becomes Nothing;
//   - a two-char set string whose second char is one past the first
//     becomes the One form carrying that first char;
//   - a three-char set string starting with '\0' whose third char is one
//     past the second becomes the Notone form carrying the second char.
// The type is shifted rather than assigned, so a Setloop moves to the
// corresponding loop form. Always returns this node.
internal RegexNode ReduceSet() {
    // Only sets with an empty _str2 are candidates
    if (_str2 != String.Empty)
        return this;

    if (_str == RegexCharClass.Empty) {
        _str = null;
        _type = Nothing;
    }
    else if (_str.Length == 2 && _str[0] + 1 == _str[1]) {
        _ch = _str[0];
        _type += (One - Set);
        _str = null;
    }
    else if (_str.Length == 3 && _str[0] == '\0' && (char)(_str[1] + 1) == _str[2]) {
        _ch = _str[1];
        _type += (Notone - Set);
        _str = null;
    }

    return this;
}
// ReduceAlternation:
//
// Basic optimization. Single-letter alternations can be replaced
// by faster set specifications, and nested alternations with no
// intervening operators can be flattened:
//
// a|b|c|def|g|h -> [a-c]|def|[gh]
// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
//
// The child list is compacted in place: 'read' scans the children and
// 'write' is the slot the next surviving child is stored in.
internal RegexNode ReduceAlternation() {
    if (_children == null)
        return new RegexNode(RegexNode.Nothing, _options);

    bool prevWasSet = false;
    RegexOptions prevOptions = 0;
    int read = 0;
    int write = 0;

    while (read < _children.Count) {
        RegexNode current = (RegexNode)_children[read];

        if (write < read)
            _children[write] = current;

        if (current._type == Alternate) {
            // Splice the nested alternation's children in right after it and
            // drop the nested node itself; they are visited on later passes.
            foreach (RegexNode grandchild in current._children)
                grandchild._next = this;

            _children.InsertRange(read + 1, current._children);
            write--;
        }
        else if (current._type == Set || current._type == One) {
            // Cannot merge sets if L or I options differ
            RegexOptions currentOptions =
                current._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);

            if (prevWasSet && prevOptions == currentOptions) {
                // The last node was a Set or a One, we're a Set or One and our
                // options are the same. Merge this node into the previous one.
                write--;
                RegexNode merged = (RegexNode)_children[write];

                if (merged._type == RegexNode.One) {
                    merged._type = RegexNode.Set;
                    merged._str = RegexCharClass.SetFromChar(merged._ch);
                }

                if (current._type == RegexNode.One) {
                    // no categories to worry about
                    merged._str = RegexCharClass.SetUnion(merged._str, RegexCharClass.SetFromChar(current._ch));
                }
                else {
                    merged._str = RegexCharClass.SetUnion(merged._str, current._str);
                    merged._str2 = RegexCharClass.CategoryUnion(merged._str2, current._str2);
                }
            }
            else {
                // Start of a new mergeable run
                prevWasSet = true;
                prevOptions = currentOptions;
            }
        }
        else if (current._type == RegexNode.Nothing) {
            // A Nothing branch is dropped
            write--;
        }
        else {
            prevWasSet = false;
        }

        read++;
        write++;
    }

    // Trim the slots left over after compaction
    if (write < read)
        _children.RemoveRange(write, read - write);

    return StripEnation(RegexNode.Nothing);
}
// ReduceConcatenation:
//
// Basic optimization. Adjacent strings can be concatenated.
//
// (?:abc)(?:def) -> abcdef
//
// Also flattens nested concatenations that run in the same direction
// and drops Empty children. The child list is compacted in place:
// 'read' scans the children and 'write' is the slot the next surviving
// child is stored in.
internal RegexNode ReduceConcatenation() {
    if (_children == null)
        return new RegexNode(RegexNode.Empty, _options);

    bool prevWasString = false;
    RegexOptions prevOptions = 0;
    int read = 0;
    int write = 0;

    while (read < _children.Count) {
        RegexNode current = (RegexNode)_children[read];

        if (write < read)
            _children[write] = current;

        bool sameDirection =
            (current._options & RegexOptions.RightToLeft) == (_options & RegexOptions.RightToLeft);

        if (current._type == RegexNode.Concatenate && sameDirection) {
            // Splice the nested concatenation's children in right after it and
            // drop the nested node itself; they are visited on later passes.
            foreach (RegexNode grandchild in current._children)
                grandchild._next = this;

            _children.InsertRange(read + 1, current._children);
            write--;
        }
        else if (current._type == RegexNode.Multi || current._type == RegexNode.One) {
            // Cannot merge strings if L or I options differ
            RegexOptions currentOptions =
                current._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);

            if (prevWasString && prevOptions == currentOptions) {
                write--;
                RegexNode merged = (RegexNode)_children[write];

                if (merged._type == RegexNode.One) {
                    merged._type = RegexNode.Multi;
                    merged._str = merged._ch.ToString();
                }

                String piece = (current._type == RegexNode.One)
                    ? current._ch.ToString()
                    : current._str;

                // Left-to-right text is appended; right-to-left text is prepended
                if ((currentOptions & RegexOptions.RightToLeft) == 0)
                    merged._str = merged._str + piece;
                else
                    merged._str = piece + merged._str;
            }
            else {
                // Start of a new mergeable run
                prevWasString = true;
                prevOptions = currentOptions;
            }
        }
        else if (current._type == RegexNode.Empty) {
            // An Empty child is dropped
            write--;
        }
        else {
            prevWasString = false;
        }

        read++;
        write++;
    }

    // Trim the slots left over after compaction
    if (write < read)
        _children.RemoveRange(write, read - write);

    return StripEnation(RegexNode.Empty);
}
// MakeQuantifier:
//
// Applies a {min,max} quantifier to this node and returns the result.
//   {0,0} yields a new Empty node and {1,1} yields this node unchanged.
//   One, Notone and Set nodes are converted in place to their loop
//   (or lazy) form. Any other node is wrapped as the single child of
//   a new Loop (or Lazyloop) node.
internal RegexNode MakeQuantifier(bool lazy, int min, int max) {
    if (min == 0 && max == 0)
        return new RegexNode(RegexNode.Empty, _options);

    if (min == 1 && max == 1)
        return this;

    bool isSingleChar = _type == RegexNode.One ||
                        _type == RegexNode.Notone ||
                        _type == RegexNode.Set;

    if (isSingleChar) {
        int repType = lazy ? RegexNode.Onelazy : RegexNode.Oneloop;
        MakeRep(repType, min, max);
        return this;
    }

    int loopType = lazy ? RegexNode.Lazyloop : RegexNode.Loop;
    RegexNode loop = new RegexNode(loopType, _options, min, max);
    loop.AddChild(this);
    return loop;
}
2008-11-17 18:29:00 -05:00
2008-03-05 09:52:00 -05:00
// AddChild:
//
// Reduces newChild, appends the reduced node to this node's child list
// (creating the list on first use) and points the reduced node's _next
// back at this node.
internal void AddChild(RegexNode newChild) {
    if (_children == null) {
        _children = new ArrayList(4);
    }

    RegexNode reduced = newChild.Reduce();
    _children.Add(reduced);
    reduced._next = this;
}
// Child:
//
// Returns the i-th child. The child list must already exist.
internal RegexNode Child(int i) {
    object entry = _children[i];
    return (RegexNode)entry;
}

// ChildCount:
//
// Number of children; zero when no child list has been created yet.
internal int ChildCount() {
    if (_children == null)
        return 0;

    return _children.Count;
}

// Type:
//
// Returns this node's type code.
internal int Type() {
    return _type;
}
#if DBG
// TypeStr:
//
// Table of node-type names, kept as declared for any existing debug code
// that refers to it. It cannot be indexed by a node type: the two ECMA
// boundary names sit at indices 18 and 19, so every later entry is shifted
// by two relative to the constants declared in this class (Nothing is 22,
// but TypeStr[22] is "EndZ"). Use TypeName to name a node type.
internal static String[] TypeStr = new String[] {
    "Onerep", "Notonerep", "Setrep",
    "Oneloop", "Notoneloop", "Setloop",
    "Onelazy", "Notonelazy", "Setlazy",
    "One", "Notone", "Set",
    "Multi", "Ref",
    "Bol", "Eol", "Boundary", "Nonboundary",
    "ECMABoundary", "NonECMABoundary",
    "Beginning", "Start", "EndZ", "End",
    "Nothing", "Empty",
    "Alternate", "Concatenate",
    "Loop", "Lazyloop",
    "Capture", "Group", "Require", "Prevent", "Greedy",
    "Testref", "Testgroup"};

// TypeName:
//
// Maps a node type to its name by comparing against the type constants
// themselves, so the result does not depend on their numeric values.
// Unrecognized values are reported as "Unknown(<value>)".
private static String TypeName(int type) {
    // The ECMA boundary codes are compared separately from the switch.
    if (type == ECMABoundary)
        return "ECMABoundary";
    if (type == NonECMABoundary)
        return "NonECMABoundary";

    switch (type) {
        case Oneloop:     return "Oneloop";
        case Notoneloop:  return "Notoneloop";
        case Setloop:     return "Setloop";
        case Onelazy:     return "Onelazy";
        case Notonelazy:  return "Notonelazy";
        case Setlazy:     return "Setlazy";
        case One:         return "One";
        case Notone:      return "Notone";
        case Set:         return "Set";
        case Multi:       return "Multi";
        case Ref:         return "Ref";
        case Bol:         return "Bol";
        case Eol:         return "Eol";
        case Boundary:    return "Boundary";
        case Nonboundary: return "Nonboundary";
        case Beginning:   return "Beginning";
        case Start:       return "Start";
        case EndZ:        return "EndZ";
        case End:         return "End";
        case Nothing:     return "Nothing";
        case Empty:       return "Empty";
        case Alternate:   return "Alternate";
        case Concatenate: return "Concatenate";
        case Loop:        return "Loop";
        case Lazyloop:    return "Lazyloop";
        case Capture:     return "Capture";
        case Group:       return "Group";
        case Require:     return "Require";
        case Prevent:     return "Prevent";
        case Greedy:      return "Greedy";
        case Testref:     return "Testref";
        case Testgroup:   return "Testgroup";
    }

    return "Unknown(" + type.ToString() + ")";
}

// Description:
//
// Builds a one-line debug description of this node: the type name, one
// "-X" suffix per option flag that is set, the node's payload (char,
// indices, string or set) and, for repeating nodes, the min/max counts.
internal String Description() {
    StringBuilder ArgSb = new StringBuilder();

    ArgSb.Append(TypeName(_type));

    if ((_options & RegexOptions.ExplicitCapture) != 0)
        ArgSb.Append("-C");
    if ((_options & RegexOptions.IgnoreCase) != 0)
        ArgSb.Append("-I");
    if ((_options & RegexOptions.RightToLeft) != 0)
        ArgSb.Append("-L");
    if ((_options & RegexOptions.Multiline) != 0)
        ArgSb.Append("-M");
    if ((_options & RegexOptions.Singleline) != 0)
        ArgSb.Append("-S");
    if ((_options & RegexOptions.IgnorePatternWhitespace) != 0)
        ArgSb.Append("-X");
    if ((_options & RegexOptions.ECMAScript) != 0)
        ArgSb.Append("-E");

    // Payload
    switch (_type) {
        case Oneloop:
        case Notoneloop:
        case Onelazy:
        case Notonelazy:
        case One:
        case Notone:
            ArgSb.Append("(Ch = " + RegexCharClass.CharDescription(_ch) + ")");
            break;
        case Capture:
            ArgSb.Append("(index = " + _m.ToString() + ", unindex = " + _n.ToString() + ")");
            break;
        case Ref:
        case Testref:
            ArgSb.Append("(index = " + _m.ToString() + ")");
            break;
        case Multi:
            ArgSb.Append("(String = " + _str + ")");
            break;
        case Set:
        case Setloop:
        case Setlazy:
            ArgSb.Append("(Set = " + RegexCharClass.SetDescription(_str) + ")");
            break;
    }

    // Repeat bounds
    switch (_type) {
        case Oneloop:
        case Notoneloop:
        case Onelazy:
        case Notonelazy:
        case Setloop:
        case Setlazy:
        case Loop:
        case Lazyloop:
            ArgSb.Append("(Min = " + _m.ToString() + ", Max = " + (_n == infinite ? "inf" : Convert.ToString(_n)) + ")");
            break;
    }

    return ArgSb.ToString();
}
// Indentation pad for Dump: 32 spaces, one per level of nesting that
// Dump is willing to indent.
internal const String Space = "                                ";

// Dump:
//
// Writes this node's description and then, depth first, the description
// of every node below it to Debug, indenting each line by one space per
// level of nesting (capped at the length of Space).
//
// The walk is iterative: Stack holds, for each ancestor on the current
// path, the index of the next child to visit, and _next is followed to
// climb back to the parent.
internal void Dump() {
    ArrayList Stack = new ArrayList();
    RegexNode CurNode;
    int CurChild;

    CurNode = this;
    CurChild = 0;

    Debug.WriteLine(CurNode.Description());

    for (;;) {
        if (CurNode._children != null && CurChild < CurNode._children.Count) {
            // Descend: remember where to resume in the current node
            Stack.Add(CurChild + 1);
            CurNode = (RegexNode)CurNode._children[CurChild];
            CurChild = 0;

            // Never ask for more padding than Space holds; Substring
            // throws when the requested length exceeds the string.
            int Depth = Stack.Count;
            if (Depth > Space.Length)
                Depth = Space.Length;

            Debug.WriteLine(Space.Substring(0, Depth) + CurNode.Description());
        }
        else {
            // Children exhausted: climb back up, or stop at the root
            if (Stack.Count == 0)
                break;

            CurChild = (Int32)Stack[Stack.Count - 1];
            Stack.RemoveAt(Stack.Count - 1);
            CurNode = CurNode._next;
        }
    }
}
#endif
}
}