singrdk/base/Kernel/System/Text/RegularExpressions/regexwriter.cs

540 lines
20 KiB
C#
Raw Normal View History

2008-03-05 09:52:00 -05:00
//------------------------------------------------------------------------------
// <copyright company='Microsoft Corporation'>
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// </copyright>
//------------------------------------------------------------------------------
// This RegexWriter class is internal to the Regex package.
// It builds a block of regular expression codes (RegexCode)
// from a RegexTree parse tree.
//
// Implementation notes:
//
// This step is as simple as walking the tree and emitting
// sequences of codes.
//
#define ECMA
namespace System.Text.RegularExpressions {
using System.Collections;
//using System.Collections.Specialized;
using System.Globalization;
internal sealed class RegexWriter {
// Explicit stack of ints (see PushInt / PopInt). It holds child indices
// during the tree walk and positions of jumps that still need patching.
internal int[] _intStack;
// Number of entries currently on _intStack.
internal int _depth;
// Buffer that receives the emitted opcodes and operands.
internal int[] _emitted;
// Index in _emitted at which the next value will be written.
internal int _curpos;
// Maps a string to its index in _stringtable; used by StringCode to
// eliminate duplicates.
internal IDictionary _stringhash;
// The distinct strings referenced by the emitted code.
internal ArrayList _stringtable;
// not used! internal int _stringcount;
// True while the Emit methods only measure the code instead of writing it.
internal bool _counting;
// Number of ints the code occupies; accumulated while _counting is true.
internal int _count;
// Number of emitted operations for which RegexCode.OpcodeBacktracks
// returned true; accumulated while _counting is true.
internal int _trackcount;
// Maps capture numbers to dense slot indices; null when capture numbers
// are used unmapped (see MapCapnum).
internal Hashtable _caps;
// Flag OR'ed into a node type when EmitFragment is called before a child
// of that node is visited.
internal const int BeforeChild = 64;
// Flag OR'ed into a node type when EmitFragment is called after a child
// of that node has been visited.
internal const int AfterChild = 128;
// Local alias for RegexCode.infinite, compared against a node's upper
// repeat bound (_n).
internal const int infinite = RegexCode.infinite;
// Sole entry point intended for callers outside this class: builds the
// RegexCode that corresponds to the given RegexTree, using a fresh
// single-use RegexWriter.
internal static RegexCode Write(RegexTree t) {
    RegexCode code = new RegexWriter().RegexCodeFromRegexTree(t);

#if DBG
    // Debug builds can dump the generated code when the tree asks for it.
    if (t.Debug)
        code.Dump();
#endif

    return code;
}
// Private so that instances can only be created from inside the class
// (see Write).
private RegexWriter() {
    const int InitialCapacity = 32;

    // String table, plus the lookup used to de-duplicate its entries.
    _stringtable = new ArrayList();
    _stringhash = new Hashtable(); //HybridDictionary();

    // Starting sizes only: PushInt grows the stack on demand, and
    // RegexCodeFromRegexTree replaces _emitted with an exactly sized array
    // before the emitting pass.
    _emitted = new int[InitialCapacity];
    _intStack = new int[InitialCapacity];
}
// Push operation of the simple integer stack that is used in place of
// recursion. The backing array is replaced by one of twice the current
// depth whenever it is full.
internal void PushInt(int I) {
    int capacity = _intStack.Length;

    if (_depth >= capacity) {
        int[] grown = new int[2 * _depth];
        System.Array.Copy(_intStack, 0, grown, 0, _depth);
        _intStack = grown;
    }

    _intStack[_depth++] = I;
}
// Reports whether the integer stack currently holds no entries.
internal bool EmptyStack() {
    bool nothingPushed = (_depth == 0);
    return nothingPushed;
}
// Pop operation of the integer stack: removes the most recently pushed
// value and returns it. No underflow check is performed.
internal int PopInt() {
    _depth -= 1;
    return _intStack[_depth];
}
// Position in the emitted code at which the next value will be written.
internal int CurPos() {
    int position = _curpos;
    return position;
}
// Back-patches the jump instruction that starts at Offset so that it
// jumps to jumpDest. The destination is the jump's first operand, stored
// in the slot right after the opcode.
internal void PatchJump(int Offset, int jumpDest) {
    int operandSlot = Offset + 1;
    _emitted[operandSlot] = jumpDest;
}
// Emits an operation that takes no operands. Like every Emit overload it
// has two modes: while _counting is set it only accounts for the size of
// the code (and for opcodes that backtrack); otherwise it writes the code.
internal void Emit(int op) {
    if (!_counting) {
        _emitted[_curpos++] = op;
        return;
    }

    // Counting mode: one slot for the opcode.
    _count++;
    if (RegexCode.OpcodeBacktracks(op))
        _trackcount++;
}
// Emits an operation followed by a single operand, or, while _counting
// is set, only accounts for the two slots it would occupy.
internal void Emit(int op, int opd1) {
    if (!_counting) {
        _emitted[_curpos++] = op;
        _emitted[_curpos++] = opd1;
        return;
    }

    // Counting mode: opcode plus one operand.
    _count += 2;
    if (RegexCode.OpcodeBacktracks(op))
        _trackcount++;
}
// Emits an operation followed by two operands, or, while _counting is
// set, only accounts for the three slots it would occupy.
internal void Emit(int op, int opd1, int opd2) {
    if (!_counting) {
        _emitted[_curpos++] = op;
        _emitted[_curpos++] = opd1;
        _emitted[_curpos++] = opd2;
        return;
    }

    // Counting mode: opcode plus two operands.
    _count += 3;
    if (RegexCode.OpcodeBacktracks(op))
        _trackcount++;
}
// Emits an operation followed by three operands, or, while _counting is
// set, only accounts for the four slots it would occupy.
internal void Emit(int op, int opd1, int opd2, int opd3) {
    if (!_counting) {
        _emitted[_curpos++] = op;
        _emitted[_curpos++] = opd1;
        _emitted[_curpos++] = opd2;
        _emitted[_curpos++] = opd3;
        return;
    }

    // Counting mode: opcode plus three operands.
    _count += 4;
    if (RegexCode.OpcodeBacktracks(op))
        _trackcount++;
}
// Looks up the string-table index for a string, adding the string to the
// table the first time it is seen; a hashtable keeps the entries unique.
// A null string is treated as the empty string. During the counting pass
// nothing is recorded and 0 is returned.
internal int StringCode(String str) {
    if (_counting)
        return 0;

    String key = (str == null) ? String.Empty : str;

    if (!_stringhash.Contains(key)) {
        // First occurrence: the new entry goes at the end of the table.
        Int32 fresh = _stringtable.Count;
        _stringhash[key] = fresh;
        _stringtable.Add(key);
        return fresh;
    }

    return (Int32)_stringhash[key];
}
// Builds (does not throw) the ArgumentException carrying the given
// message; the caller is expected to throw it. Should be dead code.
internal ArgumentException MakeException(String message) {
    ArgumentException error = new ArgumentException(message);
    return error;
}
// A regex may use a sparse set of capture numbers. Those numbers are
// translated to a dense set of indices for the array of capture slots
// here, at compile time, so that no lookup is needed at match time.
// -1 is passed through unchanged, and so is every number when no mapping
// table is in use.
internal int MapCapnum(int capnum) {
    if (capnum == -1 || _caps == null)
        return capnum;

    return (Int32)_caps[capnum];
}
// The top level RegexCode generator. It walks the tree depth-first,
// without recursion, and calls EmitFragment to emit code before and
// after each child of an interior node, and once at each leaf.
//
// The walk runs twice: the first pass only counts the size of the
// generated code, the second pass writes the code into an array of
// exactly that size.
//
// The finished code is returned together with the prefix, first-char and
// anchor information obtained from RegexFCD.
internal RegexCode RegexCodeFromRegexTree(RegexTree tree) {
    // Construct the sparse capnum mapping if some numbers are unused.
    int capsize;
    bool capnumsAreDense = (tree._capnumlist == null || tree._captop == tree._capnumlist.Length);

    if (capnumsAreDense) {
        capsize = tree._captop;
        _caps = null;
    }
    else {
        capsize = tree._capnumlist.Length;
        _caps = tree._caps;
        for (int slot = 0; slot < tree._capnumlist.Length; slot++)
            _caps[tree._capnumlist[slot]] = slot;
    }

    // Pass 0 counts, pass 1 emits.
    for (int pass = 0; pass < 2; pass++) {
        _counting = (pass == 0);

        if (!_counting)
            _emitted = new int[_count];

        RegexNode node = tree._root;
        int childIndex = 0;

        // Leading Lazybranch; its target is patched to the final Stop below.
        Emit(RegexCode.Lazybranch, 0);

        while (true) {
            bool isLeaf = (node._children == null);

            if (!isLeaf && childIndex < node._children.Count) {
                // Descend into the next unvisited child, remembering
                // which child it is for the way back up.
                EmitFragment(node._type | BeforeChild, node, childIndex);
                node = (RegexNode)node._children[childIndex];
                PushInt(childIndex);
                childIndex = 0;
                continue;
            }

            if (isLeaf)
                EmitFragment(node._type, node, 0);

            // Nothing left on the stack: the walk is complete.
            if (EmptyStack())
                break;

            // Return to the parent and finish the child just visited.
            childIndex = PopInt();
            node = node._next;
            EmitFragment(node._type | AfterChild, node, childIndex);
            childIndex++;
        }

        PatchJump(0, CurPos());
        Emit(RegexCode.Stop);
    }

    // If the set of possible first chars is very large,
    // don't bother scanning for it (common case: . == [^\n]).
    // NOTE(review): as written, the first-char prefix is dropped whenever
    // SetSize returns anything above 0 -- confirm that threshold is intended.
    RegexPrefix fcPrefix = RegexFCD.FirstChars(tree);
    if (fcPrefix != null && RegexCharClass.SetSize(fcPrefix.Prefix) > 0)
        fcPrefix = null;

    RegexPrefix scPrefix = null; //RegexFCD.ScanChars(tree);
    RegexPrefix prefix = RegexFCD.Prefix(tree);
    bool rtl = (tree._options & RegexOptions.RightToLeft) != 0;

    CultureInfo culture;
    if ((tree._options & RegexOptions.CultureInvariant) != 0)
        culture = CultureInfo.InvariantCulture;
    else
        culture = CultureInfo.CurrentCulture;

    // A Boyer-Moore matcher is only built for a non-empty literal prefix.
    RegexBoyerMoore bmPrefix = null;
    if (prefix != null && prefix.Prefix.Length > 0)
        bmPrefix = new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture);

    int anchors = RegexFCD.Anchors(tree);

    return new RegexCode(_emitted, _stringtable, _trackcount, _caps, capsize, bmPrefix, fcPrefix, scPrefix, anchors, rtl);
}
// The main RegexCode generator. Emits the code for a single visit of a
// node during the depth-first walk done by RegexCodeFromRegexTree.
//
// nodetype - node._type for a leaf, or node._type OR'ed with BeforeChild
//            or AfterChild when called before / after visiting a child
//            of an interior node.
// node     - the node being visited.
// CurIndex - index of the child concerned (0 for a leaf).
//
// The destination of a forward jump is not known when the jump is
// emitted, so the jump's position is pushed on the int stack and the
// destination is filled in later with PatchJump.
//
// Throws the ArgumentException built by MakeException when nodetype is
// not handled by any case.
internal void EmitFragment(int nodetype, RegexNode node, int CurIndex) {
int bits = 0;
// Only node types up to RegexNode.Ref receive the right-to-left and
// case-insensitive modifier bits.
if (nodetype <= RegexNode.Ref) {
if (node.UseOptionR())
bits |= RegexCode.Rtl;
if ((node._options & RegexOptions.IgnoreCase) != 0)
bits |= RegexCode.Ci;
}
switch (nodetype) {
// Nothing is emitted for these.
case RegexNode.Concatenate | BeforeChild:
case RegexNode.Concatenate | AfterChild:
case RegexNode.Empty:
break;
// Every alternative except the last one starts with a Lazybranch whose
// destination is patched once that alternative has been emitted.
case RegexNode.Alternate | BeforeChild:
if (CurIndex < node._children.Count - 1) {
PushInt(CurPos());
Emit(RegexCode.Lazybranch, 0);
}
break;
case RegexNode.Alternate | AfterChild: {
if (CurIndex < node._children.Count - 1) {
// Not the last alternative: emit a Goto (left on the stack to be
// patched at the end) and point this alternative's Lazybranch at the
// position right after that Goto.
int LBPos = PopInt();
PushInt(CurPos());
Emit(RegexCode.Goto, 0);
PatchJump(LBPos, CurPos());
}
else {
// Last alternative: patch the pending Gotos, one per earlier
// alternative, to the current position.
int I;
for (I = 0; I < CurIndex; I++) {
PatchJump(PopInt(), CurPos());
}
}
break;
}
// Only the visit before the first child emits anything: Setjump, a
// Lazybranch to be patched later, the Testref on the mapped capture
// number, then Forejump.
case RegexNode.Testref | BeforeChild:
switch (CurIndex) {
case 0:
Emit(RegexCode.Setjump);
PushInt(CurPos());
Emit(RegexCode.Lazybranch, 0);
Emit(RegexCode.Testref, MapCapnum(node._m));
Emit(RegexCode.Forejump);
break;
}
break;
case RegexNode.Testref | AfterChild:
switch (CurIndex) {
case 0: {
// After the first child: emit a Goto (pushed for later patching),
// point the pending Lazybranch at the position after it, and emit
// Forejump there.
int Branchpos = PopInt();
PushInt(CurPos());
Emit(RegexCode.Goto, 0);
PatchJump(Branchpos, CurPos());
Emit(RegexCode.Forejump);
if (node._children.Count > 1)
break;
// else fall through: with a single child there is no later visit,
// so the Goto is patched right away.
goto case 1;
}
case 1:
// Patch the pending Goto to the current position.
PatchJump(PopInt(), CurPos());
break;
}
break;
// Only the visit before the first child emits anything: Setjump,
// Setmark, and a Lazybranch to be patched later.
case RegexNode.Testgroup | BeforeChild:
switch (CurIndex) {
case 0:
Emit(RegexCode.Setjump);
Emit(RegexCode.Setmark);
PushInt(CurPos());
Emit(RegexCode.Lazybranch, 0);
break;
}
break;
case RegexNode.Testgroup | AfterChild:
switch (CurIndex) {
case 0:
// After the first child.
Emit(RegexCode.Getmark);
Emit(RegexCode.Forejump);
break;
case 1:
// After the second child: emit a Goto (pushed for later patching),
// point the pending Lazybranch at the position after it, and emit
// Getmark and Forejump there.
int Branchpos = PopInt();
PushInt(CurPos());
Emit(RegexCode.Goto, 0);
PatchJump(Branchpos, CurPos());
Emit(RegexCode.Getmark);
Emit(RegexCode.Forejump);
if (node._children.Count > 2)
break;
// else fall through: with no third child there is no later visit,
// so the Goto is patched right away.
goto case 2;
case 2:
// Patch the pending Goto to the current position.
PatchJump(PopInt(), CurPos());
break;
}
break;
case RegexNode.Loop | BeforeChild:
case RegexNode.Lazyloop | BeforeChild:
// The counted form is used when the upper bound is finite or the lower
// bound is above 1; otherwise the mark-based form is used.
if (node._n < infinite || node._m > 1)
Emit(node._m == 0 ? RegexCode.Nullcount : RegexCode.Setcount, node._m == 0 ? 0 : 1 - node._m);
else
Emit(node._m == 0 ? RegexCode.Nullmark : RegexCode.Setmark);
// With a lower bound of 0, a Goto is emitted ahead of the loop body;
// the AfterChild visit patches it to the branch instruction.
if (node._m == 0) {
PushInt(CurPos());
Emit(RegexCode.Goto, 0);
}
// Remember where the loop body starts; it becomes the operand of the
// branch instruction emitted in the AfterChild visit.
PushInt(CurPos());
break;
case RegexNode.Loop | AfterChild:
case RegexNode.Lazyloop | AfterChild: {
int StartJumpPos = CurPos();
// Difference between the visited node type and Loop (0 for Loop); it is
// added to the branch opcode below.
// NOTE(review): presumably the lazy opcode variants sit at the same
// offset from Branchcount / Branchmark as Lazyloop does from Loop -- confirm.
int Lazy = (nodetype - (RegexNode.Loop | AfterChild));
if (node._n < infinite || node._m > 1)
Emit(RegexCode.Branchcount + Lazy, PopInt(), node._n == infinite ? infinite : node._n - node._m);
else
Emit(RegexCode.Branchmark + Lazy, PopInt());
// Point the Goto emitted for a lower bound of 0 at the branch instruction.
if (node._m == 0)
PatchJump(PopInt(), StartJumpPos);
}
break;
// Nothing is emitted for a plain group.
case RegexNode.Group | BeforeChild:
case RegexNode.Group | AfterChild:
break;
// A capture is bracketed by Setmark and a Capturemark carrying the
// mapped values of node._m and node._n.
case RegexNode.Capture | BeforeChild:
Emit(RegexCode.Setmark);
break;
case RegexNode.Capture | AfterChild:
Emit(RegexCode.Capturemark, MapCapnum(node._m), MapCapnum(node._n));
break;
case RegexNode.Require | BeforeChild:
// NOTE: the following line causes lookahead/lookbehind to be
// NON-BACKTRACKING. It can be commented out with (*)
Emit(RegexCode.Setjump);
Emit(RegexCode.Setmark);
break;
case RegexNode.Require | AfterChild:
Emit(RegexCode.Getmark);
// NOTE: the following line causes lookahead/lookbehind to be
// NON-BACKTRACKING. It can be commented out with (*)
Emit(RegexCode.Forejump);
break;
// Setjump and a Lazybranch are emitted before the child; after it comes
// a Backjump, and the Lazybranch is pointed at the Forejump that follows.
case RegexNode.Prevent | BeforeChild:
Emit(RegexCode.Setjump);
PushInt(CurPos());
Emit(RegexCode.Lazybranch, 0);
break;
case RegexNode.Prevent | AfterChild:
Emit(RegexCode.Backjump);
PatchJump(PopInt(), CurPos());
Emit(RegexCode.Forejump);
break;
// The child is bracketed by Setjump and Forejump.
case RegexNode.Greedy | BeforeChild:
Emit(RegexCode.Setjump);
break;
case RegexNode.Greedy | AfterChild:
Emit(RegexCode.Forejump);
break;
// Single character: the node type (with modifier bits) is the opcode and
// the character is its operand.
case RegexNode.One:
case RegexNode.Notone:
Emit(node._type | bits, (int)node._ch);
break;
// Single-character loops: a lower bound above 0 is emitted as a fixed
// repetition of node._m; anything beyond it (node._n > node._m) is
// emitted as a loop over the remaining count, or infinite.
case RegexNode.Notoneloop:
case RegexNode.Notonelazy:
case RegexNode.Oneloop:
case RegexNode.Onelazy:
if (node._m > 0)
Emit(((node._type == RegexNode.Oneloop || node._type == RegexNode.Onelazy) ?
RegexCode.Onerep : RegexCode.Notonerep) | bits, (int)node._ch, node._m);
if (node._n > node._m)
Emit(node._type | bits, (int)node._ch, node._n == infinite ?
infinite : node._n - node._m);
break;
// Set loops follow the same scheme, with the string-table indices of
// node._str and node._str2 as the leading operands.
case RegexNode.Setloop:
case RegexNode.Setlazy:
if (node._m > 0)
Emit(RegexCode.Setrep | bits, StringCode(node._str), StringCode(node._str2), node._m);
if (node._n > node._m)
Emit(node._type | bits, StringCode(node._str), StringCode(node._str2),
(node._n == infinite) ? infinite : node._n - node._m);
break;
// The operand is the string-table index of node._str.
case RegexNode.Multi:
Emit(node._type | bits, StringCode(node._str));
break;
// The operands are the string-table indices of node._str and node._str2.
case RegexNode.Set:
Emit(node._type | bits, StringCode(node._str), StringCode(node._str2));
break;
// The operand is the mapped capture number.
case RegexNode.Ref:
Emit(node._type | bits, MapCapnum(node._m));
break;
// These node types are emitted as-is: no operand and no modifier bits.
case RegexNode.Nothing:
case RegexNode.Bol:
case RegexNode.Eol:
case RegexNode.Boundary:
case RegexNode.Nonboundary:
case RegexNode.ECMABoundary:
case RegexNode.NonECMABoundary:
case RegexNode.Beginning:
case RegexNode.Start:
case RegexNode.EndZ:
case RegexNode.End:
Emit(node._type);
break;
default:
throw MakeException("Unexpected Opcode"); //XXX: SR.GetString(SR.UnexpectedOpcode, nodetype.ToString()));
}
}
}
}