2008-03-05 09:52:00 -05:00
|
|
|
//------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
//------------------------------------------------------------------------------

// The RegexBoyerMoore object precomputes the Boyer-Moore
// tables for fast string scanning. These tables allow
// you to scan for the first occurrence of a string within
// a large body of text without examining every character.
// The performance of the heuristic depends on the actual
// string and the text being searched, but usually, the longer
// the string that is being searched for, the fewer characters
// need to be examined.
//
|
|
|
|
|
|
|
|
namespace System.Text.RegularExpressions
|
|
|
|
{
|
|
|
|
|
|
|
|
using System.Collections;
|
|
|
|
using System.Diagnostics;
|
|
|
|
using System.Globalization;
|
|
|
|
|
|
|
|
internal sealed class RegexBoyerMoore {
    // Good-suffix shift table, indexed by pattern position. When the first
    // mismatch (comparing from the pattern's tail) happens at position i,
    // the scanner may move by _positive[i]. Entries are negative for
    // right-to-left patterns.
    internal int[] _positive;

    // Bad-character shift table for characters below 128. The constructor
    // swaps in a 256-entry array when the pattern contains a character in
    // the range 128..255, so the length is not always 128.
    internal int[] _negativeASCII;

    // Bad-character shift table for characters >= 128, indexed as
    // [ch >> 8][ch & 0xFF]. Null when the pattern has no such character;
    // individual pages are null when no pattern character falls in them.
    internal int[][] _negativeUnicode;

    // The pattern being searched for (already lower-cased when the search
    // is case-insensitive).
    internal String _pattern;

    // Lowest and highest character codes below 128 that occur in the
    // pattern. Computed by the constructor; not read inside this class.
    internal int _lowASCII;
    internal int _highASCII;

    internal bool _rightToLeft;
    internal bool _caseInsensitive;

    // Culture used for every case-folding operation of this instance.
    // May be null, in which case the current culture is used.
    internal CultureInfo _culture;

    // Not referenced inside this class.
    internal const int infinite = 0x7FFFFFFF;

    // Constructs a Boyer-Moore state machine for searching for the string
    // pattern. The string must not be zero-length.
    //
    //   pattern         - the literal text to search for
    //   caseInsensitive - when true, pattern and text are lower-cased
    //                     (per character, using culture) before comparing
    //   rightToLeft     - when true, the tables are built for scanning
    //                     backwards through the text
    //   culture         - culture used for lower-casing; null means the
    //                     current culture
    internal RegexBoyerMoore(String pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) {
        Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf");

        int beforefirst;
        int last;
        int bump;
        int examine;
        int scan;
        int match;
        char ch;

        _rightToLeft = rightToLeft;
        _caseInsensitive = caseInsensitive;
        _culture = culture;

        if (caseInsensitive) {
            // Fold one char at a time with the culture that Scan and IsMatch
            // also use, so the tables and the scanned text always agree.
            // (Folding with the ambient thread culture here would silently
            // ignore the culture the caller asked for.)
            CultureInfo foldCulture = FoldingCulture();
            char[] folded = pattern.ToCharArray();

            for (int n = 0; n < folded.Length; n++)
                folded[n] = Char.ToLower(folded[n], foldCulture);

            pattern = new String(folded);
        }

        _pattern = pattern;

        // "last" is the pattern position compared first, "beforefirst" is
        // one step past the position compared last, and "bump" is the
        // direction in which the scan moves through the text.
        if (!rightToLeft) {
            beforefirst = -1;
            last = pattern.Length - 1;
            bump = 1;
        }
        else {
            beforefirst = pattern.Length;
            last = 0;
            bump = -1;
        }

        // PART I - the good-suffix shift table
        //
        // compute the positive requirement:
        // if char "i" is the first one from the right that doesn't match,
        // then we know the matcher can advance by _positive[i].
        //
        _positive = new int[pattern.Length];

        examine = last;
        ch = pattern[examine];
        _positive[examine] = bump;
        examine -= bump;

        for (;;) {
            // find an internal char (examine) that matches the tail

            for (;;) {
                if (examine == beforefirst)
                    goto OuterloopBreak;
                if (pattern[examine] == ch)
                    break;
                examine -= bump;
            }

            match = last;
            scan = examine;

            // find the length of the match

            for (;;) {
                if (scan == beforefirst || pattern[match] != pattern[scan]) {
                    // at the end of the match, note the difference in _positive
                    // this is not the length of the match, but the distance from the internal match
                    // to the tail suffix.
                    // Only the first (nearest) occurrence is recorded, so an
                    // already-filled entry is left alone.
                    if (_positive[match] == 0)
                        _positive[match] = match - scan;

                    break;
                }

                scan -= bump;
                match -= bump;
            }

            examine -= bump;
        }

        OuterloopBreak:

        match = last - bump;

        // scan for the chars for which there are no shifts that yield a different candidate

        while (match != beforefirst) {
            if (_positive[match] == 0)
                _positive[match] = bump;

            match -= bump;
        }

        // PART II - the bad-character shift table
        //
        // compute the negative requirement:
        // if char "ch" is the reject character when testing position "i",
        // we can slide up by _negative[ch];
        // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
        //
        // the lookup table is divided into ASCII and Unicode portions;
        // only those parts of the Unicode 16-bit code set that actually
        // appear in the string are in the table. (Maximum size with
        // Unicode is 65K; ASCII only case is 512 bytes.)

        _negativeASCII = new int[128];

        // Default advance for a char that is not in the pattern at all:
        // +Length when scanning left-to-right, -Length when right-to-left.
        for (int i = 0; i < 128; i++)
            _negativeASCII[i] = last - beforefirst;

        _lowASCII = 127;
        _highASCII = 0;

        for (examine = last; examine != beforefirst; examine -= bump) {
            ch = pattern[examine];

            if (ch < 128) {
                if (_lowASCII > ch)
                    _lowASCII = ch;

                if (_highASCII < ch)
                    _highASCII = ch;

                // Keep the value from the occurrence nearest the tail; an
                // entry still holding the default has not been set yet.
                if (_negativeASCII[ch] == last - beforefirst)
                    _negativeASCII[ch] = last - examine;
            }
            else {
                int i = ch >> 8;
                int j = ch & 0xFF;

                if (_negativeUnicode == null) {
                    _negativeUnicode = new int[256][];
                }

                if (_negativeUnicode[i] == null) {
                    int[] newarray = new int[256];

                    for (int k = 0; k < 256; k++)
                        newarray[k] = last - beforefirst;

                    if (i == 0) {
                        // Page 0 covers the ASCII range as well, so it
                        // takes over as the ASCII table to keep one copy.
                        System.Array.Copy(_negativeASCII, newarray, 128);
                        _negativeASCII = newarray;
                    }

                    _negativeUnicode[i] = newarray;
                }

                if (_negativeUnicode[i][j] == last - beforefirst)
                    _negativeUnicode[i][j] = last - examine;
            }
        }
    }

    // Returns the culture used for case folding: the one supplied to the
    // constructor, or the current culture when none was supplied.
    private CultureInfo FoldingCulture() {
        return (_culture != null) ? _culture : CultureInfo.CurrentCulture;
    }

    // Compares the pattern with the text starting at position start, char
    // by char. This is the same notion of equality Scan uses: ordinal, with
    // each text char lower-cased through the folding culture when the search
    // is case-insensitive. A culture-sensitive String.Compare is avoided on
    // purpose, because linguistic comparison ignores some characters and
    // could disagree with Scan.
    private bool MatchesAt(String text, int start) {
        if (text == null || start < 0 || text.Length - start < _pattern.Length)
            return false;

        if (!_caseInsensitive)
            return 0 == String.CompareOrdinal(_pattern, 0, text, start, _pattern.Length);

        CultureInfo foldCulture = FoldingCulture();

        for (int i = 0; i < _pattern.Length; i++) {
            if (Char.ToLower(text[start + i], foldCulture) != _pattern[i])
                return false;
        }

        return true;
    }

    // When a regex is anchored, we can do a quick IsMatch test instead of a Scan
    //
    // Left-to-right: returns true when the pattern occurs at
    // text[index .. index + Length) and that range lies within
    // [beglimit, endlimit). Right-to-left: returns true when the pattern
    // occurs at text[index - Length .. index) and that range lies within
    // [beglimit, endlimit]. Returns false when the range does not fit.
    internal bool IsMatch(String text, int index, int beglimit, int endlimit) {
        if (!_rightToLeft) {
            if (index < beglimit || endlimit - index < _pattern.Length)
                return false;

            return MatchesAt(text, index);
        }
        else {
            if (index > endlimit || index - beglimit < _pattern.Length)
                return false;

            return MatchesAt(text, index - _pattern.Length);
        }
    }

    // Scan uses the Boyer-Moore algorithm to find the first occurrence
    // of the specified string within text, beginning at index, and
    // constrained within beglimit and endlimit.
    //
    // The direction and case-sensitivity of the match is determined
    // by the arguments to the RegexBoyerMoore constructor.
    //
    // Returns -1 when there is no occurrence. Otherwise returns the
    // position of the first char of the occurrence when scanning
    // left-to-right, or the position just past its last char when
    // scanning right-to-left.
    internal int Scan(String text, int index, int beglimit, int endlimit) {
        int test;
        int test2;
        int match;
        int startmatch;
        int endmatch;
        int advance;
        int defadv;
        int bump;
        char chMatch;
        char chTest;
        int[] unicodeLookup;

        // Resolved once per scan; only needed for case-insensitive searches.
        CultureInfo foldCulture = _caseInsensitive ? FoldingCulture() : null;

        // "test" is the text position lined up with the pattern char that
        // is compared first (pattern[startmatch]).
        if (!_rightToLeft) {
            defadv = _pattern.Length;
            startmatch = _pattern.Length - 1;
            endmatch = 0;
            test = index + defadv - 1;
            bump = 1;
        }
        else {
            defadv = -_pattern.Length;
            startmatch = 0;
            endmatch = -defadv - 1;
            test = index + defadv;
            bump = -1;
        }

        chMatch = _pattern[startmatch];

        for (;;) {
            if (test >= endlimit || test < beglimit)
                return -1;

            chTest = text[test];

            if (_caseInsensitive)
                chTest = Char.ToLower(chTest, foldCulture);

            if (chTest != chMatch) {
                // Mismatch on the very first char: only the bad-character
                // table applies. A char that is absent from every table
                // lets the whole pattern slide past it.
                if (chTest < 128)
                    advance = _negativeASCII[chTest];
                else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8])))
                    advance = unicodeLookup[chTest & 0xFF];
                else
                    advance = defadv;

                test += advance;
            }
            else { // if (chTest == chMatch)
                test2 = test;
                match = startmatch;

                for (;;) {
                    if (match == endmatch)
                        return(_rightToLeft ? test2 + 1 : test2);

                    match -= bump;
                    test2 -= bump;

                    chTest = text[test2];

                    if (_caseInsensitive)
                        chTest = Char.ToLower(chTest, foldCulture);

                    if (chTest != _pattern[match]) {
                        advance = _positive[match];

                        // From here on test2 is reused to hold the
                        // bad-character advance, corrected for how far
                        // into the pattern the mismatch occurred.
                        if ((chTest & 0xFF80) == 0)
                            test2 = (match - startmatch) + _negativeASCII[chTest];
                        else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8])))
                            test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF];
                        else {
                            test += advance;
                            break;
                        }

                        // Take whichever of the two shifts moves further in
                        // the scan direction.
                        if (_rightToLeft ? test2 < advance : test2 > advance)
                            advance = test2;

                        test += advance;
                        break;
                    }
                }
            }
        }
    }

    // Used when dumping for debugging.
    public override String ToString() {
        return _pattern;
    }

#if DBG
    // Returns a multi-line description of the pattern and both shift
    // tables, each line prefixed with indent.
    public String Dump(String indent) {
        StringBuilder sb = new StringBuilder();

        sb.Append(indent + "BM Pattern: " + _pattern + "\n");
        sb.Append(indent + "Positive: ");
        for (int i = 0; i < _positive.Length; i++) {
            sb.Append(_positive[i].ToString() + " ");
        }
        sb.Append("\n");

        if (_negativeASCII != null) {
            // Entries still holding the default advance are skipped. The
            // default is negative for right-to-left patterns.
            int defaultAdvance = _rightToLeft ? -_pattern.Length : _pattern.Length;

            sb.Append(indent + "Negative table\n");
            for (int i = 0; i < _negativeASCII.Length; i++) {
                if (_negativeASCII[i] != defaultAdvance) {
                    sb.Append(indent + " " + Regex.Escape(Convert.ToString((char)i)) + " " + _negativeASCII[i].ToString() + "\n");
                }
            }
        }

        return sb.ToString();
    }
#endif
}
|
|
|
|
}
|