//------------------------------------------------------------------------------ // // // Copyright (c) Microsoft Corporation. All rights reserved. // // //------------------------------------------------------------------------------ // The RegexBoyerMoore object precomputes the Boyer-Moore // tables for fast string scanning. These tables allow // you to scan for the first occurrence of a string within // a large body of text without examining every character. // The performance of the heuristic depends on the actual // string and the text being searched, but usually, the longer // the string that is being searched for, the fewer characters // need to be examined. // namespace System.Text.RegularExpressions { using System.Collections; using System.Diagnostics; using System.Globalization; internal sealed class RegexBoyerMoore { internal int[] _positive; internal int[] _negativeASCII; internal int[][] _negativeUnicode; internal String _pattern; internal int _lowASCII; internal int _highASCII; internal bool _rightToLeft; internal bool _caseInsensitive; internal CultureInfo _culture; internal const int infinite = 0x7FFFFFFF; // Constructs a Boyer-Moore state machine for searching for the string // pattern. The string must not be zero-length. internal RegexBoyerMoore(String pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) { Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf"); int beforefirst; int last; int bump; int examine; int scan; int match; char ch; if (caseInsensitive) { // pattern = pattern.ToLower(culture); pattern = pattern.ToLower(); } _pattern = pattern; _rightToLeft = rightToLeft; _caseInsensitive = caseInsensitive; _culture = culture; if (!rightToLeft) { beforefirst = -1; last = pattern.Length - 1; bump = 1; } else { beforefirst = pattern.Length; last = 0; bump = -1; } // PART I - the good-suffix shift table // // compute the positive requirement: // if char "i" is the first one from the right that doesn't match, // then we know the matcher can advance by _positive[i]. // _positive = new int[pattern.Length]; examine = last; ch = pattern[examine]; _positive[examine] = bump; examine -= bump; for (;;) { // find an internal char (examine) that matches the tail for (;;) { if (examine == beforefirst) goto OuterloopBreak; if (pattern[examine] == ch) break; examine -= bump; } match = last; scan = examine; // find the length of the match for (;;) { if (scan == beforefirst || pattern[match] != pattern[scan]) { // at the end of the match, note the difference in _positive // this is not the length of the match, but the distance from the internal match // to the tail suffix. if (_positive[match] == 0) _positive[match] = match - scan; // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan)); break; } scan -= bump; match -= bump; } examine -= bump; } OuterloopBreak: match = last - bump; // scan for the chars for which there are no shifts that yield a different candidate while (match != beforefirst) { if (_positive[match] == 0) _positive[match] = bump; match -= bump; } //System.Diagnostics.Debug.WriteLine("good suffix shift table:"); //for (int i=0; i<_positive.Length; i++) // System.Diagnostics.Debug.WriteLine("\t_positive[" + i + "] = " + _positive[i]); // PART II - the bad-character shift table // // compute the negative requirement: // if char "ch" is the reject character when testing position "i", // we can slide up by _negative[ch]; // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) // // the lookup table is divided into ASCII and Unicode portions; // only those parts of the Unicode 16-bit code set that actually // appear in the string are in the table. (Maximum size with // Unicode is 65K; ASCII only case is 512 bytes.) _negativeASCII = new int[128]; for (int i = 0; i < 128; i++) _negativeASCII[i] = last - beforefirst; _lowASCII = 127; _highASCII = 0; for (examine = last; examine != beforefirst; examine -= bump) { ch = pattern[examine]; if (ch < 128) { if (_lowASCII > ch) _lowASCII = ch; if (_highASCII < ch) _highASCII = ch; if (_negativeASCII[ch] == last - beforefirst) _negativeASCII[ch] = last - examine; } else { int i = ch >> 8; int j = ch & 0xFF; if (_negativeUnicode == null) { _negativeUnicode = new int[256][]; } if (_negativeUnicode[i] == null) { int[] newarray = new int[256]; for (int k = 0; k < 256; k++) newarray[k] = last - beforefirst; if (i == 0) { System.Array.Copy(_negativeASCII, newarray, 128); _negativeASCII = newarray; } _negativeUnicode[i] = newarray; } if (_negativeUnicode[i][j] == last - beforefirst) _negativeUnicode[i][j] = last - examine; } } } // When a regex is anchored, we can do a quick IsMatch test instead of a Scan internal bool IsMatch(String text, int index, int beglimit, int endlimit) { if (!_rightToLeft) { if (index < beglimit || endlimit - index < _pattern.Length) return false; return(0 == String.Compare(_pattern, 0, text, index, _pattern.Length, _caseInsensitive));//, _culture)); } else { if (index > endlimit || index - beglimit < _pattern.Length) return false; return(0 == String.Compare(_pattern, 0, text, index - _pattern.Length, _pattern.Length, _caseInsensitive));//, _culture)); } } // Scan uses the Boyer-Moore algorithm to find the first occurrence // of the specified string within text, beginning at index, and // constrained within beglimit and endlimit. // // The direction and case-sensitivity of the match is determined // by the arguments to the RegexBoyerMoore constructor. internal int Scan(String text, int index, int beglimit, int endlimit) { int test; int test2; int match; int startmatch; int endmatch; int advance; int defadv; int bump; char chMatch; char chTest; int[] unicodeLookup; if (!_rightToLeft) { defadv = _pattern.Length; startmatch = _pattern.Length - 1; endmatch = 0; test = index + defadv - 1; bump = 1; } else { defadv = -_pattern.Length; startmatch = 0; endmatch = -defadv - 1; test = index + defadv; bump = -1; } chMatch = _pattern[startmatch]; for (;;) { if (test >= endlimit || test < beglimit) return -1; chTest = text[test]; if (_caseInsensitive) chTest = Char.ToLower(chTest);//, _culture); if (chTest != chMatch) { if (chTest < 128) advance = _negativeASCII[chTest]; else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) advance = unicodeLookup[chTest & 0xFF]; else advance = defadv; test += advance; } else { // if (chTest == chMatch) test2 = test; match = startmatch; for (;;) { if (match == endmatch) return(_rightToLeft ? test2 + 1 : test2); match -= bump; test2 -= bump; chTest = text[test2]; if (_caseInsensitive) chTest = Char.ToLower(chTest);//, _culture); if (chTest != _pattern[match]) { advance = _positive[match]; if ((chTest & 0xFF80) == 0) test2 = (match - startmatch) + _negativeASCII[chTest]; else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF]; else { test += advance; break; } if (_rightToLeft ? test2 < advance : test2 > advance) advance = test2; test += advance; break; } } } } } // Used when dumping for debugging. public override String ToString() { return _pattern; } #if DBG public String Dump(String indent) { StringBuilder sb = new StringBuilder(); sb.Append(indent + "BM Pattern: " + _pattern + "\n"); sb.Append(indent + "Positive: "); for (int i = 0; i < _positive.Length; i++) { sb.Append(_positive[i].ToString() + " "); } sb.Append("\n"); if (_negativeASCII != null) { sb.Append(indent + "Negative table\n"); for (int i = 0; i < _negativeASCII.Length; i++) { if (_negativeASCII[i] != _pattern.Length) { sb.Append(indent + " " + Regex.Escape(Convert.ToString((char)i)) + " " + _negativeASCII[i].ToString() + "\n"); } } } return sb.ToString(); } #endif } }