| | 1 | | using MoreStructures.KnuthMorrisPratt.PrefixFunction; |
| | 2 | | using MoreStructures.Strings.Matching; |
| | 3 | |
|
| | 4 | | namespace MoreStructures.KnuthMorrisPratt.Matching; |
| | 5 | |
|
| | 6 | | /// <summary> |
| | 7 | | /// Exposes utility methods to match a text against a pattern, using the Knuth-Morris-Pratt algorithm. |
| | 8 | | /// </summary> |
| | 9 | | public static class Matcher |
| | 10 | | { |
| 1 | 11 | | private static readonly IPrefixFunctionCalculator PrefixFunctionCalculator = new FastPrefixFunctionCalculator(); |
| | 12 | |
|
| | 13 | | /// <summary> |
| | 14 | | /// Tries to match the provided <paramref name="pattern"/> against the provided <paramref name="text"/>, using the |
| | 15 | | /// Knuth-Morris-Pratt algorithm. Returns all matches found. |
| | 16 | | /// </summary> |
| | 17 | | /// <param name="text">The text, to match the pattern against.</param> |
| | 18 | | /// <param name="pattern">The pattern to match.</param> |
| | 19 | | /// <param name="separator"> |
| | 20 | | /// A special character, to be used as separator in the concatenated string, between <paramref name="pattern"/> and |
| | 21 | | /// <paramref name="text"/>. The special character is required by the algorithm and must be absent from both the |
| | 22 | | /// <paramref name="pattern"/> and the <paramref name="text"/>. |
| | 23 | | /// </param> |
| | 24 | | /// <param name="validateSeparator"> |
| | 25 | | /// Whether an additional validation check should be performed, before actually running the algorithm, on whether |
| | 26 | | /// <paramref name="text"/> and <paramref name="pattern"/> contain <paramref name="separator"/> or not. The check |
| | 27 | | /// requires going through all chars of text and pattern, and can be expensive. This is why it has to be |
| | 28 | | /// explicitely opted-in. |
| | 29 | | /// </param> |
| | 30 | | /// <returns> |
| | 31 | | /// A sequence of all matches, from the one starting at the lowest index in <paramref name="text"/> to the one |
| | 32 | | /// starting at the highest index. |
| | 33 | | /// </returns> |
| | 34 | | /// <remarks> |
| | 35 | | /// <para id="algo"> |
| | 36 | | /// ALGORITHM |
| | 37 | | /// <br/> |
| | 38 | | /// The Knuth-Morris-Pratt algorithm: |
| | 39 | | /// <br/> |
| | 40 | | /// - First builds a concatenated string in the form "pattern-separator-text". |
| | 41 | | /// <br/> |
| | 42 | | /// - Then calculates the Prefix Function of the concatenated string. |
| | 43 | | /// <br/> |
| | 44 | | /// - Finally, iterates over the Prefix Function, retaining all indexes i such that the value of the Prefix |
| | 45 | | /// Function at i is equal to the length of the pattern. That means that the prefix of the concatenated |
| | 46 | | /// string from 0 to i (included) has a border of length equal to the length of the pattern, which means that |
| | 47 | | /// the prefix contains the pattern. |
| | 48 | | /// <br/> |
| | 49 | | /// - For each retained index i, a successful match is emitted, starting at index i - 2 * m and matching |
| | 50 | | /// exactly m chars, where m is the length of the pattern. |
| | 51 | | /// </para> |
| | 52 | | /// <para id="complexity"> |
| | 53 | | /// COMPLEXITY |
| | 54 | | /// <br/> |
| | 55 | | /// - The only validation check which is not done in constant time is whether text and pattern contain the |
| | 56 | | /// separator. This validation check is by default not done, and has to be opted in via the flag |
| | 57 | | /// <paramref name="validateSeparator"/>. When executed it takes O(n + m) time, since it has to compare the |
| | 58 | | /// separator against each char of the text and of the pattern. |
| | 59 | | /// <br/> |
| | 60 | | /// - The concatenated string has length n + m + 1, where n is the length of the text and m is the length of |
| | 61 | | /// the pattern. Building it requires iterating both over the pattern and the text. |
| | 62 | | /// <br/> |
| | 63 | | /// - Calculating the Prefix Function of the concatenated string via <see cref="FastPrefixFunctionCalculator"/> |
| | 64 | | /// is a O(n + m + 1) operation, both in time and space. |
| | 65 | | /// <br/> |
| | 66 | | /// - Iterating over the indexes of the Prefix Function which refer to the text is an O(n) operation. |
| | 67 | | /// <br/> |
| | 68 | | /// - Each iteration makes a constant-time comparison and yield the result. |
| | 69 | | /// <br/> |
| | 70 | | /// - Therefore, both Time and Space Complexity are O(n + m). |
| | 71 | | /// </para> |
| | 72 | | /// </remarks> |
| | 73 | | public static IEnumerable<Match<int>> Match( |
| | 74 | | string text, string pattern, char separator, bool validateSeparator = false) |
| 26 | 75 | | { |
| 26 | 76 | | if (string.IsNullOrEmpty(pattern)) |
| 1 | 77 | | throw new ArgumentException("Must be non-empty.", nameof(pattern)); |
| | 78 | |
|
| 25 | 79 | | if (validateSeparator) |
| 12 | 80 | | { |
| 12 | 81 | | if (text.Contains(separator)) |
| 4 | 82 | | throw new ArgumentException($"Should not contain {nameof(separator)}.", nameof(text)); |
| 8 | 83 | | if (pattern.Contains(separator)) |
| 2 | 84 | | throw new ArgumentException($"Should not contain {nameof(separator)}.", nameof(pattern)); |
| 6 | 85 | | } |
| | 86 | |
|
| 19 | 87 | | return MatchEnumerable(text, pattern, separator); |
| 19 | 88 | | } |
| | 89 | |
|
| | 90 | | private static IEnumerable<Match<int>> MatchEnumerable( |
| | 91 | | string text, string pattern, char separator) |
| 19 | 92 | | { |
| 19 | 93 | | if (string.IsNullOrEmpty(text)) |
| 1 | 94 | | yield break; |
| | 95 | |
|
| 18 | 96 | | var patternAndText = string.Concat(pattern, separator, text); |
| 18 | 97 | | var prefixFunction = PrefixFunctionCalculator.GetValues(patternAndText).ToList(); |
| 18 | 98 | | var m = pattern.Length; |
| | 99 | |
|
| 304 | 100 | | for (var i = m + 1; i < patternAndText.Length; i++) |
| 134 | 101 | | if (prefixFunction[i] == m) |
| 22 | 102 | | yield return new Match<int>(true, i - m * 2, m, i); |
| 18 | 103 | | } |
| | 104 | | } |