| | 1 | | namespace MoreStructures.KnuthMorrisPratt.PrefixFunction; |
| | 2 | |
|
| | 3 | | /// <inheritdoc cref="IPrefixFunctionCalculator" path="//*[not(self::summary or self::remarks)]"/> |
| | 4 | | /// <summary> |
| | 5 | | /// An implementation of <see cref="IPrefixFunctionCalculator"/> which takes advantage of the property that any |
| | 6 | | /// non-longest border is a also a border of the longest border, to calculate the Prefix Function efficiently. |
| | 7 | | /// </summary> |
| | 8 | | /// <remarks> |
| | 9 | | /// <para id="algo"> |
| | 10 | | /// ALGORITHM |
| | 11 | | /// <br/> |
| | 12 | | /// - The algorithm iterates over all the chars of the text, from the first to the last, calculating the Prefix |
| | 13 | | /// Function s for each index i. |
| | 14 | | /// <br/> |
| | 15 | | /// - The Prefix Function at index 0, <c>s(0) = 0</c>, since a 1-char string can't have any border. |
| | 16 | | /// <br/> |
| | 17 | | /// - For each any other index, it calculates the value of the Prefix Function based on values of s at lower |
| | 18 | | /// indexes and based on T. |
| | 19 | | /// <br/> |
| | 20 | | /// - if <c>T[i] = T[s(i - 1)]</c>, where <c>s(i - 1)</c> has been calculated at the previous step, then |
| | 21 | | /// <c>s(i) = s(i - 1) + 1</c>. |
| | 22 | | /// <br/> |
| | 23 | | /// - This is because <c>T[0..s(i - 1)]</c> is, by definition of s, a border of <c>T[0..i]</c> and, if |
| | 24 | | /// <c>T[i] = T[s(i - 1)]</c>, then <c>T[0..s(i - 1) + 1] = T[i - s(i)..i]</c>. So <c>T[0..s(i - 1) + 1]</c> is a |
| | 25 | | /// border of <c>T[0..(i + 1)]</c> and there can't be any longer border, since <c>s(i + 1) <= s(i) + 1</c> |
| | 26 | | /// (the Prefix Function can grow of at most 1 at each index). |
| | 27 | | /// <br/> |
| | 28 | | /// - if <c>T[i] != T[s(i - 1)]</c>, <c>s(i)</c> will be smaller than <c>s(i - 1) + 1</c>. While s can increase of |
| | 29 | | /// at most 1 at each index, it can decrease to 0 in a single step (if T[i] is such that there are no borders of |
| | 30 | | /// T[0..(i + 1)]). |
| | 31 | | /// <br/> |
| | 32 | | /// - Because a border of T[0..(i + 1)] (prefix of T including up to the i-th char) is also a border of T[0..i] |
| | 33 | | /// (prefix of T at previous step, i.e. including up to the (i - 1)-th char), EXCEPT for the char T[i], which |
| | 34 | | /// would come just after the border of T[0..i], the longest border of T[0..(i + 1)] (whose length is s(i)), |
| | 35 | | /// can be found by looking for the longest border of T[0..i] followed by T[i]. |
| | 36 | | /// <br/> |
| | 37 | | /// - If such a border of length w is found, then there is a border of T[0..i], T[0..w], followed by T[w] = T[i]. |
| | 38 | | /// <br/> |
| | 39 | | /// - Which means that <c>T[0..(w + 1)] == T[(i - w)..(i + 1)]</c>. |
| | 40 | | /// <br/> |
| | 41 | | /// - Therefore, T[0..(w + 1)] is a border of T[0..(i + 1)] and <c>s(i + 1) = w</c>. |
| | 42 | | /// <br/> |
| | 43 | | /// - If no borders are found, the Prefix Function at index i is zero. |
| | 44 | | /// </para> |
| | 45 | | /// <para id="complexity"> |
| | 46 | | /// COMPLEXITY |
| | 47 | | /// <br/> |
| | 48 | | /// - There are as many top-level iterations as indexes of the Prefix Function and chars of the text. |
| | 49 | | /// <br/> |
| | 50 | | /// - For each top-level iteration, there are at most as many sub-iterations as borders of the prefix, each one |
| | 51 | | /// checking a single char of the text. In the worst case there are O(n) borders, meaning that each sub-iteration |
| | 52 | | /// requires O(n) char comparisons. |
| | 53 | | /// <br/> |
| | 54 | | /// - That would seem to imply that the number of sub-iterations in total, throughout all top-level iterations, |
| | 55 | | /// would be quadratic. |
| | 56 | | /// <br/> |
| | 57 | | /// - However, the length of the border calculated by the inner loop is always non-negative and is increased of at |
| | 58 | | /// most n times (since it is increased of at most 1 per char of text). |
| | 59 | | /// <br/> |
| | 60 | | /// - Moreover, it is progressively shorten by the inner loop, decreased of at least 1 per sub-iteration. This is |
| | 61 | | /// because, when the char following the current border is different than the current char (which is the |
| | 62 | | /// condition of the inner loop), the next border is the longest border of the current border, which is strictly |
| | 63 | | /// shorter than the current border. |
| | 64 | | /// <br/> |
| | 65 | | /// - Therefore the length of the border can reach at most n thoughout all top-level iterations, and is always |
| | 66 | | /// non-negative and decreased by 1 at each sub-iteration, which means that the total number of inner loop |
| | 67 | | /// iterations through all top-level iterations is O(n). |
| | 68 | | /// <br/> |
| | 69 | | /// - All accesses to values calculated in previous steps and to chars of the text are done in constant time. |
| | 70 | | /// <br/> |
| | 71 | | /// - An array of integers, as big as the numbers of chars in the text, is allocated at the beginning, to memoize |
| | 72 | | /// the value calculated at each step, since that value may be used in following iterations. |
| | 73 | | /// <br/> |
| | 74 | | /// - No other data structure is allocated by the algorithm. |
| | 75 | | /// <br/> |
| | 76 | | /// - Therefore, Time Complexity is O(n) and Space Complexity is O(n). |
| | 77 | | /// </para> |
| | 78 | | /// </remarks> |
| | 79 | | public class FastPrefixFunctionCalculator : IPrefixFunctionCalculator |
| | 80 | | { |
| | 81 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 82 | | /// <remarks> |
| | 83 | | /// <inheritdoc cref="FastPrefixFunctionCalculator" path="/remarks"/> |
| | 84 | | /// </remarks> |
| | 85 | | public IEnumerable<int> GetValues(string text) |
| 38 | 86 | | { |
| 38 | 87 | | if (text.Length == 0) |
| 2 | 88 | | yield break; |
| | 89 | |
|
| 36 | 90 | | var s = new int[text.Length]; |
| 36 | 91 | | var i = 0; |
| | 92 | |
|
| | 93 | | // The 1-char prefix of text cannot have any border => s[0] = 0 |
| 36 | 94 | | s[0] = 0; |
| 36 | 95 | | yield return s[i++]; |
| | 96 | |
|
| | 97 | | // All indexes > 0 are calculated based on results from previous steps |
| 286 | 98 | | while (i < text.Length) |
| 250 | 99 | | { |
| 250 | 100 | | var w = s[i - 1]; |
| 250 | 101 | | var found = text[w] == text[i]; |
| 287 | 102 | | while (w > 0 && !found) |
| 37 | 103 | | { |
| 37 | 104 | | w = s[w - 1]; |
| 37 | 105 | | found = text[w] == text[i]; |
| 37 | 106 | | } |
| | 107 | |
|
| 250 | 108 | | yield return s[i] = (found ? w + 1 : 0); |
| | 109 | |
|
| 250 | 110 | | i++; |
| 250 | 111 | | } |
| 36 | 112 | | } |
| | 113 | | } |