| | 1 | | using MoreLinq; |
| | 2 | | using MoreStructures.Utilities; |
| | 3 | |
|
| | 4 | | namespace MoreStructures.SuffixArrays.LongestCommonPrefix; |
| | 5 | |
|
| | 6 | | /// <summary> |
| | 7 | | /// An <see cref="ILcpArrayBuilder"/> implementation which uses the Kasai's algorithm (2001) to compute the LCP Array |
| | 8 | | /// in linear time. |
| | 9 | | /// </summary> |
| | 10 | | /// <remarks> |
| | 11 | | /// <para id="advantages"> |
| | 12 | | /// ADVANTAGES AND DISADVANTAGES |
| | 13 | | /// <br/> |
| | 14 | | /// - Compared to the naive implementation following the definition of LCP Array, implemented by |
| | 15 | | /// <see cref="NaiveLcpArrayBuilder"/>, this algorithm has better runtime and equal Space Complexity. |
| | 16 | | /// <br/> |
| | 17 | | /// - The better runtime comes at the cost of the complexity of the algorithm itself, which is harder to analyze. |
| | 18 | | /// <br/> |
| | 19 | | /// - Moreover, unlike in the naive implementation, the construction of the array doesn't proceed in order, i.e. |
| | 20 | | /// it doesn't proceed from the first element to the last element of the array. |
| | 21 | | /// <br/> |
| | 22 | | /// - That is, the Kasai's algorithm is not online and, in its base form, cannot be easily made lazy. |
| | 23 | | /// </para> |
| | 24 | | /// <para id="algorithm"> |
| | 25 | | /// ALGORITHM |
| | 26 | | /// <br/> |
| | 27 | | /// - The algorithm requires 3 data structures to operate: the text T, its Suffix Array SA and the Inverted Suffix |
| | 28 | | /// Array ISA. The three data structures have n items (chars or integers). |
| | 29 | | /// <br/> |
| | 30 | | /// - T and SA are externally provided, whereas ISA is calculated scanning linearly SA and populating a dictionary |
| | 31 | | /// of positions of SA: <c>ISA[v] = i such that SA[i] = v</c>. Because there is no better general algorithm than |
| | 32 | | /// a single linear scan of the input, ISA is internally calculated, rather than externally provided. |
| | 33 | | /// <br/> |
| | 34 | | /// - The current LCP length, named here CLCP, is initialized to 0, and the index of the current suffix in T, named |
| | 35 | | /// here CS, is initialized to the first item of SA: <c>CLCP = 0; CS = SA[0]</c>. This is the setup before the |
| | 36 | | /// main loop. |
| | 37 | | /// <br/> |
| | 38 | | /// - Then n iterations are performed, filling in the resulting LPC array, named here LCPA, initialized to an empty |
| | 39 | | /// array of length n - 1. |
| | 40 | | /// <br/> |
| | 41 | | /// - At each iteration the index in SA of the current suffix (starting in T at index CS), is calculated via ISA: |
| | 42 | | /// <c>k = ISA[CS]</c>. |
| | 43 | | /// <br/> |
| | 44 | | /// - Then, the next suffix in T, named here NS, is calculated as the successor of k in SA: <c>NS = SA[k + 1]</c>. |
| | 45 | | /// <br/> |
| | 46 | | /// - Important: if <c>ISA[CS]</c> is the last index of SA (i.e. n - 1), then CLCP is reset and the current |
| | 47 | | /// iteration skipped. |
| | 48 | | /// <br/> |
| | 49 | | /// - A new value of CLCP is calculated as the longest common prefix between T[CS..] and T[NS..], skipping the |
| | 50 | | /// first CLCP - 1 chars, which are equal by construction. |
| | 51 | | /// <br/> |
| | 52 | | /// - Such new value of CLCP is then assigned to LCPA at index |
| | 53 | | /// <br/> |
| | 54 | | /// - Finally, the index of the current suffix in T, CS, is incremented modulo n and the current iteration |
| | 55 | | /// terminates. |
| | 56 | | /// <br/> |
| | 57 | | /// - After the loop, LCPA is fully populated and can be returned as result. |
| | 58 | | /// </para> |
| | 59 | | /// <para id="complexity"> |
| | 60 | | /// COMPLEXITY |
| | 61 | | /// <br/> |
| | 62 | | /// - Text and Suffix Array evaluation (required to have direct access by index) are O(n) operations, both in time |
| | 63 | | /// and space. |
| | 64 | | /// <br/> |
| | 65 | | /// - Inversion of the Suffix Array is also a O(n) operation. |
| | 66 | | /// <br/> |
| | 67 | | /// - Initialization of the current LCP and the current prefix are constant-time operations. |
| | 68 | | /// <br/> |
| | 69 | | /// - The main loop of the algorithm runs n iterations. |
| | 70 | | /// <br/> |
| | 71 | | /// - The only operation which doesn't take constant time in the body of the loop is the LCP between the current |
| | 72 | | /// suffix, initialized at the first item of the Suffix Array and then incremented by 1 modulo n at every |
| | 73 | | /// iteration, and the next suffix, calculated via Suffix Array and its inverted version. |
| | 74 | | /// <br/> |
| | 75 | | /// - While the above is true, it should be noted that the current LCP is increased once per match and decreased |
| | 76 | | /// once per iteration. Because the current LCP cannot be bigger than n (that's the biggest number of chars a |
| | 77 | | /// prefix can have, hence the bigger number of chars in common between two prefixes), the total number of |
| | 78 | | /// comparisons across all iterations is O(n). |
| | 79 | | /// <br/> |
| | 80 | | /// - Therefore, Time and Space Complexity are O(n). |
| | 81 | | /// </para> |
| | 82 | | /// </remarks> |
| | 83 | | public class KasaiLcpArrayBuilder : NaiveLcpArrayBuilder |
| | 84 | | { |
| | 85 | | /// <inheritdoc cref="KasaiLcpArrayBuilder"/> |
| | 86 | | /// <param name="text"> |
| | 87 | | /// <inheritdoc cref="NaiveLcpArrayBuilder.Text" path="/summary"/> |
| | 88 | | /// </param> |
| | 89 | | /// <param name="suffixArray"> |
| | 90 | | /// <inheritdoc cref="NaiveLcpArrayBuilder.SuffixArray" path="/summary"/> |
| | 91 | | /// </param> |
| 34 | 92 | | public KasaiLcpArrayBuilder(TextWithTerminator text, SuffixArray suffixArray) : base(text, suffixArray) |
| 34 | 93 | | { |
| 34 | 94 | | } |
| | 95 | |
|
| | 96 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 97 | | /// <remarks> |
| | 98 | | /// <inheritdoc cref="KasaiLcpArrayBuilder"/> |
| | 99 | | /// </remarks> |
| | 100 | | public override LcpArray Build() |
| 34 | 101 | | { |
| 34 | 102 | | var text = string.Concat(Text); |
| 34 | 103 | | var lcpArray = new int[Text.Length - 1]; |
| 34 | 104 | | var suffixArray = SuffixArray.Indexes.ToList(); |
| 426 | 105 | | var invertedSuffixArray = suffixArray.Index().ToDictionary(kvp => kvp.Value, kvp => kvp.Key); |
| | 106 | |
|
| 34 | 107 | | var currentLcp = 0; |
| 34 | 108 | | var currentSuffix = suffixArray[0]; |
| 460 | 109 | | for (var i = 0; i < text.Length; i++) |
| 196 | 110 | | { |
| 196 | 111 | | var indexOfCurrentSuffixInSuffixArray = invertedSuffixArray[currentSuffix]; |
| 196 | 112 | | if (indexOfCurrentSuffixInSuffixArray == text.Length - 1) |
| 34 | 113 | | { |
| 34 | 114 | | currentLcp = 0; |
| 34 | 115 | | } |
| | 116 | | else |
| 162 | 117 | | { |
| 162 | 118 | | var nextSuffix = suffixArray[indexOfCurrentSuffixInSuffixArray + 1]; |
| 162 | 119 | | var delta = Math.Max(currentLcp - 1, 0); |
| 162 | 120 | | currentLcp = StringUtilities.LongestCommonPrefix( |
| 162 | 121 | | text, currentSuffix + delta, text, nextSuffix + delta) + delta; |
| | 122 | |
|
| 162 | 123 | | lcpArray[indexOfCurrentSuffixInSuffixArray] = currentLcp; |
| 162 | 124 | | } |
| | 125 | |
|
| 196 | 126 | | currentSuffix = (currentSuffix + 1) % text.Length; |
| 196 | 127 | | } |
| | 128 | |
|
| 34 | 129 | | return new LcpArray(lcpArray); |
| 34 | 130 | | } |
| | 131 | | } |