| | 1 | | namespace MoreStructures.SuffixArrays.CyclicShifts; |
| | 2 | |
|
| | 3 | | /// <summary> |
| | 4 | | /// An implementation of <see cref="IDoubleLengthPcsSorter"/> using Counting Sort, to perform sorting of partial cyclic |
| | 5 | | /// shifts (PCS) of length 2 * L of a string, given the order and the equivalence classes of the PCS of length L. |
| | 6 | | /// </summary> |
| | 7 | | /// <remarks> |
| | 8 | | /// <para id="advantages"> |
| | 9 | | /// ADVANTAGES AND DISADVANTAGES |
| | 10 | | /// <br/> |
| | 11 | | /// - Compared to <see cref="NaiveDoubleLengthPcsSorter"/>, this implementation has way better runtime (linear |
| | 12 | | /// instead of cubic). |
| | 13 | | /// <br/> |
| | 14 | | /// - Unlike <see cref="NaiveDoubleLengthPcsSorter"/>, this implementation requires specific data structures |
| | 15 | | /// to be provided, in alternative to the input, to calculate the position list. |
| | 16 | | /// <br/> |
| | 17 | | /// - However, unlike <see cref="NaiveDoubleLengthPcsSorter"/>, it does not require to know whether |
| | 18 | | /// input contains a terminator or not. |
| | 19 | | /// <br/> |
| | 20 | | /// - This is because such piece of information would only be needed when running comparisons between PCS. |
| | 21 | | /// <br/> |
| | 22 | | /// - This sorter, on the other hand, uses externally provided lists precisely in order to avoid the |
| | 23 | | /// need for costly PCS comparisons, which are "embedded" in the externally provided data structures. |
| | 24 | | /// </para> |
| | 25 | | /// <para id="algo"> |
| | 26 | | /// ALGORITHM |
| | 27 | | /// <br/> |
| | 28 | | /// - The algorithm takes advantage of the fact that PCS of length L have been already ordered, and their order |
| | 29 | | /// from the smallest to the biggest, is defined in the provided position list O, of the PCS of the input string |
| | 30 | | /// of length L. |
| | 31 | | /// <br/> |
| | 32 | | /// - The PCS of length 2 * L which the algorithm sorts are the ones starting at index i - L, for each i in O. |
| | 33 | | /// Let's call O' this extension of O to the the L chars preceding each index of O. |
| | 34 | | /// <br/> |
| | 35 | | /// - PCS of length 2 * L starting at index i - L are made of two halves: the first, from index i - L to index i |
| | 36 | | /// excluded, which is unsorted; and the second, from index i to i + L excluded, which is already sorted and |
| | 37 | | /// whose order is the one defined in O. |
| | 38 | | /// <br/> |
| | 39 | | /// - The algorithm uses a stable implementation of the Counting Sort to sort in linear time O', under Counting |
| | 40 | | /// Sort assumption of a small alphabet. |
| | 41 | | /// <br/> |
| | 42 | | /// - Moreover, the algorithm only looks at the first half of each PCS of length 2 * L, because the second half is |
| | 43 | | /// already sorted in O, hence in O', and the implementation of the sorting is stable, i.e. it preserves order of |
| | 44 | | /// suffixes, as they appear in the input being sorted, to the output, among the strings sharing the same prefix. |
| | 45 | | /// <br/> |
| | 46 | | /// - Counting Sort is adapted in the following way: instead of directly building an histogram of occurrences of |
| | 47 | | /// the first halves (PCS of length L), it builds an histogram of their equivalence classes, which are integer |
| | 48 | | /// simple to use as a key in Counting Sort and limited in number by the number of chars in the input string. |
| | 49 | | /// <br/> |
| | 50 | | /// - The order list of the PCS of length 2 * L is calculated by iterating in reverse order over the equivalence |
| | 51 | | /// classes of the second halves, which are sorted in lexicographic ascending order in the provided position |
| | 52 | | /// list. |
| | 53 | | /// </para> |
| | 54 | | /// <para id="complexity"> |
| | 55 | | /// COMPLEXITY |
| | 56 | | /// <br/> |
| | 57 | | /// - The algorithm is just a variation of the Counting Sort, where, instead of sorting by PCS (which would require |
| | 58 | | /// comparing O(n) strings, or having a much larger alphabet), a sorting by equivalence classes is done. |
| | 59 | | /// <br/> |
| | 60 | | /// - Direct access to the position list, as well as to the list of equivalence class is done in constant time. |
| | 61 | | /// <br/> |
| | 62 | | /// - Therefore, Time and Space Complexity are both O(n + sigma), where n is the length of the input string, and |
| | 63 | | /// also of the position and equivalence classes lists and sigma is the length of the alphabet, i.e. the number |
| | 64 | | /// of distinct equivalence classes of PCS of length L. |
| | 65 | | /// <br/> |
| | 66 | | /// - Because there are n PCS of length L in the input string, there are at most n distinct equivalent classes. |
| | 67 | | /// <br/> |
| | 68 | | /// - Therefore, Time and Space Complexity are actually O(n). |
| | 69 | | /// </para> |
| | 70 | | /// </remarks> |
| | 71 | | public class CountingSortDoubleLengthPcsSorter : IDoubleLengthPcsSorter |
| | 72 | | { |
| | 73 | | /// <inheritdoc/> |
| 369 | 74 | | public int PcsLength { get; } |
| | 75 | |
|
| | 76 | | /// <summary> |
| | 77 | | /// The position list of the already sorted PCS of length <see name="PcsLength"/>. |
| | 78 | | /// </summary> |
| 421 | 79 | | public IList<int> Order { get; } |
| | 80 | |
|
| | 81 | | /// <summary> |
| | 82 | | /// The equivalence classes of the already sorted PCS of length <see name="PcsLength"/>. |
| | 83 | | /// </summary> |
| 842 | 84 | | public IList<int> EqClasses { get; } |
| | 85 | |
|
| | 86 | | /// <summary> |
| | 87 | | /// <inheritdoc cref="CountingSortDoubleLengthPcsSorter"/> |
| | 88 | | /// </summary> |
| | 89 | | /// <param name="pcsLength"><inheritdoc cref="PcsLength" path="/summary"/></param> |
| | 90 | | /// <param name="order"><inheritdoc cref="Order" path="/summary"/></param> |
| | 91 | | /// <param name="eqClasses"><inheritdoc cref="EqClasses" path="/summary"/></param> |
| 56 | 92 | | public CountingSortDoubleLengthPcsSorter(int pcsLength, IList<int> order, IList<int> eqClasses) |
| 56 | 93 | | { |
| 56 | 94 | | if (pcsLength <= 0) |
| 2 | 95 | | throw new ArgumentOutOfRangeException(nameof(pcsLength), $"Must be positive."); |
| | 96 | |
|
| 54 | 97 | | if (order.Count != eqClasses.Count) |
| 2 | 98 | | throw new ArgumentException($"Must have the same number of items as {nameof(eqClasses)}.", nameof(order)); |
| | 99 | |
|
| 52 | 100 | | PcsLength = pcsLength; |
| 52 | 101 | | Order = order; |
| 52 | 102 | | EqClasses = eqClasses; |
| 52 | 103 | | } |
| | 104 | |
|
| | 105 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 106 | | /// <remarks> |
| | 107 | | /// <inheritdoc cref="CountingSortDoubleLengthPcsSorter" path="/remarks"/> |
| | 108 | | /// </remarks> |
| | 109 | | public IList<int> Sort() |
| 52 | 110 | | { |
| 52 | 111 | | var n = EqClasses.Count; |
| 52 | 112 | | var sigma = EqClasses[Order[^1]] + 1; |
| | 113 | |
|
| | 114 | | // Build histogram |
| 52 | 115 | | var counts = new int[sigma]; |
| | 116 | |
|
| 842 | 117 | | for (var i = 0; i < n; i++) |
| 369 | 118 | | counts[EqClasses[i]]++; |
| | 119 | |
|
| | 120 | | // Make histogram cumulative |
| 592 | 121 | | for (var i = 1; i < sigma; i++) |
| 244 | 122 | | counts[i] += counts[i - 1]; |
| | 123 | |
|
| | 124 | | // Calculate order list of double PCS |
| 52 | 125 | | var doublePcsOrder = new int[n]; |
| 842 | 126 | | for (var i = n - 1; i >= 0; i--) |
| 369 | 127 | | { |
| 369 | 128 | | var inputIndex = (n + Order[i] - PcsLength) % n; |
| 369 | 129 | | var alphabetIndex = EqClasses[inputIndex]; |
| 369 | 130 | | var position = --counts[alphabetIndex]; |
| 369 | 131 | | doublePcsOrder[position] = inputIndex; |
| 369 | 132 | | } |
| | 133 | |
|
| 52 | 134 | | return doublePcsOrder; |
| 52 | 135 | | } |
| | 136 | | } |