| | 1 | | using MoreStructures.Strings.Sorting; |
| | 2 | | using MoreStructures.SuffixArrays.CyclicShifts; |
| | 3 | |
|
| | 4 | | namespace MoreStructures.SuffixArrays.Builders; |
| | 5 | |
|
| | 6 | | /// <summary> |
| | 7 | | /// An algorithm for building the <see cref="SuffixArray"/> based on fast PCS comparison. |
| | 8 | | /// </summary> |
| | 9 | | /// <remarks> |
| | 10 | | /// <para id="assumptions"> |
| | 11 | | /// <b>Remark: the following analysis is based on the default implementations used for each of the four steps of |
| | 12 | | /// the Suffix Array building algorithm, as set by |
| | 13 | | /// <see cref="PcsBasedSuffixArrayBuilder(TextWithTerminator, IDictionary{char, int})"/>. Using a builder different |
| | 14 | | /// from the default for any of the step would result into a different runtime and complexity.</b> |
| | 15 | | /// </para> |
| | 16 | | /// <para id="algo"> |
| | 17 | | /// ALGORITHM |
| | 18 | | /// <br/> |
| | 19 | | /// The algorithm is based on two lists, position list and equivalence class list, which are iteratively |
| | 20 | | /// recalculated, over and over again, for longer and longer PCS of <see cref="Text"/>, until the full content of |
| | 21 | | /// <see cref="Text"/> is covered by the PCS. |
| | 22 | | /// <br/> |
| | 23 | | /// More in detail: |
| | 24 | | /// <br/> |
| | 25 | | /// - Position and equivalence class lists are first calculated for 1-char PCS, i.e. for single chars, using |
| | 26 | | /// <see cref="CountingSortCharsSorter"/> and <see cref="OrderBasedSingleCharPcsClassifier"/> respectively. |
| | 27 | | /// <br/> |
| | 28 | | /// - Position and equivalence class lists are then calculated for PCS of length double than the current one, using |
| | 29 | | /// <see cref="CountingSortDoubleLengthPcsSorter"/> and <see cref="EqClassesBasedDoubleLengthPcsClassifier"/> |
| | 30 | | /// respectively. |
| | 31 | | /// <br/> |
| | 32 | | /// - The operation is repeated until the current PCS length becomes bigger or equal than the length of |
| | 33 | | /// <see cref="Text"/>. |
| | 34 | | /// <br/> |
| | 35 | | /// - At this point, the last calculated order is the order of PCS which are at least as long as the full |
| | 36 | | /// <see cref="Text"/>. |
| | 37 | | /// <br/> |
| | 38 | | /// - Because all such strings include a full suffixes of <see cref="Text"/> terminated by a terminator, which is a |
| | 39 | | /// unique char, the resulting order is the order of all suffixes of <see cref="Text"/>, which is by definition |
| | 40 | | /// the Suffix Array of <see cref="Text"/>. |
| | 41 | | /// </para> |
| | 42 | | /// <para id="complexity"> |
| | 43 | | /// COMPLEXITY |
| | 44 | | /// <br/> |
| | 45 | | /// - The algorithm has a bootstrap part (1-char PCS) and an iterative part (doubling length PCS). |
| | 46 | | /// <br/> |
| | 47 | | /// - It first calculates position and equivalence lists for 1-char PCS, which are respectively |
| | 48 | | /// O(n + sigma) and O(n) operations, both in Time and Space Complexity, where n is the length of |
| | 49 | | /// <see cref="Text"/> and sigma is the size of the alphabet of <see cref="Text"/>. |
| | 50 | | /// <br/> |
| | 51 | | /// - Then the iterative part is executed. The number of top-level iterations is logarithmic with n, because the |
| | 52 | | /// PCS length doubles at every iteration and the termination condition is that the PCS length is at least as big |
| | 53 | | /// as the length of <see cref="Text"/>. |
| | 54 | | /// <br/> |
| | 55 | | /// - The two operations done in the iteration loop both have linear Time and Space Complexity. |
| | 56 | | /// <br/> |
| | 57 | | /// - Therefore, both Time and Space Complexity are O(n * log(n) + sigma). |
| | 58 | | /// <br/> |
| | 59 | | /// - If sigma is O(n), Time and Space Complexity are O(n * log(n)). |
| | 60 | | /// </para> |
| | 61 | | /// </remarks> |
| | 62 | | public class PcsBasedSuffixArrayBuilder : ISuffixArrayBuilder |
| | 63 | | { |
| | 64 | | /// <summary> |
| | 65 | | /// The <see cref="TextWithTerminator"/>, to build the <see cref="SuffixArray"/> of. |
| | 66 | | /// </summary> |
| 14 | 67 | | public TextWithTerminator Text { get; } |
| | 68 | |
|
| | 69 | | /// <summary> |
| | 70 | | /// The builder of the <see cref="ICharsSorter"/> implementation used to sort 1-char PCS in the Suffix Array |
| | 71 | | /// building algorithm. |
| | 72 | | /// </summary> |
| | 73 | | /// <remarks> |
| | 74 | | /// Takes as input a single <see cref="string"/> parameter, containing the input text, whose chars have to be |
| | 75 | | /// sorted. |
| | 76 | | /// <br/> |
| | 77 | | /// Returns a suitable <see cref="ICharsSorter"/> implementation. |
| | 78 | | /// </remarks> |
| 14 | 79 | | public Func<string, ICharsSorter> SingleCharPcsSorterBuilder { get; } |
| | 80 | |
|
| | 81 | | /// <summary> |
| | 82 | | /// The builder of the <see cref="ISingleCharPcsClassifier"/> implementation used to find equivalence classes of |
| | 83 | | /// 1-char PCS in the Suffix Array building algorithm. |
| | 84 | | /// </summary> |
| | 85 | | /// <remarks> |
| | 86 | | /// Takes as first input parameter a <see cref="string"/>, containing the input text, whose chars have to be |
| | 87 | | /// classified. |
| | 88 | | /// <br/> |
| | 89 | | /// Takes as second input parameter a <see cref="IList{T}"/> of <see cref="int"/>, containing the order of the |
| | 90 | | /// 1-char PCS, previously calculated. |
| | 91 | | /// <br/> |
| | 92 | | /// Returns a suitable <see cref="ISingleCharPcsClassifier"/> implementation. |
| | 93 | | /// </remarks> |
| 14 | 94 | | public Func<string, IList<int>, ISingleCharPcsClassifier> SingleCharPcsClassifierBuilder { get; } |
| | 95 | |
|
| | 96 | | /// <summary> |
| | 97 | | /// The builder of the <see cref="IDoubleLengthPcsSorter"/> implementation used in the Suffix Array building |
| | 98 | | /// algorithm, to sort PCS of length 2 * L, once PCS of length L have been sorted and classified. |
| | 99 | | /// </summary> |
| | 100 | | /// <remarks> |
| | 101 | | /// Takes as first input parameter a <see cref="int"/>, containing the length L of the PCS. |
| | 102 | | /// <br/> |
| | 103 | | /// Takes as second input parameter a <see cref="IList{T}"/> of <see cref="int"/>, containing the order of the |
| | 104 | | /// PCS of length L, previously calculated and potentially needed to calculate the order of PCS of length 2 * L. |
| | 105 | | /// <br/> |
| | 106 | | /// Takes as third input parameter a <see cref="IList{T}"/> of <see cref="int"/>, containing the equivalence |
| | 107 | | /// classes of the PCS of length L, previously calculated and potentially needed to calculate the order of PCS of |
| | 108 | | /// length 2 * L. |
| | 109 | | /// <br/> |
| | 110 | | /// Returns a suitable <see cref="IDoubleLengthPcsSorter"/> implementation. |
| | 111 | | /// </remarks> |
| 36 | 112 | | public Func<int, IList<int>, IList<int>, IDoubleLengthPcsSorter> DoubleLengthPcsSorterBuilder { get; } |
| | 113 | |
|
| | 114 | | /// <summary> |
| | 115 | | /// The builder of the <see cref="IDoubleLengthPcsClassifier"/> implementation used in the Suffix Array building |
| | 116 | | /// algorithm, to classify PCS of length 2 * L, once PCS of length L have been sorted and classified and PCS of |
| | 117 | | /// length 2 * L have been sorted. |
| | 118 | | /// </summary> |
| | 119 | | /// <remarks> |
| | 120 | | /// Takes as first input parameter a <see cref="int"/>, containing the length L of the PCS. |
| | 121 | | /// <br/> |
| | 122 | | /// Takes as second input parameter a <see cref="IList{T}"/> of <see cref="int"/>, containing the equivalence |
| | 123 | | /// classes of the PCS of length L, previously calculated and potentially needed to calculate the equivalence |
| | 124 | | /// classes of PCS of length 2 * L. |
| | 125 | | /// <br/> |
| | 126 | | /// Takes as third input parameter a <see cref="IList{T}"/> of <see cref="int"/>, containing the order of the |
| | 127 | | /// PCS of length L, previously calculated and potentially needed to calculate the equivalence classes of PCS of |
| | 128 | | /// length 2 * L. |
| | 129 | | /// <br/> |
| | 130 | | /// Returns a suitable <see cref="IDoubleLengthPcsClassifier"/> implementation. |
| | 131 | | /// </remarks> |
| 36 | 132 | | public Func<int, IList<int>, IList<int>, IDoubleLengthPcsClassifier> DoubleLengthPcsClassifierBuilder { get; } |
| | 133 | |
|
| | 134 | | /// <summary> |
| | 135 | | /// <inheritdoc cref="NaiveSuffixArrayBuilder" path="/summary"/> |
| | 136 | | /// </summary> |
| | 137 | | /// <remarks> |
| | 138 | | /// Allows to specify the algorithm to be used for each of the four steps of the Suffix Array building |
| | 139 | | /// algorithm, each one via a dedicated builder. |
| | 140 | | /// </remarks> |
| | 141 | | /// <param name="text"> |
| | 142 | | /// <inheritdoc cref="Text" path="/summary"/> |
| | 143 | | /// </param> |
| | 144 | | /// <param name="singleCharPcsSorterBuilder"> |
| | 145 | | /// <inheritdoc cref="SingleCharPcsSorterBuilder" path="/summary"/> |
| | 146 | | /// </param> |
| | 147 | | /// <param name="singleCharPcsClassifierBuilder"> |
| | 148 | | /// <inheritdoc cref="SingleCharPcsClassifierBuilder" path="/summary"/> |
| | 149 | | /// </param> |
| | 150 | | /// <param name="doubleLengthPcsSorterBuilder"> |
| | 151 | | /// <inheritdoc cref="DoubleLengthPcsSorterBuilder" path="/summary"/> |
| | 152 | | /// </param> |
| | 153 | | /// <param name="doubleLengthPcsClassifierBuilder"> |
| | 154 | | /// <inheritdoc cref="DoubleLengthPcsClassifierBuilder" path="/summary"/> |
| | 155 | | /// </param> |
| 7 | 156 | | public PcsBasedSuffixArrayBuilder( |
| 7 | 157 | | TextWithTerminator text, |
| 7 | 158 | | Func<string, ICharsSorter> singleCharPcsSorterBuilder, |
| 7 | 159 | | Func<string, IList<int>, ISingleCharPcsClassifier> singleCharPcsClassifierBuilder, |
| 7 | 160 | | Func<int, IList<int>, IList<int>, IDoubleLengthPcsSorter> doubleLengthPcsSorterBuilder, |
| 7 | 161 | | Func<int, IList<int>, IList<int>, IDoubleLengthPcsClassifier> doubleLengthPcsClassifierBuilder) |
| 7 | 162 | | { |
| 7 | 163 | | Text = text; |
| 7 | 164 | | SingleCharPcsSorterBuilder = singleCharPcsSorterBuilder; |
| 7 | 165 | | SingleCharPcsClassifierBuilder = singleCharPcsClassifierBuilder; |
| 7 | 166 | | DoubleLengthPcsSorterBuilder = doubleLengthPcsSorterBuilder; |
| 7 | 167 | | DoubleLengthPcsClassifierBuilder = doubleLengthPcsClassifierBuilder; |
| 7 | 168 | | } |
| | 169 | |
|
| | 170 | | /// <summary> |
| | 171 | | /// <inheritdoc cref="NaiveSuffixArrayBuilder" path="/summary"/> |
| | 172 | | /// </summary> |
| | 173 | | /// <remarks> |
| | 174 | | /// Uses the best implementations for each of the four steps of the Suffix Array building algorithm, resulting |
| | 175 | | /// in an overall linear Time and Space Complexity. |
| | 176 | | /// </remarks> |
| | 177 | | /// <param name="text"> |
| | 178 | | /// <inheritdoc cref="Text" path="/summary"/> |
| | 179 | | /// </param> |
| | 180 | | /// <param name="alphabet"> |
| | 181 | | /// The alphabet of <see cref="Text"/>, i.e. the list of all <see cref="char"/> potentially appearing in |
| | 182 | | /// <see cref="Text"/>, mapped to an alphabet index. Required by the Counting Sort algorithm, which builds the |
| | 183 | | /// histogram of occurrences in the input, of all chars of the alphabet of the input. |
| | 184 | | /// </param> |
| 7 | 185 | | public PcsBasedSuffixArrayBuilder(TextWithTerminator text, IDictionary<char, int> alphabet) |
| 7 | 186 | | { |
| 7 | 187 | | Text = text; |
| 7 | 188 | | SingleCharPcsSorterBuilder = |
| 7 | 189 | | input => |
| 14 | 190 | | new CountingSortCharsSorter(alphabet); |
| 7 | 191 | | SingleCharPcsClassifierBuilder = |
| 7 | 192 | | (input, order) => |
| 14 | 193 | | new OrderBasedSingleCharPcsClassifier(input, order); |
| 7 | 194 | | DoubleLengthPcsSorterBuilder = |
| 7 | 195 | | (pcsLength, order, eqClasses) => |
| 25 | 196 | | new CountingSortDoubleLengthPcsSorter(pcsLength, order, eqClasses); |
| 7 | 197 | | DoubleLengthPcsClassifierBuilder = |
| 7 | 198 | | (pcsLength, eqClassesPcsHalfLength, order) => |
| 25 | 199 | | new EqClassesBasedDoubleLengthPcsClassifier(pcsLength, eqClassesPcsHalfLength, order); |
| 7 | 200 | | } |
| | 201 | |
|
| | 202 | | /// <summary> |
| | 203 | | /// <inheritdoc cref="PcsBasedSuffixArrayBuilder" path="/summary"/> |
| | 204 | | /// </summary> |
| | 205 | | public SuffixArray Build() |
| 14 | 206 | | { |
| 14 | 207 | | var input = string.Concat(Text); |
| | 208 | |
|
| 14 | 209 | | var singleCharPcsSorter = SingleCharPcsSorterBuilder(input); |
| 14 | 210 | | var order = singleCharPcsSorter.Sort(input); |
| | 211 | |
|
| 14 | 212 | | var singleCharEqClassClassifier = SingleCharPcsClassifierBuilder(input, order); |
| 14 | 213 | | var eqClasses = singleCharEqClassClassifier.Classify(); |
| | 214 | |
|
| 14 | 215 | | var pcsLength = 1; |
| 50 | 216 | | while (pcsLength < input.Length) |
| 36 | 217 | | { |
| 36 | 218 | | pcsLength *= 2; |
| | 219 | |
|
| 36 | 220 | | var doubleLengthPcsSorter = DoubleLengthPcsSorterBuilder(pcsLength / 2, order, eqClasses); |
| 36 | 221 | | order = doubleLengthPcsSorter.Sort(); |
| 36 | 222 | | var doubleLengthPcsClassifier = DoubleLengthPcsClassifierBuilder(pcsLength, eqClasses, order); |
| 36 | 223 | | eqClasses = doubleLengthPcsClassifier.Classify(); |
| 36 | 224 | | } |
| | 225 | |
|
| 14 | 226 | | return new SuffixArray(order); |
| 14 | 227 | | } |
| | 228 | | } |