| | 1 | | using MoreStructures.Strings.Sorting; |
| | 2 | |
|
| | 3 | | namespace MoreStructures.SuffixArrays.CyclicShifts; |
| | 4 | |
|
| | 5 | | /// <summary> |
| | 6 | | /// A <see cref="ISingleCharPcsClassifier"/> implementation which uses an externally provided position list |
| | 7 | | /// <see cref="Order"/> of the 1-char PCS of the <see cref="Input"/>, to calculate equivalence classes. |
| | 8 | | /// </summary> |
| | 9 | | /// <remarks> |
| | 10 | | /// <inheritdoc cref="ISingleCharPcsClassifier"/> |
| | 11 | | /// <para id="advantages"> |
| | 12 | | /// ADVANTAGES |
| | 13 | | /// <br/> |
| | 14 | | /// - Compared to the naive implementation of <see cref="NaiveSingleCharPcsClassifier"/>, it has better runtime and |
| | 15 | | /// only allocates an array of n elements, where n is the length of the <see cref="Input"/>. |
| | 16 | | /// <br/> |
| | 17 | | /// - However, it requires the position list of the <see cref="Input"/> to be provided to the classifier at |
| | 18 | | /// construction time. |
| | 19 | | /// <br/> |
| | 20 | | /// - Calculating the position list is a linear time operation only when some assumptions can be made on the input, |
| | 21 | | /// such as an alphabet of limited size (which is the main scenario for <see cref="CountingSortCharsSorter"/>). |
| | 22 | | /// In all other cases, the runtime has a worst case of O(n * log(n)). |
| | 23 | | /// </para> |
| | 24 | | /// <para id="algorithm"> |
| | 25 | | /// ALGORITHM |
| | 26 | | /// <br/> |
| | 27 | | /// - An array of n elements EC, to accomodate the equivalence classes of the result, is allocated upfront. |
| | 28 | | /// <br/> |
| | 29 | | /// - The item of EC with index <c>P[0]</c>, where P is the list of positions, is set to 0. This is because |
| | 30 | | /// <c>P[0]</c> is the index in the <see cref="Input"/> T, of the smallest char in T. Therefore, there are no |
| | 31 | | /// smaller chars in T and the equivalence class of <c>T[P[0]]</c> is hence the smallest, i.e. 0. |
| | 32 | | /// <br/> |
| | 33 | | /// - <c>EC[O[i + 1]]</c> can be calculated from <c>EC[O[i]]</c>: two scenarios are possible. |
| | 34 | | /// <br/> |
| | 35 | | /// - If <c>T[O[i + 1]] == T[O[i]]</c>, it means that the two chars <c>T[O[i + 1]]</c> and <c>T[O[i]]</c>, which |
| | 36 | | /// come one after the other one in the position list, are the same and should have the same equivalence class. |
| | 37 | | /// Therefore, the equivalence class of <c>T[O[i + 1]]</c>, <c>EC[O[i + 1]]</c> can be set to the equivalence |
| | 38 | | /// class of <c>T[O[i]]</c>, <c>EQ[O[i]]</c>. |
| | 39 | | /// <br/> |
| | 40 | | /// - If, on the other hand, <c>T[O[i + 1]] != T[O[i]]</c>, it means that the two chars <c>T[O[i + 1]]</c> and |
| | 41 | | /// <c>T[O[i]]</c>, which come one after the other one in the position list, are different and should have |
| | 42 | | /// different equivalence classes. Therefore, the equivalence class of <c>T[O[i + 1]]</c>, <c>EC[O[i + 1]]</c> |
| | 43 | | /// can be set to the successor of the equivalence class of <c>T[O[i]]</c>, <c>EQ[O[i]]</c>. |
| | 44 | | /// </para> |
| | 45 | | /// <para id="complexity"> |
| | 46 | | /// COMPLEXITY |
| | 47 | | /// <br/> |
| | 48 | | /// - There are as many iterations as items of the position list. |
| | 49 | | /// <br/> |
| | 50 | | /// - Within each iteration, direct accesses to items of <see cref="Input"/> and <see cref="Order"/> by index are |
| | 51 | | /// done in constant time. |
| | 52 | | /// <br/> |
| | 53 | | /// - Therefore, Time and Space Complexity are O(n), excluding the cost of calculating the position list, which is |
| | 54 | | /// externally provided. Check <see cref="ICharsSorter"/> implementations for the complexity of the algorithms |
| | 55 | | /// calculating the position list of a string. |
| | 56 | | /// </para> |
| | 57 | | /// </remarks> |
| | 58 | | public class OrderBasedSingleCharPcsClassifier : ISingleCharPcsClassifier |
| | 59 | | { |
| | 60 | | /// <inheritdoc/> |
| 233 | 61 | | public string Input { get; } |
| | 62 | |
|
| | 63 | | /// <summary> |
| | 64 | | /// The position list of the 1-char PCS of the <see cref="Input"/>. |
| | 65 | | /// </summary> |
| | 66 | | /// <remarks> |
| | 67 | | /// Has as many items as chars in the <see cref="Input"/>. |
| | 68 | | /// <br/> |
| | 69 | | /// It is specifically required by this implementation of the algorithm and has to be externally provided. |
| | 70 | | /// <br/> |
| | 71 | | /// It can be calculated with any implementation of <see cref="ICharsSorter.Sort(string)"/>. |
| | 72 | | /// </remarks> |
| 488 | 73 | | public IList<int> Order { get; } |
| | 74 | |
|
| | 75 | | /// <summary> |
| | 76 | | /// <inheritdoc cref="OrderBasedSingleCharPcsClassifier"/> |
| | 77 | | /// </summary> |
| | 78 | | /// <param name="input"><inheritdoc cref="Input" path="/summary"/></param> |
| | 79 | | /// <param name="order"><inheritdoc cref="Order" path="/summary"/></param> |
| 28 | 80 | | public OrderBasedSingleCharPcsClassifier(string input, IList<int> order) |
| 28 | 81 | | { |
| 28 | 82 | | if (input.Length != order.Count) |
| 3 | 83 | | throw new ArgumentException($"Must have as many items as chars in {nameof(input)}.", nameof(order)); |
| | 84 | |
|
| 25 | 85 | | Input = input; |
| 25 | 86 | | Order = order; |
| 25 | 87 | | } |
| | 88 | |
|
| | 89 | | /// <inheritdoc cref="ISingleCharPcsClassifier" path="//*[not(self::summary or self::remarks)]"/> |
| | 90 | | /// <summary> |
| | 91 | | /// <inheritdoc/> |
| | 92 | | /// </summary> |
| | 93 | | /// <remarks> |
| | 94 | | /// <inheritdoc cref="OrderBasedSingleCharPcsClassifier"/> |
| | 95 | | /// </remarks> |
| | 96 | | public IList<int> Classify() |
| 25 | 97 | | { |
| 25 | 98 | | if (Input.Length == 0) |
| 1 | 99 | | return Array.Empty<int>(); |
| | 100 | |
|
| 24 | 101 | | var eqClasses = new int[Order.Count]; |
| 24 | 102 | | eqClasses[Order[0]] = 0; |
| 24 | 103 | | var currentEqClass = 0; |
| 256 | 104 | | for (int i = 1; i < Order.Count; i++) |
| 104 | 105 | | eqClasses[Order[i]] = Input[Order[i]] == Input[Order[i - 1]] ? currentEqClass : ++currentEqClass; |
| 24 | 106 | | return eqClasses; |
| 25 | 107 | | } |
| | 108 | | } |