| | 1 | | using MoreStructures.Utilities; |
| | 2 | |
|
| | 3 | | namespace MoreStructures.SuffixArrays.CyclicShifts; |
| | 4 | |
|
| | 5 | | /// <summary> |
| | 6 | | /// An implementation of <see cref="IDoubleLengthPcsClassifier"/> which uses the externally provided list of |
| | 7 | | /// equivalence classes <see cref="EqClassesPcsHalfLength"/>, of the PCS of length <see cref="PcsLength"/> / 2, as well |
| | 8 | | /// as the position list of the PCS of length <see cref="PcsLength"/>, and doesn't require the input string, to |
| | 9 | | /// generate equivalence classes of the PCS of length <see cref="PcsLength"/>. |
| | 10 | | /// </summary> |
| | 11 | | /// <remarks> |
| | 12 | | /// <para id="advantages"> |
| | 13 | | /// ADVANTAGES AND DISADVANTAGES |
| | 14 | | /// <br/> |
| | 15 | | /// - Compared to <see cref="NaiveDoubleLengthPcsClassifier"/>, it has way better runtime (linear time, instead of |
| | 16 | | /// cubic). |
| | 17 | | /// <br/> |
| | 18 | | /// - Compared <see cref="OrderBasedDoubleLengthPcsClassifier"/>, it still has better runtime (linear time, instead |
| | 19 | | /// of quadratic). |
| | 20 | | /// <br/> |
| | 21 | | /// - However, it requires both the position list <see cref="Order"/> and the equivalence class list |
| | 22 | | /// <see cref="EqClassesPcsHalfLength"/>, to be externally provided. |
| | 23 | | /// <br/> |
| | 24 | | /// - Compared to other implementations, <b>this algorithm requires <see cref="PcsLength"/> to be even.</b> |
| | 25 | | /// <br/> |
| | 26 | | /// - Unlike <see cref="NaiveDoubleLengthPcsClassifier"/>, this implementation requires specific data structures |
| | 27 | | /// to be provided, in alternative to the input, to calculate equivalence classes. |
| | 28 | | /// <br/> |
| | 29 | | /// - However, unlike <see cref="NaiveDoubleLengthPcsClassifier"/>, it does not require to know whether |
| | 30 | | /// input contains a terminator or not. |
| | 31 | | /// <br/> |
| | 32 | | /// - This is because such piece of information would only be needed when running comparisons between PCS. |
| | 33 | | /// <br/> |
| | 34 | | /// - This classifier, on the other hand, uses an externally provided lists precisely in order to avoid the |
| | 35 | | /// need for costly PCS comparisons, which are "embedded" in the externally provided data structures. |
| | 36 | | /// </para> |
| | 37 | | /// <para id="algo"> |
| | 38 | | /// ALGORITHM |
| | 39 | | /// <br/> |
| | 40 | | /// - The externally provided ordered sequence of PCS <see cref="Order"/> is iterated over. |
| | 41 | | /// <br/> |
| | 42 | | /// - A new equivalence class is generated (increasing the current counter) every time a PCS differs from the |
| | 43 | | /// previous one. The equivalence class is assigned to the output at the index of the PCS being checked. |
| | 44 | | /// <br/> |
| | 45 | | /// - The comparison between a PCS and the previous one (both of length <see cref="PcsLength"/>), is not done by |
| | 46 | | /// comparing chars, rather by comparing equivalence classes of the PCS of length <see cref="PcsLength"/> / 2, |
| | 47 | | /// defined in the externally provided <see cref="EqClassesPcsHalfLength"/>. |
| | 48 | | /// <br/> |
| | 49 | | /// - More in detail: to compare the PCS of even length L = <see cref="PcsLength"/> at index j1 = |
| | 50 | | /// <see cref="Order"/>[i] with the one at index j2 = <see cref="Order"/>[i - 1], the following two comparisons |
| | 51 | | /// are done: <c><see cref="EqClassesPcsHalfLength"/>[j1] == <see cref="EqClassesPcsHalfLength"/>[j2]</c> and |
| | 52 | | /// <c><see cref="EqClassesPcsHalfLength"/>[j1 + L / 2] == <see cref="EqClassesPcsHalfLength"/>[j2 + L / 2]</c>. |
| | 53 | | /// <br/> |
| | 54 | | /// - Finally, the equivalence class list is returned. |
| | 55 | | /// </para> |
| | 56 | | /// <para id="complexity"> |
| | 57 | | /// COMPLEXITY |
| | 58 | | /// <br/> |
| | 59 | | /// - Iterating over the n PCS of length L is an O(n) operation. Each iteration does O(1) work, since |
| | 60 | | /// direct access to the equivalence class list of PCS of half length is done in constant time and comparison |
| | 61 | | /// between the current PCS and the previous one is a comparison of the equivalence classes of the first and |
| | 62 | | /// second halves of both PCS, and requires two pairs of integers to be compared. |
| | 63 | | /// <br/> |
| | 64 | | /// - Instantiating the equivalence class list of output is also an O(n) operation. |
| | 65 | | /// <br/> |
| | 66 | | /// - Therefore, Time and Space Complexity are O(n). |
| | 67 | | /// </para> |
| | 68 | | /// </remarks> |
| | 69 | | public class EqClassesBasedDoubleLengthPcsClassifier : IDoubleLengthPcsClassifier |
| | 70 | | { |
| | 71 | | /// <summary> |
| | 72 | | /// The length the PCS of input string to be classified. |
| | 73 | | /// </summary> |
| 410 | 74 | | public int PcsLength { get; } |
| | 75 | |
|
| | 76 | | /// <summary> |
| | 77 | | /// The list of equivalence classes of the PCS of length <see cref="PcsLength"/> / 2, to be used to calculate the |
| | 78 | | /// equivalence classes of the PCS of double the length (<see cref="PcsLength"/>). |
| | 79 | | /// </summary> |
| 953 | 80 | | public IList<int> EqClassesPcsHalfLength { get; } |
| | 81 | |
|
| | 82 | | /// <summary> |
| | 83 | | /// The position list of the PCS of length <see cref="PcsLength"/>, to be used, in addition to the |
| | 84 | | /// <see cref="EqClassesPcsHalfLength"/>, to calculate the equivalence classes of length <see cref="PcsLength"/>. |
| | 85 | | /// </summary> |
| 59 | 86 | | public IList<int> Order { get; } |
| | 87 | |
|
| | 88 | | /// <summary> |
| | 89 | | /// <inheritdoc cref="EqClassesBasedDoubleLengthPcsClassifier"/> |
| | 90 | | /// </summary> |
| | 91 | | /// <param name="pcsLength"><inheritdoc cref="PcsLength" path="/summary"/></param> |
| | 92 | | /// <param name="eqClassesPcsHalfLength"><inheritdoc cref="EqClassesPcsHalfLength" path="/summary"/></param> |
| | 93 | | /// <param name="order"><inheritdoc cref="Order" path="/summary"/></param> |
| 69 | 94 | | public EqClassesBasedDoubleLengthPcsClassifier(int pcsLength, IList<int> eqClassesPcsHalfLength, IList<int> order) |
| 69 | 95 | | { |
| 69 | 96 | | if (pcsLength % 2 != 0) |
| 3 | 97 | | throw new ArgumentException( |
| 3 | 98 | | $"Must be even.", nameof(pcsLength)); |
| 66 | 99 | | if (pcsLength <= 0) |
| 2 | 100 | | throw new ArgumentOutOfRangeException( |
| 2 | 101 | | nameof(pcsLength), $"Must be positive."); |
| 64 | 102 | | if (order.Count != eqClassesPcsHalfLength.Count) |
| 5 | 103 | | throw new ArgumentException( |
| 5 | 104 | | $"Must have the same number of items as {nameof(eqClassesPcsHalfLength)}.", nameof(order)); |
| | 105 | |
|
| 59 | 106 | | PcsLength = pcsLength; |
| 59 | 107 | | EqClassesPcsHalfLength = eqClassesPcsHalfLength; |
| 59 | 108 | | Order = order; |
| 59 | 109 | | } |
| | 110 | |
|
| | 111 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 112 | | /// <remarks> |
| | 113 | | /// <inheritdoc cref="EqClassesBasedDoubleLengthPcsClassifier" path="/remarks"/> |
| | 114 | | /// </remarks> |
| | 115 | | public IList<int> Classify() |
| 59 | 116 | | { |
| 59 | 117 | | var n = EqClassesPcsHalfLength.Count; |
| 59 | 118 | | var eqClasses = new int[n]; |
| | 119 | |
|
| 59 | 120 | | var (headList, tail) = Order.EnumerateExactlyFirst(1); |
| 59 | 121 | | var head = headList.Single(); |
| | 122 | |
|
| 59 | 123 | | var currentEqClass = 0; |
| 59 | 124 | | eqClasses[head] = currentEqClass; |
| | 125 | |
|
| 59 | 126 | | var previousFirstHalfIndex = head; |
| 59 | 127 | | var previousSecondHalfIndex = (previousFirstHalfIndex + PcsLength / 2) % n; |
| 879 | 128 | | foreach (var firstHalfIndex in tail) |
| 351 | 129 | | { |
| 351 | 130 | | var secondHalfIndex = (firstHalfIndex + PcsLength / 2) % n; |
| 351 | 131 | | eqClasses[firstHalfIndex] = |
| 351 | 132 | | EqClassesPcsHalfLength[firstHalfIndex] == EqClassesPcsHalfLength[previousFirstHalfIndex] && |
| 351 | 133 | | EqClassesPcsHalfLength[secondHalfIndex] == EqClassesPcsHalfLength[previousSecondHalfIndex] |
| 351 | 134 | | ? currentEqClass |
| 351 | 135 | | : ++currentEqClass; |
| | 136 | |
|
| 351 | 137 | | previousFirstHalfIndex = firstHalfIndex; |
| 351 | 138 | | previousSecondHalfIndex = secondHalfIndex; |
| 351 | 139 | | } |
| | 140 | |
|
| 59 | 141 | | return eqClasses; |
| 59 | 142 | | } |
| | 143 | | } |