| | 1 | | using MoreStructures.Utilities; |
| | 2 | |
|
| | 3 | | namespace MoreStructures.SuffixArrays.CyclicShifts; |
| | 4 | |
|
| | 5 | | /// <summary> |
| | 6 | | /// An implementation of <see cref="IDoubleLengthPcsClassifier"/> which uses the externally provided position list of |
| | 7 | | /// the PCS of length <see cref="PcsLength"/>, in addition to the input string, to generate equivalence classes. |
| | 8 | | /// </summary> |
| | 9 | | /// <remarks> |
| | 10 | | /// <para id="advantages"> |
| | 11 | | /// ADVANTAGES AND DISADVANTAGES |
| | 12 | | /// <br/> |
| | 13 | | /// - Unlike <see cref="NaiveDoubleLengthPcsClassifier"/>, this implementation requires additional data structures |
| | 14 | | /// to be provided, in addition to the <see cref="Input"/>, to calculate equivalence classes. |
| | 15 | | /// <br/> |
| | 16 | | /// - However, unlike <see cref="NaiveDoubleLengthPcsClassifier"/>, it does not require to know whether |
| | 17 | | /// <see cref="Input"/> contains a terminator or not. |
| | 18 | | /// <br/> |
| | 19 | | /// - This is because such piece of information would only be needed when running comparisons between PCS. |
| | 20 | | /// <br/> |
| | 21 | | /// - This classifier, on the other hand, uses an externally provided position list precisely in order to avoid the |
| | 22 | | /// need for costly PCS comparisons, which are "embedded" in the externally provided position list. |
| | 23 | | /// <br/> |
| | 24 | | /// - PCS are actually still compared for equality. However, comparison for equality, unlike comparison for |
| | 25 | | /// sorting, doesn't require to know whether a char is terminator or not, since a terminator is a char, it is |
| | 26 | | /// only equal to itself and different from any other char. |
| | 27 | | /// </para> |
| | 28 | | /// <para id="algo"> |
| | 29 | | /// ALGORITHM |
| | 30 | | /// <br/> |
| | 31 | | /// - The externally provided ordered sequence of PCS <see cref="Order"/> is iterated over. |
| | 32 | | /// <br/> |
| | 33 | | /// - A new equivalence class is generated (increasing the current counter) every time a PCS differs from the |
| | 34 | | /// previous one. The equivalence class is assigned to the output at the index of the PCS being checked. |
| | 35 | | /// <br/> |
| | 36 | | /// - The comparison between a PCS and the previous one (both of length <see cref="PcsLength"/>), is done comparing |
| | 37 | | /// the <see cref="PcsLength"/> chars of the two strings. |
| | 38 | | /// <br/> |
| | 39 | | /// - Finally, the equivalence class list is returned. |
| | 40 | | /// </para> |
| | 41 | | /// <para id="complexity"> |
| | 42 | | /// COMPLEXITY |
| | 43 | | /// <br/> |
| | 44 | | /// - Iterating over the n PCS of length L is an O(n) operation. Each iteration does O(L) work, since, while |
| | 45 | | /// direct access to the equivalence class list is done in constant time, comparison between the current PCS |
| | 46 | | /// and the previous one is a comparison of two strings of length L, and requires all L chars to be comparerd in |
| | 47 | | /// the worst case. |
| | 48 | | /// <br/> |
| | 49 | | /// - Instantiating the equivalence class list of output is also an O(n) operation. |
| | 50 | | /// <br/> |
| | 51 | | /// - Therefore, Time Complexity, as driven by the iteration over <see cref="Order"/>, is O(n * L). Space |
| | 52 | | /// Complexity, driven as well by iteration over <see cref="Order"/> storing the previous PCS, is O(n + L). |
| | 53 | | /// </para> |
| | 54 | | /// </remarks> |
| | 55 | | public class OrderBasedDoubleLengthPcsClassifier : IDoubleLengthPcsClassifier |
| | 56 | | { |
| | 57 | | /// <summary> |
| | 58 | | /// The input text, whose PCS of length <see cref="PcsLength"/> have to be classified. |
| | 59 | | /// </summary> |
| 137 | 60 | | public string Input { get; } |
| | 61 | |
|
| | 62 | | /// <summary> |
| | 63 | | /// The length the PCS of <see cref="Input"/> to be classified. |
| | 64 | | /// </summary> |
| 114 | 65 | | public int PcsLength { get; } |
| | 66 | |
|
| | 67 | | /// <summary> |
| | 68 | | /// The position list of the PCS of length <see cref="PcsLength"/>, to be used, in addition to the |
| | 69 | | /// <see cref="Input"/>, to calculate the equivalence classes. |
| | 70 | | /// </summary> |
| 23 | 71 | | public IList<int> Order { get; } |
| | 72 | |
|
| | 73 | | /// <summary> |
| | 74 | | /// <inheritdoc cref="OrderBasedDoubleLengthPcsClassifier"/> |
| | 75 | | /// </summary> |
| | 76 | | /// <param name="input"><inheritdoc cref="Input" path="/summary"/></param> |
| | 77 | | /// <param name="pcsLength"><inheritdoc cref="PcsLength" path="/summary"/></param> |
| | 78 | | /// <param name="order"><inheritdoc cref="Order" path="/summary"/></param> |
| 31 | 79 | | public OrderBasedDoubleLengthPcsClassifier(string input, int pcsLength, IList<int> order) |
| 31 | 80 | | { |
| 31 | 81 | | if (pcsLength <= 0 || pcsLength > input.Length) |
| 5 | 82 | | throw new ArgumentOutOfRangeException( |
| 5 | 83 | | nameof(pcsLength), $"Must be positive and at most the length of {nameof(input)}."); |
| 26 | 84 | | if (order.Count != input.Length) |
| 3 | 85 | | throw new ArgumentException( |
| 3 | 86 | | $"Must have the same number of items as chars in {nameof(input)}.", nameof(order)); |
| | 87 | |
|
| 23 | 88 | | Input = input; |
| 23 | 89 | | PcsLength = pcsLength; |
| 23 | 90 | | Order = order; |
| 23 | 91 | | } |
| | 92 | |
|
| | 93 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 94 | | /// <remarks> |
| | 95 | | /// <inheritdoc cref="OrderBasedDoubleLengthPcsClassifier" path="/remarks"/> |
| | 96 | | /// </remarks> |
| | 97 | | public IList<int> Classify() |
| 23 | 98 | | { |
| 23 | 99 | | var eqClasses = new int[Input.Length]; |
| | 100 | |
|
| 23 | 101 | | var (headList, tail) = Order.EnumerateExactlyFirst(1); |
| 23 | 102 | | var head = headList.Single(); |
| | 103 | |
|
| 23 | 104 | | var currentEqClass = 0; |
| 23 | 105 | | eqClasses[head] = currentEqClass; |
| | 106 | |
|
| 23 | 107 | | var previousPcs = PcsUtils.ExtractPcsOf(Input, head, PcsLength); |
| 251 | 108 | | foreach (var currentIndex in tail) |
| 91 | 109 | | { |
| 91 | 110 | | var currentPcs = PcsUtils.ExtractPcsOf(Input, currentIndex, PcsLength); |
| 91 | 111 | | eqClasses[currentIndex] = currentPcs == previousPcs ? currentEqClass : ++currentEqClass; |
| 91 | 112 | | previousPcs = currentPcs; |
| 91 | 113 | | } |
| | 114 | |
|
| 23 | 115 | | return eqClasses; |
| 23 | 116 | | } |
| | 117 | | } |