| | 1 | | using MoreStructures.Utilities; |
| | 2 | |
|
| | 3 | | namespace MoreStructures.SuffixArrays.CyclicShifts; |
| | 4 | |
|
| | 5 | | /// <summary> |
| | 6 | | /// An implementation of <see cref="IDoubleLengthPcsClassifier"/> which solely depends on the input string, to generate |
| | 7 | | /// equivalence classes. |
| | 8 | | /// </summary> |
| | 9 | | /// <remarks> |
| | 10 | | /// <para id="advantages"> |
| | 11 | | /// ADVANTAGES AND DISADVANTAGES |
| | 12 | | /// <br/> |
| | 13 | | /// - Unlike other implementations, such as <see cref="OrderBasedDoubleLengthPcsClassifier"/> or |
| | 14 | | /// <see cref="EqClassesBasedDoubleLengthPcsClassifier"/>, this classifier needs to know whether the input being |
| | 15 | | /// provided includes a terminator (i.e. its last char is "special" as it uniquely identify the end of the |
| | 16 | | /// string) or not. |
| | 17 | | /// <br/> |
| | 18 | | /// - This information is required since this classifier performs classification by actually sorting PCS, and |
| | 19 | | /// sorting PCS requires knowing whether a terminator is present or not (and which one it is). |
| | 20 | | /// <br/> |
| | 21 | | /// - Other implementations don't require such information, since they never compare PCS against each other: they |
| | 22 | | /// either use an externally provided position list or compare via the equivalence class list. |
| | 23 | | /// </para> |
| | 24 | | /// <para id="algo"> |
| | 25 | | /// ALGORITHM |
| | 26 | | /// <br/> |
| | 27 | | /// - PCS of length L are generated as actual strings from the input string, then sorted in lexicographic order via |
| | 28 | | /// the <see cref="Enumerable.OrderBy{TSource, TKey}(IEnumerable{TSource}, Func{TSource, TKey})"/> method. |
| | 29 | | /// <br/> |
| | 30 | | /// - The ordered sequence of PCS with corresponding starting index in the input string is then iterated over. |
| | 31 | | /// <br/> |
| | 32 | | /// - A new equivalence class is generated (increasing the current counter) every time a PCS differs from the |
| | 33 | | /// previous one. The equivalence class is assigned to the output at the index of the PCS. |
| | 34 | | /// <br/> |
| | 35 | | /// - Finally, the equivalence class list is returned. |
| | 36 | | /// </para> |
| | 37 | | /// <para id="complexity"> |
| | 38 | | /// COMPLEXITY |
| | 39 | | /// <br/> |
| | 40 | | /// - Extracting the n PCS of length L from the input string has O(n * L) Time and Space Complexity. |
| | 41 | | /// <br/> |
| | 42 | | /// - Sorting the n PCS of length L via the LINQ-provided QuickSort has O(n^2 * L) Time Complexity and O(n) Space |
| | 43 | | /// Complexity. |
| | 44 | | /// <br/> |
| | 45 | | /// - Iterating over the n PCS of length L is an O(n) operation. Each iteration does O(L) work, since, while |
| | 46 | | /// direct access to the equivalence class list is done in constant time, comparison between the current PCS |
| | 47 | | /// and the previous one is a comparison of two strings of length L, and requires all L chars to be comparerd in |
| | 48 | | /// the worst case. |
| | 49 | | /// <br/> |
| | 50 | | /// - Instantiating the equivalence class list of output is also an O(n) operation. |
| | 51 | | /// <br/> |
| | 52 | | /// - Therefore, Time Complexity, as driven by sorting, is O(n^2 * L) and Space Complexity, as driven by the PCS |
| | 53 | | /// generating and iteration, is O(n * L). |
| | 54 | | /// </para> |
| | 55 | | /// </remarks> |
| | 56 | | public class NaiveDoubleLengthPcsClassifier : IDoubleLengthPcsClassifier |
| | 57 | | { |
| 23 | 58 | | private IComparer<string> PcsComparer { get; } |
| | 59 | |
|
| | 60 | | /// <summary> |
| | 61 | | /// The input text, whose PCS of length <see cref="PcsLength"/> have to be classified. |
| | 62 | | /// </summary> |
| 46 | 63 | | public string Input { get; } |
| | 64 | |
|
| | 65 | | /// <summary> |
| | 66 | | /// The length of the PCS of <see cref="Input"/> to be classified. |
| | 67 | | /// </summary> |
| 23 | 68 | | public int PcsLength { get; } |
| | 69 | |
|
| | 70 | | /// <summary> |
| | 71 | | /// <inheritdoc cref="NaiveDoubleLengthPcsClassifier"/> |
| | 72 | | /// </summary> |
| | 73 | | /// <param name="input"><inheritdoc cref="Input" path="/summary"/></param> |
| | 74 | | /// <param name="pcsLength"><inheritdoc cref="PcsLength" path="/summary"/></param> |
| | 75 | | /// <param name="inputWithTerminator"> |
| | 76 | | /// Whether <paramref name="input"/> is terminated by a terminator char. If so, the last char of |
| | 77 | | /// <paramref name="input"/> will be treated as a terminator char when comparing PCS of the input. |
| | 78 | | /// Otherwise, <see cref="Comparer{T}.Default"/> for <see cref="string"/> will be used. |
| | 79 | | /// </param> |
| 28 | 80 | | public NaiveDoubleLengthPcsClassifier(string input, int pcsLength, bool inputWithTerminator) |
| 28 | 81 | | { |
| 28 | 82 | | if (pcsLength <= 0 || pcsLength > input.Length) |
| 5 | 83 | | throw new ArgumentOutOfRangeException( |
| 5 | 84 | | nameof(pcsLength), $"Must be positive and at most the length of {nameof(input)}."); |
| | 85 | |
|
| 23 | 86 | | Input = input; |
| 23 | 87 | | PcsLength = pcsLength; |
| 23 | 88 | | PcsComparer = inputWithTerminator |
| 23 | 89 | | ? StringIncludingTerminatorComparer.Build(input[^1]) |
| 23 | 90 | | : Comparer<string>.Default; |
| 23 | 91 | | } |
| | 92 | |
|
| | 93 | | private sealed class PcsAndIndexComparer : IComparer<(string pcs, int index)> |
| | 94 | | { |
| | 95 | | private readonly IComparer<string> PcsComparer; |
| | 96 | | private readonly IComparer<int> IndexComparer; |
| | 97 | |
|
| 23 | 98 | | public PcsAndIndexComparer(IComparer<string> pcsComparer) |
| 23 | 99 | | { |
| 23 | 100 | | PcsComparer = pcsComparer; |
| 23 | 101 | | IndexComparer = Comparer<int>.Default; |
| 23 | 102 | | } |
| | 103 | |
|
| | 104 | | public int Compare((string pcs, int index) x, (string pcs, int index) y) |
| 201 | 105 | | { |
| 201 | 106 | | var pcsComparison = PcsComparer.Compare(x.pcs, y.pcs); |
| 201 | 107 | | if (pcsComparison != 0) |
| 193 | 108 | | return pcsComparison; |
| 8 | 109 | | return IndexComparer.Compare(x.index, y.index); |
| 201 | 110 | | } |
| | 111 | | } |
| | 112 | |
|
| | 113 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 114 | | /// <remarks> |
| | 115 | | /// <inheritdoc cref="NaiveDoubleLengthPcsClassifier" path="/remarks"/> |
| | 116 | | /// </remarks> |
| | 117 | | public IList<int> Classify() |
| 23 | 118 | | { |
| 23 | 119 | | var (headList, tail) = PcsUtils |
| 23 | 120 | | .ExtractPcsOf(Input, PcsLength) |
| 114 | 121 | | .OrderBy(s => s, new PcsAndIndexComparer(PcsComparer)) |
| 23 | 122 | | .EnumerateExactlyFirst(1); |
| | 123 | |
|
| 23 | 124 | | var (firstPcs, firstIndex) = headList.Single(); |
| 23 | 125 | | var eqClasses = new int[Input.Length]; |
| | 126 | |
|
| 23 | 127 | | var currentEqClass = 0; |
| 23 | 128 | | eqClasses[firstIndex] = currentEqClass; |
| | 129 | |
|
| 23 | 130 | | var previousPcs = firstPcs; |
| 251 | 131 | | foreach (var (pcs, index) in tail) |
| 91 | 132 | | { |
| 91 | 133 | | eqClasses[index] = pcs == previousPcs ? currentEqClass : ++currentEqClass; |
| 91 | 134 | | previousPcs = pcs; |
| 91 | 135 | | } |
| | 136 | |
|
| 23 | 137 | | return eqClasses; |
| 23 | 138 | | } |
| | 139 | | } |