| | 1 | | namespace MoreStructures.Strings.Sorting; |
| | 2 | |
|
| | 3 | | /// <summary> |
| | 4 | | /// An implementation of <see cref="ICharsSorter"/> which uses Counting Sort to sort the input. |
| | 5 | | /// </summary> |
| | 6 | | /// <remarks> |
| | 7 | | /// <para id="advantages"> |
| | 8 | | /// ADVANTAGES |
| | 9 | | /// <br/> |
| | 10 | | /// - The algorithm is a Counting Sort adaptation and specialization to strings, seen as lists of chars. |
| | 11 | | /// <br/> |
| | 12 | | /// - It leverages a runtime linear in the size of the input and its alphabet, which is better than any worst-case |
| | 13 | | /// scenario of a comparison-based algorithm, such as QuickSort or MergeSort. |
| | 14 | | /// <br/> |
| | 15 | | /// - However, because its runtime depends not only on the size of the input, but also on the size of the alphabet, |
| | 16 | | /// it is suitable for scenarios of small alphabets only, or at least when there is an upper bound on the number |
| | 17 | | /// of distinct chars in the input (and they are somehow known in advance), which is small enough to fit the |
| | 18 | | /// histogram of occurrences in memory. |
| | 19 | | /// <br/> |
| | 20 | | /// - For example: genome sequences can be mapped to 4-chars strings; English lower-case sentences are limited to |
| | 21 | | /// 26 chars plus space, punctuation, etc; digit sequences consist of an alphabet of 10 chars only etc. |
| | 22 | | /// <br/> |
| | 23 | | /// - It is not a good fit for scenarios where the number of distinct values can be very high, such as an |
| | 24 | | /// unconstrained Unicode string. |
| | 25 | | /// </para> |
| | 26 | | /// <para id="algo"> |
| | 27 | | /// ALGORITHM |
| | 28 | | /// <br/> |
| | 29 | | /// - First, a histogram of the occurrences in the input of all the chars in the alphabet is created, going through |
| | 30 | | /// each char of the input and incrementing the counter of the char in the histogram. |
| | 31 | | /// <br/> |
| | 32 | | /// - Then the histogram is made cumulative, by going through each item of the histogram array, starting from the |
| | 33 | | /// second item, and cumulating the previous sum into the value at the current position. |
| | 34 | | /// <br/> |
| | 35 | | /// - Finally, all chars of the input are iterated once again, in reverse order, to build the order list, i.e. the |
| | 36 | | /// list of positions. |
| | 37 | | /// </para> |
| | 38 | | /// <para id="complexity"> |
| | 39 | | /// COMPLEXITY |
| | 40 | | /// <br/> |
| | 41 | | /// - Building the histogram requires iterating over all the n chars of the input. |
| | 42 | | /// <br/> |
| | 43 | | /// - The histogram has a size equal to the size of the alphabet, sigma, of the input. |
| | 44 | | /// <br/> |
| | 45 | | /// - Making the histogram cumulative requires going through each item of it, and there are sigma items. |
| | 46 | | /// <br/> |
| | 47 | | /// - Finally, the second pass of the string means n more iterations, each one doing constant-time work. |
| | 48 | | /// <br/> |
| | 49 | | /// - The second pass builds the order list, which is a new array of n items (one per index of the input). |
| | 50 | | /// <br/> |
| | 51 | | /// - Therefore, Time Complexity is O(n + sigma) and Space Complexity is O(n + sigma). |
| | 52 | | /// </para> |
| | 53 | | /// </remarks> |
| | 54 | | public class CountingSortCharsSorter : ICharsSorter |
| | 55 | | { |
| | 56 | | /// <summary> |
| | 57 | | /// The alphabet of the input, i.e. the list of all <see cref="char"/> potentially appearing in the input string, |
| | 58 | | /// mapped to an alphabet index. |
| | 59 | | /// </summary> |
| | 60 | | /// <remarks> |
| | 61 | | /// Required by the Counting Sort algorithm, which builds the histogram of occurrences in the input, of all chars |
| | 62 | | /// of the alphabet of the input. |
| | 63 | | /// </remarks> |
| 282 | 64 | | public IDictionary<char, int> Alphabet { get; } |
| | 65 | |
|
| | 66 | | /// <summary> |
| | 67 | | /// <inheritdoc cref="CountingSortCharsSorter"/> |
| | 68 | | /// </summary> |
| | 69 | | /// <remarks> |
| | 70 | | /// <inheritdoc cref="CountingSortCharsSorter"/> |
| | 71 | | /// </remarks> |
| | 72 | | /// <param name="alphabet"><inheritdoc cref="Alphabet" path="/summary"/></param> |
| 22 | 73 | | public CountingSortCharsSorter(IDictionary<char, int> alphabet) |
| 22 | 74 | | { |
| 22 | 75 | | Alphabet = alphabet; |
| 22 | 76 | | } |
| | 77 | |
|
| | 78 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 79 | | /// <remarks> |
| | 80 | | /// <inheritdoc cref="CountingSortCharsSorter" path="/remarks"/> |
| | 81 | | /// </remarks> |
| | 82 | | public IList<int> Sort(string input) |
| 22 | 83 | | { |
| | 84 | | // Build histogram |
| 22 | 85 | | var counts = new int[Alphabet.Count]; |
| 304 | 86 | | for (int i = 0; i < input.Length; i++) |
| 130 | 87 | | counts[Alphabet[input[i]]]++; |
| | 88 | |
|
| | 89 | | // Make histogram cumulative |
| 556 | 90 | | for (int i = 1; i < counts.Length; i++) |
| 256 | 91 | | counts[i] += counts[i - 1]; |
| | 92 | |
|
| | 93 | | // Calculate order list |
| 22 | 94 | | var order = new int[input.Length]; |
| 304 | 95 | | for (int i = input.Length - 1; i >= 0; i--) |
| 130 | 96 | | { |
| 130 | 97 | | var alphabetIndex = Alphabet[input[i]]; |
| 130 | 98 | | var position = counts[alphabetIndex] = counts[alphabetIndex] - 1; |
| 130 | 99 | | order[position] = i; |
| 130 | 100 | | } |
| | 101 | |
|
| 22 | 102 | | return order; |
| 22 | 103 | | } |
| | 104 | | } |