| | 1 | | namespace MoreStructures.BurrowsWheelerTransform.Builders.LastFirstFinders; |
| | 2 | |
|
| | 3 | |
|
| | 4 | | /// <summary> |
| | 5 | | /// A <see cref="BinarySearchFinder"/> refinement which precalculate an hash-map of all the positions by each |
| | 6 | | /// char, for both BWT and its sorted version, which takes ~ 2 * n space and makes calls to |
| | 7 | | /// <see cref="FindIndexOfNthOccurrenceInBWT(int, int)"/> and <see cref="FindIndexOfNthOccurrenceInSortedBWT"/> |
| | 8 | | /// executed in constant time. |
| | 9 | | /// </summary> |
| | 10 | | /// <remarks> |
| | 11 | | /// Calls to <see cref="FindOccurrenceRankOfCharInBWT(int)"/> are still executed in O(n / sigma) time, where sigma is |
| | 12 | | /// the size of the alphabet, or the number of distinct values in the BWT. If sigma is constant w.r.t. n, complexity |
| | 13 | | /// is still linear. |
| | 14 | | /// <br/> |
| | 15 | | /// Calls to <see cref="FindOccurrenceRankOfCharInSortedBWT(int)"/> are still executed in O(log(n / sigma)), which is |
| | 16 | | /// O(log(n)) when sigma is constant w.r.t. n. |
| | 17 | | /// </remarks> |
| | 18 | | public class PrecomputedFinder : BinarySearchFinder |
| | 19 | | { |
| | 20 | | /// <summary> |
| | 21 | | /// The <see cref="Lists.Searching.ISearch"/> implementation to be used when searching for items in lists not |
| | 22 | | /// sorted in any order. |
| | 23 | | /// </summary> |
| 275 | 24 | | protected static Lists.Searching.ISearch UnorderedListSearch { get; } = new Lists.Searching.LinearSearch(); |
| | 25 | |
|
| | 26 | | private readonly IDictionary<char, IList<int>> _bwtOccurrenceIndexesOfChar; |
| | 27 | | private readonly IDictionary<char, IList<int>> _sbwtOccurrenceIndexesOfChar; |
| | 28 | |
|
| | 29 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 30 | | /// <remarks> |
| | 31 | | /// <inheritdoc cref="PrecomputedFinder" path="/summary"/> |
| | 32 | | /// </remarks> |
| | 33 | | public PrecomputedFinder(RotatedTextWithTerminator lastBWMColumn, RotatedTextWithTerminator firstBWMColumn) |
| 63 | 34 | | : base(lastBWMColumn, firstBWMColumn) |
| 63 | 35 | | { |
| 63 | 36 | | _bwtOccurrenceIndexesOfChar = GetOccurrenceIndexesOfAllCharsIn(BWT); |
| 63 | 37 | | _sbwtOccurrenceIndexesOfChar = GetOccurrenceIndexesOfAllCharsIn(SortedBWT); |
| 63 | 38 | | } |
| | 39 | |
|
| | 40 | | private static IDictionary<char, IList<int>> GetOccurrenceIndexesOfAllCharsIn(IEnumerable<char> chars) |
| 126 | 41 | | { |
| 126 | 42 | | var result = new Dictionary<char, IList<int>>() { }; |
| 126 | 43 | | var index = 0; |
| 2650 | 44 | | foreach (var c in chars) |
| 1136 | 45 | | { |
| 1136 | 46 | | if (!result.ContainsKey(c)) |
| 578 | 47 | | result[c] = new List<int> { }; |
| 1136 | 48 | | result[c].Add(index); |
| 1136 | 49 | | index++; |
| 1136 | 50 | | } |
| 126 | 51 | | return result; |
| 126 | 52 | | } |
| | 53 | |
|
| | 54 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 55 | | /// <remarks> |
| | 56 | | /// This implementation uses a precomputed hash-map of all the positions by each char. |
| | 57 | | /// <br/> |
| | 58 | | /// Time Complexity = O(1). Space Complexity = O(1). |
| | 59 | | /// </remarks> |
| | 60 | | public override int FindIndexOfNthOccurrenceInBWT(int indexOfCharInBWT, int occurrenceRank) |
| 16 | 61 | | { |
| 16 | 62 | | if (indexOfCharInBWT < 0 || indexOfCharInBWT >= BWT.Length) |
| 1 | 63 | | throw new ArgumentOutOfRangeException(nameof(indexOfCharInBWT), $"Invalid {nameof(indexOfCharInBWT)}."); |
| | 64 | |
|
| 15 | 65 | | var charToFind = BWT[indexOfCharInBWT]; |
| | 66 | |
|
| 15 | 67 | | return _bwtOccurrenceIndexesOfChar[charToFind][occurrenceRank]; |
| 9 | 68 | | } |
| | 69 | |
|
| | 70 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 71 | | /// <remarks> |
| | 72 | | /// <inheritdoc cref="FindIndexOfNthOccurrenceInBWT(int, int)"/> |
| | 73 | | /// </remarks> |
| | 74 | | public override int FindIndexOfNthOccurrenceInSortedBWT(int indexOfCharInBWT, int occurrenceRank) |
| 283 | 75 | | { |
| 283 | 76 | | if (indexOfCharInBWT < 0 || indexOfCharInBWT >= BWT.Length) |
| 1 | 77 | | throw new ArgumentOutOfRangeException(nameof(indexOfCharInBWT), $"Invalid value: {indexOfCharInBWT}."); |
| | 78 | |
|
| 282 | 79 | | var charToFind = BWT[indexOfCharInBWT]; |
| | 80 | |
|
| 282 | 81 | | if (occurrenceRank < 0 || occurrenceRank >= _sbwtOccurrenceIndexesOfChar[charToFind].Count) |
| 6 | 82 | | throw new ArgumentOutOfRangeException(nameof(occurrenceRank), $"Invalid value: {occurrenceRank}."); |
| | 83 | |
|
| 276 | 84 | | return _sbwtOccurrenceIndexesOfChar[charToFind].First() + occurrenceRank; |
| 276 | 85 | | } |
| | 86 | |
|
| | 87 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 88 | | /// <remarks> |
| | 89 | | /// <para id="algo"> |
| | 90 | | /// ALGORITHM |
| | 91 | | /// <br/> |
| | 92 | | /// This implementation uses a precomputed hash-map of all the positions by each char. |
| | 93 | | /// <br/> |
| | 94 | | /// However, unlike <see cref="ILastFirstFinder.SortedBWT"/>, <see cref="ILastFirstFinder.BWT"/> is not sorted, |
| | 95 | | /// so the precomputed list storing all the indexes where the char of <see cref="ILastFirstFinder.BWT"/> at |
| | 96 | | /// index <paramref name="indexOfCharInBWT"/> appears can be accessed in O(1) but has to be iterated over |
| | 97 | | /// linearly. |
| | 98 | | /// <br/> |
| | 99 | | /// Such a list has in average n / sigma elements, where sigma is the number of distinct chars in the text. |
| | 100 | | /// If sigma is constant, the Time Complexity is O(n). |
| | 101 | | /// <br/> |
| | 102 | | /// Space Complexity is always O(1), since O(n * sigma) space has already been allocated to host the result of |
| | 103 | | /// counts and first occurrences precomputation. |
| | 104 | | /// </para> |
| | 105 | | /// </remarks> |
| | 106 | | public override int FindOccurrenceRankOfCharInBWT(int indexOfCharInBWT) |
| 276 | 107 | | { |
| 276 | 108 | | if (indexOfCharInBWT < 0 || indexOfCharInBWT >= BWT.Length) |
| 2 | 109 | | throw new ArgumentOutOfRangeException(nameof(indexOfCharInBWT), $"Invalid value: {indexOfCharInBWT}"); |
| | 110 | |
|
| 274 | 111 | | var indexesOfChar = _bwtOccurrenceIndexesOfChar[BWT[indexOfCharInBWT]]; |
| 274 | 112 | | return UnorderedListSearch.First(indexesOfChar, indexOfCharInBWT); |
| 274 | 113 | | } |
| | 114 | |
|
| | 115 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 116 | | /// <remarks> |
| | 117 | | /// <para id="algo"> |
| | 118 | | /// ALGORITHM |
| | 119 | | /// <br/> |
| | 120 | | /// This implementation uses a precomputed hash-map of all the positions by each char. |
| | 121 | | /// <br/> |
| | 122 | | /// It also takes advantage of the fact that <see cref="ILastFirstFinder.SortedBWT"/> is sorted, by running a |
| | 123 | | /// Binary Search on it, which takes logarithmic time over the list of indexes for the char at position |
| | 124 | | /// <paramref name="indexOfCharInSortedBWT"/> in <see cref="ILastFirstFinder.BWT"/>. |
| | 125 | | /// <br/> |
| | 126 | | /// Such list has average size = n / sigma, where n = number of chars in |
| | 127 | | /// <see cref="ILastFirstFinder.SortedBWT"/> and sigma = size of the alphabet of |
| | 128 | | /// <see cref="ILastFirstFinder.SortedBWT"/>. |
| | 129 | | /// <br/> |
| | 130 | | /// If sigma is constant, the list has a size O(n). |
| | 131 | | /// </para> |
| | 132 | | /// <para id="complexity"> |
| | 133 | | /// COMPLEXITY |
| | 134 | | /// <br/> |
| | 135 | | /// Therefore, Time Complexity = O(log(n)) and Space Complexity = O(1). |
| | 136 | | /// </para> |
| | 137 | | /// </remarks> |
| | 138 | | public override int FindOccurrenceRankOfCharInSortedBWT(int indexOfCharInSortedBWT) |
| 9 | 139 | | { |
| 9 | 140 | | if (indexOfCharInSortedBWT < 0 || indexOfCharInSortedBWT >= SortedBWT.Length) |
| 2 | 141 | | throw new ArgumentOutOfRangeException( |
| 2 | 142 | | nameof(indexOfCharInSortedBWT), $"Invalid value: {indexOfCharInSortedBWT}"); |
| | 143 | |
|
| 7 | 144 | | var indexesOfChar = _sbwtOccurrenceIndexesOfChar[SortedBWT[indexOfCharInSortedBWT]]; |
| 7 | 145 | | return OrderedAscListSearch.First(indexesOfChar, indexOfCharInSortedBWT); |
| 7 | 146 | | } |
| | 147 | | } |