| | | 1 | | namespace MoreStructures.BurrowsWheelerTransform.Builders.LastFirstFinders; |
| | | 2 | | |
| | | 3 | | |
| | | 4 | | /// <summary> |
| | | 5 | | /// A <see cref="BinarySearchFinder"/> refinement which precalculate an hash-map of all the positions by each |
| | | 6 | | /// char, for both BWT and its sorted version, which takes ~ 2 * n space and makes calls to |
| | | 7 | | /// <see cref="FindIndexOfNthOccurrenceInBWT(int, int)"/> and <see cref="FindIndexOfNthOccurrenceInSortedBWT"/> |
| | | 8 | | /// executed in constant time. |
| | | 9 | | /// </summary> |
| | | 10 | | /// <remarks> |
| | | 11 | | /// Calls to <see cref="FindOccurrenceRankOfCharInBWT(int)"/> are still executed in O(n / sigma) time, where sigma is |
| | | 12 | | /// the size of the alphabet, or the number of distinct values in the BWT. If sigma is constant w.r.t. n, complexity |
| | | 13 | | /// is still linear. |
| | | 14 | | /// <br/> |
| | | 15 | | /// Calls to <see cref="FindOccurrenceRankOfCharInSortedBWT(int)"/> are still executed in O(log(n / sigma)), which is |
| | | 16 | | /// O(log(n)) when sigma is constant w.r.t. n. |
| | | 17 | | /// </remarks> |
| | | 18 | | public class PrecomputedFinder : BinarySearchFinder |
| | | 19 | | { |
| | | 20 | | /// <summary> |
| | | 21 | | /// The <see cref="Lists.Searching.ISearch"/> implementation to be used when searching for items in lists not |
| | | 22 | | /// sorted in any order. |
| | | 23 | | /// </summary> |
| | 275 | 24 | | protected static Lists.Searching.ISearch UnorderedListSearch { get; } = new Lists.Searching.LinearSearch(); |
| | | 25 | | |
| | | 26 | | private readonly IDictionary<char, IList<int>> _bwtOccurrenceIndexesOfChar; |
| | | 27 | | private readonly IDictionary<char, IList<int>> _sbwtOccurrenceIndexesOfChar; |
| | | 28 | | |
| | | 29 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | | 30 | | /// <remarks> |
| | | 31 | | /// <inheritdoc cref="PrecomputedFinder" path="/summary"/> |
| | | 32 | | /// </remarks> |
| | | 33 | | public PrecomputedFinder(RotatedTextWithTerminator lastBWMColumn, RotatedTextWithTerminator firstBWMColumn) |
| | 63 | 34 | | : base(lastBWMColumn, firstBWMColumn) |
| | 63 | 35 | | { |
| | 63 | 36 | | _bwtOccurrenceIndexesOfChar = GetOccurrenceIndexesOfAllCharsIn(BWT); |
| | 63 | 37 | | _sbwtOccurrenceIndexesOfChar = GetOccurrenceIndexesOfAllCharsIn(SortedBWT); |
| | 63 | 38 | | } |
| | | 39 | | |
| | | 40 | | private static IDictionary<char, IList<int>> GetOccurrenceIndexesOfAllCharsIn(IEnumerable<char> chars) |
| | 126 | 41 | | { |
| | 126 | 42 | | var result = new Dictionary<char, IList<int>>() { }; |
| | 126 | 43 | | var index = 0; |
| | 2650 | 44 | | foreach (var c in chars) |
| | 1136 | 45 | | { |
| | 1136 | 46 | | if (!result.ContainsKey(c)) |
| | 578 | 47 | | result[c] = new List<int> { }; |
| | 1136 | 48 | | result[c].Add(index); |
| | 1136 | 49 | | index++; |
| | 1136 | 50 | | } |
| | 126 | 51 | | return result; |
| | 126 | 52 | | } |
| | | 53 | | |
| | | 54 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | | 55 | | /// <remarks> |
| | | 56 | | /// This implementation uses a precomputed hash-map of all the positions by each char. |
| | | 57 | | /// <br/> |
| | | 58 | | /// Time Complexity = O(1). Space Complexity = O(1). |
| | | 59 | | /// </remarks> |
| | | 60 | | public override int FindIndexOfNthOccurrenceInBWT(int indexOfCharInBWT, int occurrenceRank) |
| | 16 | 61 | | { |
| | 16 | 62 | | if (indexOfCharInBWT < 0 || indexOfCharInBWT >= BWT.Length) |
| | 1 | 63 | | throw new ArgumentOutOfRangeException(nameof(indexOfCharInBWT), $"Invalid {nameof(indexOfCharInBWT)}."); |
| | | 64 | | |
| | 15 | 65 | | var charToFind = BWT[indexOfCharInBWT]; |
| | | 66 | | |
| | 15 | 67 | | return _bwtOccurrenceIndexesOfChar[charToFind][occurrenceRank]; |
| | 9 | 68 | | } |
| | | 69 | | |
| | | 70 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | | 71 | | /// <remarks> |
| | | 72 | | /// <inheritdoc cref="FindIndexOfNthOccurrenceInBWT(int, int)"/> |
| | | 73 | | /// </remarks> |
| | | 74 | | public override int FindIndexOfNthOccurrenceInSortedBWT(int indexOfCharInBWT, int occurrenceRank) |
| | 283 | 75 | | { |
| | 283 | 76 | | if (indexOfCharInBWT < 0 || indexOfCharInBWT >= BWT.Length) |
| | 1 | 77 | | throw new ArgumentOutOfRangeException(nameof(indexOfCharInBWT), $"Invalid value: {indexOfCharInBWT}."); |
| | | 78 | | |
| | 282 | 79 | | var charToFind = BWT[indexOfCharInBWT]; |
| | | 80 | | |
| | 282 | 81 | | if (occurrenceRank < 0 || occurrenceRank >= _sbwtOccurrenceIndexesOfChar[charToFind].Count) |
| | 6 | 82 | | throw new ArgumentOutOfRangeException(nameof(occurrenceRank), $"Invalid value: {occurrenceRank}."); |
| | | 83 | | |
| | 276 | 84 | | return _sbwtOccurrenceIndexesOfChar[charToFind].First() + occurrenceRank; |
| | 276 | 85 | | } |
| | | 86 | | |
| | | 87 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | | 88 | | /// <remarks> |
| | | 89 | | /// <para id="algo"> |
| | | 90 | | /// ALGORITHM |
| | | 91 | | /// <br/> |
| | | 92 | | /// This implementation uses a precomputed hash-map of all the positions by each char. |
| | | 93 | | /// <br/> |
| | | 94 | | /// However, unlike <see cref="ILastFirstFinder.SortedBWT"/>, <see cref="ILastFirstFinder.BWT"/> is not sorted, |
| | | 95 | | /// so the precomputed list storing all the indexes where the char of <see cref="ILastFirstFinder.BWT"/> at |
| | | 96 | | /// index <paramref name="indexOfCharInBWT"/> appears can be accessed in O(1) but has to be iterated over |
| | | 97 | | /// linearly. |
| | | 98 | | /// <br/> |
| | | 99 | | /// Such a list has in average n / sigma elements, where sigma is the number of distinct chars in the text. |
| | | 100 | | /// If sigma is constant, the Time Complexity is O(n). |
| | | 101 | | /// <br/> |
| | | 102 | | /// Space Complexity is always O(1), since O(n * sigma) space has already been allocated to host the result of |
| | | 103 | | /// counts and first occurrences precomputation. |
| | | 104 | | /// </para> |
| | | 105 | | /// </remarks> |
| | | 106 | | public override int FindOccurrenceRankOfCharInBWT(int indexOfCharInBWT) |
| | 276 | 107 | | { |
| | 276 | 108 | | if (indexOfCharInBWT < 0 || indexOfCharInBWT >= BWT.Length) |
| | 2 | 109 | | throw new ArgumentOutOfRangeException(nameof(indexOfCharInBWT), $"Invalid value: {indexOfCharInBWT}"); |
| | | 110 | | |
| | 274 | 111 | | var indexesOfChar = _bwtOccurrenceIndexesOfChar[BWT[indexOfCharInBWT]]; |
| | 274 | 112 | | return UnorderedListSearch.First(indexesOfChar, indexOfCharInBWT); |
| | 274 | 113 | | } |
| | | 114 | | |
| | | 115 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | | 116 | | /// <remarks> |
| | | 117 | | /// <para id="algo"> |
| | | 118 | | /// ALGORITHM |
| | | 119 | | /// <br/> |
| | | 120 | | /// This implementation uses a precomputed hash-map of all the positions by each char. |
| | | 121 | | /// <br/> |
| | | 122 | | /// It also takes advantage of the fact that <see cref="ILastFirstFinder.SortedBWT"/> is sorted, by running a |
| | | 123 | | /// Binary Search on it, which takes logarithmic time over the list of indexes for the char at position |
| | | 124 | | /// <paramref name="indexOfCharInSortedBWT"/> in <see cref="ILastFirstFinder.BWT"/>. |
| | | 125 | | /// <br/> |
| | | 126 | | /// Such list has average size = n / sigma, where n = number of chars in |
| | | 127 | | /// <see cref="ILastFirstFinder.SortedBWT"/> and sigma = size of the alphabet of |
| | | 128 | | /// <see cref="ILastFirstFinder.SortedBWT"/>. |
| | | 129 | | /// <br/> |
| | | 130 | | /// If sigma is constant, the list has a size O(n). |
| | | 131 | | /// </para> |
| | | 132 | | /// <para id="complexity"> |
| | | 133 | | /// COMPLEXITY |
| | | 134 | | /// <br/> |
| | | 135 | | /// Therefore, Time Complexity = O(log(n)) and Space Complexity = O(1). |
| | | 136 | | /// </para> |
| | | 137 | | /// </remarks> |
| | | 138 | | public override int FindOccurrenceRankOfCharInSortedBWT(int indexOfCharInSortedBWT) |
| | 9 | 139 | | { |
| | 9 | 140 | | if (indexOfCharInSortedBWT < 0 || indexOfCharInSortedBWT >= SortedBWT.Length) |
| | 2 | 141 | | throw new ArgumentOutOfRangeException( |
| | 2 | 142 | | nameof(indexOfCharInSortedBWT), $"Invalid value: {indexOfCharInSortedBWT}"); |
| | | 143 | | |
| | 7 | 144 | | var indexesOfChar = _sbwtOccurrenceIndexesOfChar[SortedBWT[indexOfCharInSortedBWT]]; |
| | 7 | 145 | | return OrderedAscListSearch.First(indexesOfChar, indexOfCharInSortedBWT); |
| | 7 | 146 | | } |
| | | 147 | | } |