| | 1 | | using MoreStructures.BurrowsWheelerTransform.Builders.LastFirstFinders; |
| | 2 | | using MoreStructures.Lists.Counting; |
| | 3 | | using MoreStructures.Lists.Searching; |
| | 4 | |
|
| | 5 | | namespace MoreStructures.BurrowsWheelerTransform.Matching; |
| | 6 | |
|
| | 7 | | /// <summary> |
| | 8 | | /// A <see cref="NarrowingIntervalMatcher"/> refinement which precalculate the count of occurrences of each item at |
| | 9 | | /// index i in BWT[0..(i - 1)], and the index of first occurrence of each char in SortedBWT, and later uses them |
| | 10 | | /// to perform in constant time interval narrowing operations within the top-level loop of chars to match. |
| | 11 | | /// </summary> |
| | 12 | | /// <remarks> |
| | 13 | | /// Precalculating counts requires iterating over all the chars of the BWT and populating a table of n rows and sigma |
| | 14 | | /// columns. |
| | 15 | | /// <br/> |
| | 16 | | /// Precalculating first occurrences also requires iterating over the BWT, and storing a dictionary of n items. |
| | 17 | | /// <br/> |
| | 18 | | /// Therefore the cost paid upfront is O(n) in time and O(n * sigma) in space. |
| | 19 | | /// </remarks> |
| | 20 | | public class CountBasedNarrowingIntervalMatcher : NarrowingIntervalMatcher |
| | 21 | | { |
| | 22 | | /// <summary> |
| | 23 | | /// The <see cref="ISearch"/> implementation to be used when looking for the 1st occurrence of each of the items |
| | 24 | | /// of an enumerable. |
| | 25 | | /// </summary> |
| 22 | 26 | | protected static ISearch LinearSearch { get; } = new LinearSearch(); |
| | 27 | |
|
| | 28 | | /// <summary> |
| | 29 | | /// The <see cref="IOccurrencesCounter"/> implementation to be used when counting the number of occurrences of |
| | 30 | | /// each of the items of an enumerable. |
| | 31 | | /// </summary> |
| 22 | 32 | | protected static IOccurrencesCounter OccurrencesCounter { get; } = new DictionaryBasedOccurrencesCounter(); |
| | 33 | |
|
| | 34 | | private readonly IDictionary<char, int> _sbwtFirstOccurrences; |
| | 35 | | private readonly IDictionary<char, IDictionary<int, int>> _bwtCounts; |
| | 36 | |
|
| | 37 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 38 | | /// <remarks> |
| | 39 | | /// This specific implementation also precalculates two dictionaries: the counts of <see cref="IMatcher.BWT"/> |
| | 40 | | /// and the first occurrence of each of the chars of <see cref="IMatcher.SortedBWT"/>. These two data structures |
| | 41 | | /// makes single char matching a linear operation. |
| | 42 | | /// </remarks> |
| | 43 | | public CountBasedNarrowingIntervalMatcher(RotatedTextWithTerminator bwt, RotatedTextWithTerminator sbwt) |
| 23 | 44 | | : base(bwt, sbwt) |
| 21 | 45 | | { |
| 21 | 46 | | _bwtCounts = OccurrencesCounter.Count(BWT); |
| 21 | 47 | | _sbwtFirstOccurrences = LinearSearch.FirstAll(SortedBWT); |
| 21 | 48 | | } |
| | 49 | |
|
| | 50 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 51 | | /// <remarks> |
| | 52 | | /// Unlike <see cref="NarrowingIntervalMatcher"/>, this implementation of <see cref="IMatcher"/> doesn't make |
| | 53 | | /// explicit calls to <see cref="ILastFirstFinder.LastToFirst(int)"/>. Instead it solely uses its precomputed |
| | 54 | | /// structures and uses the last-first property implicitely when narrowing the current interval via such strucutes |
| | 55 | | /// in <see cref="NarrowInterval(char, ILastFirstFinder, int, int)"/>. |
| | 56 | | /// <br/> |
| | 57 | | /// Because of that, it doesn't need an optimized <see cref="ILastFirstFinder"/>, and in particular one which does |
| | 58 | | /// precomputation (such as the <see cref="PrecomputedFinder"/> used by <see cref="NarrowingIntervalMatcher"/>), |
| | 59 | | /// and can just instantiate a <see cref="NaiveFinder"/> instead. |
| | 60 | | /// </remarks> |
| 20 | 61 | | protected override ILastFirstFinder BuildLastFirstFinder() => new NaiveFinder(BWT, SortedBWT); |
| | 62 | |
|
| | 63 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 64 | | /// <remarks> |
| | 65 | | /// <para id="algo"> |
| | 66 | | /// ALGORITHM |
| | 67 | | /// <br/> |
| | 68 | | /// Narrowing is performed in three sub-steps (compared to the five in <see cref="NarrowingIntervalMatcher"/>): |
| | 69 | | /// <br/> |
| | 70 | | /// 1. The new start index is calculated as the 1st occurrence in <see cref="IMatcher.SortedBWT"/> of the |
| | 71 | | /// current char + the count of such char in <see cref="IMatcher.BWT"/> up to the current start index excluded |
| | 72 | | /// (i.e. the number of occurrences of the char up to the index before the current start index). |
| | 73 | | /// <br/> |
| | 74 | | /// 2. The new end index is calculated as the 1st occurrence in <see cref="IMatcher.SortedBWT"/> of the current |
| | 75 | | /// char + the count of such char in <see cref="IMatcher.BWT"/> up to the current end index included, short of |
| | 76 | | /// one (i.e. the number of occurrences of the char up to the current end index - 1). |
| | 77 | | /// <br/> |
| | 78 | | /// 3. The narrowed interval in Sorted BWT is returned. |
| | 79 | | /// </para> |
| | 80 | | /// <para id="complexity"> |
| | 81 | | /// COMPLEXITY |
| | 82 | | /// <br/> |
| | 83 | | /// Total amortized cost is O(1), both in time and space. |
| | 84 | | /// </para> |
| | 85 | | /// </remarks> |
| | 86 | | protected override (bool success, int narrowedStartIndex, int narrowedEndIndex) NarrowInterval( |
| | 87 | | char currentChar, ILastFirstFinder finder, int startIndex, int endIndex) |
| 29 | 88 | | { |
| 29 | 89 | | if (!_sbwtFirstOccurrences.TryGetValue(currentChar, out var currentChar1stOccurrence)) |
| 1 | 90 | | return (false, startIndex, endIndex); |
| 28 | 91 | | var currentCharCounts = _bwtCounts[currentChar]; |
| 28 | 92 | | var narrowedStartIndex = currentChar1stOccurrence + (startIndex >= 1 ? currentCharCounts[startIndex - 1] : 0); |
| 28 | 93 | | var narrowedEndIndex = currentChar1stOccurrence + currentCharCounts[endIndex] - 1; |
| 28 | 94 | | return (true, narrowedStartIndex, narrowedEndIndex); |
| 29 | 95 | | } |
| | 96 | | } |