| | 1 | | using MoreStructures.BurrowsWheelerTransform.Builders.LastFirstFinders; |
| | 2 | | using MoreStructures.Lists.Searching; |
| | 3 | | using MoreStructures.Utilities; |
| | 4 | |
|
| | 5 | | namespace MoreStructures.BurrowsWheelerTransform.Matching; |
| | 6 | |
|
| | 7 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 8 | | /// <remarks> |
| | 9 | | /// <para id="algo"> |
| | 10 | | /// ALGORITHM |
| | 11 | | /// <br/> |
| | 12 | | /// This is a basic implementation, narrowing the matching interval at every iteration with two linear scans of the |
| | 13 | | /// <see cref="BWT"/>: |
| | 14 | | /// <br/> |
| | 15 | | /// - the first from the beginning of the current interval and up to the first char matching the current char; |
| | 16 | | /// <br/> |
| | 17 | | /// - and the second from the end of the current interval and up to the last char matching the current char. |
| | 18 | | /// </para> |
| | 19 | | /// <para id="complexity"> |
| | 20 | | /// COMPLEXITY |
| | 21 | | /// <br/> |
| | 22 | | /// - No precomputation cost is paid on instantiation, except for sorting of the <see cref="BWT"/> to build the |
| | 23 | | /// <see cref="SortedBWT"/>, which takes O(n * log(n)) time using <see cref="BWTransform.QuickSort"/>, but can |
| | 24 | | /// also run in linear time for a constant size alphabet using the Counting Sort. |
| | 25 | | /// <br/> |
| | 26 | | /// - Either way, the predominant cost is the main narrowing interval algorithm, which runs for each char in the |
| | 27 | | /// BWT (i.e. n times) two linear scans of the BWT itself (on the order of n), resulting in quadratic time |
| | 28 | | /// execution. |
| | 29 | | /// <br/> |
| | 30 | | /// - Therefore, Time Complexity is O(n^2) and Space Complexity is O(n). |
| | 31 | | /// </para> |
| | 32 | | /// </remarks> |
| | 33 | | public class NarrowingIntervalMatcher : IMatcher |
| | 34 | | { |
| | 35 | | /// <summary> |
| | 36 | | /// The <see cref="ISearch"/> implementation to be used when searching for items in lists sorted |
| | 37 | | /// in ascending order. |
| | 38 | | /// </summary> |
| 41 | 39 | | protected static ISearch OrderedAscListSearch { get; } = new BinarySearch(); |
| | 40 | |
|
| | 41 | | /// <inheritdoc/> |
| 186 | 42 | | public RotatedTextWithTerminator BWT { get; } |
| | 43 | |
|
| | 44 | | /// <inheritdoc/> |
| 101 | 45 | | public RotatedTextWithTerminator SortedBWT { get; } |
| | 46 | |
|
| | 47 | | /// <summary> |
| | 48 | | /// The implementation of <see cref="IComparer{T}"/> of <see cref="char"/> to be used to comparer chars of |
| | 49 | | /// <see cref="BWT"/> or <see cref="SortedBWT"/>. |
| | 50 | | /// </summary> |
| 40 | 51 | | protected IComparer<char> CharComparer { get; } |
| | 52 | |
|
| | 53 | | /// <summary> |
| | 54 | | /// Builds an instance of this finder, for the provided <paramref name="bwt"/> and |
| | 55 | | /// <paramref name="sbwt"/>. Because both BWT and SortedBWT are provided, no sorting happens. |
| | 56 | | /// </summary> |
| | 57 | | /// <param name="bwt"> |
| | 58 | | /// The Burrows-Wheeler Transform. Corresponds to the last column of the Burrows-Wheeler Matrix. |
| | 59 | | /// </param> |
| | 60 | | /// <param name="sbwt"> |
| | 61 | | /// The sorted version of the Burrows-Wheeler Transform. |
| | 62 | | /// </param> |
| 46 | 63 | | public NarrowingIntervalMatcher(RotatedTextWithTerminator bwt, RotatedTextWithTerminator sbwt) |
| 46 | 64 | | { |
| 46 | 65 | | if (bwt.Terminator != sbwt.Terminator || bwt.Length != sbwt.Length) |
| 4 | 66 | | throw new ArgumentException($"{nameof(bwt)} and {nameof(sbwt)} are not consistent with each other."); |
| | 67 | |
|
| 42 | 68 | | BWT = bwt; |
| 42 | 69 | | SortedBWT = sbwt; |
| 42 | 70 | | CharComparer = CharOrTerminatorComparer.Build(BWT.Terminator); |
| 42 | 71 | | } |
| | 72 | |
|
| | 73 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 74 | | /// <remarks> |
| | 75 | | /// The pattern matching is done via successive narrowing of a interval, defined by a start and an end index.<br/> |
| | 76 | | /// 1. At the beginning the interval is as big as the provided <see cref="BWTransform"/> (and its text). |
| | 77 | | /// <br/> |
| | 78 | | /// 2. The algorithm proceeds in reverse: from the last char of the pattern P, P[^1] to the first, P[0]. |
| | 79 | | /// <br/> |
| | 80 | | /// 3. Search in Sorted BWT for the range of indexes (first1, last1) having value P[^1] via a <see cref="ISearch"/> |
| | 81 | | /// implementation (because the input is sorted, binary search is possible). |
| | 82 | | /// <br/> |
| | 83 | | /// 4. The char in BWT at indexes first1 and last1 represent the predecessor of all instances of P[^1] in P. The |
| | 84 | | /// interval (first1, last1) can then be narrowed down to (first2, last2), taking into account only the chars |
| | 85 | | /// in BWT which match the predecessor of P[^1], P[^2]. |
| | 86 | | /// <br/> |
| | 87 | | /// 5. By last-first property, new indexes (first3, last3) of the chars in Sorted BWT corresponding to first2 and |
| | 88 | | /// last2 in BWT, can be found. Those are the first and last of the new narrowed range, ready for step 4. |
| | 89 | | /// <br/> |
| | 90 | | /// 6. When all chars of P, up to P[0], have been consumed, all matches have been identified as an interval in |
| | 91 | | /// Sorted BWT. |
| | 92 | | /// </remarks> |
| | 93 | | public virtual Match Match(IEnumerable<char> pattern) |
| 42 | 94 | | { |
| 42 | 95 | | if (!pattern.Any()) |
| 2 | 96 | | throw new ArgumentException("The pattern should be non-empty.", nameof(pattern)); |
| | 97 | |
|
| 40 | 98 | | var patternReversed = pattern.Reverse(); |
| 40 | 99 | | var finder = BuildLastFirstFinder(); |
| | 100 | |
|
| 40 | 101 | | var (startIndex, endIndex) = OrderedAscListSearch.Interval(SortedBWT, patternReversed.First(), CharComparer); |
| 40 | 102 | | if (startIndex < 0) |
| 4 | 103 | | return new Match(false, 0, -1, -1); |
| | 104 | |
|
| 36 | 105 | | var charsMatched = 1; |
| 222 | 106 | | foreach (var currentChar in patternReversed.Skip(1)) |
| 58 | 107 | | { |
| 58 | 108 | | (var success, startIndex, endIndex) = NarrowInterval(currentChar, finder, startIndex, endIndex); |
| | 109 | |
|
| 58 | 110 | | if (!success) |
| 2 | 111 | | return new Match(false, charsMatched, startIndex, endIndex); |
| | 112 | |
|
| 56 | 113 | | charsMatched++; |
| 56 | 114 | | } |
| | 115 | |
|
| 34 | 116 | | return new Match(true, charsMatched, startIndex, endIndex); |
| 40 | 117 | | } |
| | 118 | |
|
| | 119 | | /// <summary> |
| | 120 | | /// Builds the <see cref="ILastFirstFinder"/> instance which is then (potentially) used by all iterations of the |
| | 121 | | /// matching algorithm over the pattern to match against <see cref="IMatcher.BWT"/> and |
| | 122 | | /// <see cref="IMatcher.SortedBWT"/>. |
| | 123 | | /// </summary> |
| | 124 | | /// <returns>An instance of <see cref="ILastFirstFinder"/>.</returns> |
| | 125 | | /// <remarks> |
| | 126 | | /// The <see cref="ILastFirstFinder"/> implementation used is <see cref="PrecomputedFinder"/>. |
| | 127 | | /// </remarks> |
| 20 | 128 | | protected virtual ILastFirstFinder BuildLastFirstFinder() => new PrecomputedFinder(BWT, SortedBWT); |
| | 129 | |
|
| | 130 | | /// <summary> |
| | 131 | | /// Narrows the provided (<paramref name="startIndex"/>, <paramref name="endIndex"/>) interval, (possibly) using |
| | 132 | | /// the provided <see cref="ILastFirstFinder"/> <paramref name="finder"/> for last-first matching. |
| | 133 | | /// </summary> |
| | 134 | | /// <param name="currentChar">The char currently being processed.</param> |
| | 135 | | /// <param name="finder"> |
| | 136 | | /// The finder used to perform last-first matching, if needed. When pattern matching a single instance is shared |
| | 137 | | /// across all iterations over the pattern. |
| | 138 | | /// </param> |
| | 139 | | /// <param name="startIndex">The lower extreme of the interval to be narrowed.</param> |
| | 140 | | /// <param name="endIndex">The higher extreme of the interval to be narrowed.</param> |
| | 141 | | /// <returns> |
| | 142 | | /// An interval narrower than the one provided as input, or (-1, -1), if narrowing resulted into an empty set |
| | 143 | | /// (i.e. overall matching has failed). |
| | 144 | | /// </returns> |
| | 145 | | /// <remarks> |
| | 146 | | /// Narrowing is performed in five sub-steps: |
| | 147 | | /// <br/> |
| | 148 | | /// 1. a linear scan in BWT from <paramref name="startIndex"/> downwards is done, to identify the narrowed start; |
| | 149 | | /// <br/> |
| | 150 | | /// 2. a linear scan in BWT from <paramref name="endIndex"/> upwards is done, to identify the narrowed end; |
| | 151 | | /// <br/> |
| | 152 | | /// 3. a last-first of the narrowed start is done, to find the corresponding narrowed start in the Sorted BWT; |
| | 153 | | /// <br/> |
| | 154 | | /// 4. a last-first of the narrowed end is done, to find the corresponding narrowed end in the Sorted BWT; |
| | 155 | | /// <br/> |
| | 156 | | /// 5. the narrowed interval in Sorted BWT is returned. |
| | 157 | | /// </remarks> |
| | 158 | | protected virtual (bool success, int narrowedStartIndex, int narrowedEndIndex) NarrowInterval( |
| | 159 | | char currentChar, ILastFirstFinder finder, int startIndex, int endIndex) |
| 29 | 160 | | { |
| 29 | 161 | | var startIndexNarrowed = Enumerable |
| 29 | 162 | | .Range(startIndex, endIndex - startIndex + 1) |
| 79 | 163 | | .FirstOrDefault(i => BWT[i] == currentChar, -1); |
| | 164 | |
|
| 29 | 165 | | if (startIndexNarrowed == -1) |
| 1 | 166 | | return (false, startIndex, endIndex); |
| | 167 | |
|
| 28 | 168 | | var endIndexNarrowed = Enumerable |
| 28 | 169 | | .Range(startIndex, endIndex - startIndex + 1) |
| 28 | 170 | | .Reverse() |
| 61 | 171 | | .First(i => BWT[i] == currentChar); |
| | 172 | |
|
| 28 | 173 | | (startIndex, _) = finder.LastToFirst(startIndexNarrowed); |
| 28 | 174 | | (endIndex, _) = finder.LastToFirst(endIndexNarrowed); |
| | 175 | |
|
| 28 | 176 | | return (true, startIndex, endIndex); |
| 29 | 177 | | } |
| | 178 | | } |