| | 1 | | using MoreStructures.BurrowsWheelerTransform.Matching.Comparers; |
| | 2 | | using MoreStructures.Lists.Searching; |
| | 3 | | using MoreStructures.SuffixArrays; |
| | 4 | | using MoreStructures.SuffixArrays.Builders; |
| | 5 | |
|
| | 6 | | namespace MoreStructures.BurrowsWheelerTransform.Matching; |
| | 7 | |
|
| | 8 | | /// <summary> |
| | 9 | | /// A <see cref="IMatcher"/> implementation which uses a <see cref="SuffixArrays.SuffixArray"/> to perform text pattern |
| | 10 | | /// matching. |
| | 11 | | /// </summary> |
| | 12 | | /// <remarks> |
| | 13 | | /// <para id="info"> |
| | 14 | | /// <see cref="Text"/> and its <see cref="SuffixArrays.SuffixArray"/> has to be provided as an additional input. |
| | 15 | | /// <br/> |
| | 16 | | /// If not already available, it can be built via a |
| | 17 | | /// <see cref="SuffixStructureBasedSuffixArrayBuilder{TEdge, TNode}"/>, if a Suffix Structure of the provided text |
| | 18 | | /// is already available (such as a Suffix Tree or Trie), and via <see cref="NaiveSuffixArrayBuilder"/> otherwise. |
| | 19 | | /// </para> |
| | 20 | | /// <para id="algo"> |
| | 21 | | /// The algorithm performs two binary searches over the <see cref="SortedBWT"/> in sequence. |
| | 22 | | /// <br/> |
| | 23 | | /// - The first binary search looks for the first index i of the <see cref="SortedBWT"/> such that the provided |
| | 24 | | /// pattern matches the i-th element of the <see cref="SuffixArrays.SuffixArray"/> (meaning that the pattern is |
| | 25 | | /// fully contained in the i-th suffix of <see cref="Text"/> in <see cref="SuffixArray"/>). |
| | 26 | | /// <br/> |
| | 27 | | /// - If i is not found, a failing <see cref="Matching.Match"/> is returned. Otherwise, a second binary search is |
| | 28 | | /// performed. |
| | 29 | | /// <br/> |
| | 30 | | /// - The second binary search looks for the last index j of the <see cref="SortedBWT"/> respecting the same |
| | 31 | | /// condition as the first binary search. This search starts from the index found in the first search (included). |
| | 32 | | /// <br/> |
| | 33 | | /// - The second search is guaranteed to succeed (as in the worst case the last occurrence corresponds to the first |
| | 34 | | /// one, found in the first search. |
| | 35 | | /// </para> |
| | 36 | | /// <para id="complexity"> |
| | 37 | | /// - The array of indexes is as long as the length n of <see cref="SortedBWT"/>, which is also the length of the |
| | 38 | | /// text and the <see cref="SuffixArrays.SuffixArray"/>. |
| | 39 | | /// <br/> |
| | 40 | | /// - Each binary search does a number of comparisons which is logarithmic over n. |
| | 41 | | /// <br/> |
| | 42 | | /// - Each comparison is of at most n chars. |
| | 43 | | /// <br/> |
| | 44 | | /// - So Time Complexity is O(n * log(n)) and Space Complexity is O(n). |
| | 45 | | /// </para> |
| | 46 | | /// </remarks> |
| | 47 | | public class SuffixArrayBasedMatcher : IMatcher |
| | 48 | | { |
| | 49 | | /// <summary> |
| | 50 | | /// The <see cref="ISearch"/> implementation to be used when searching for items in lists sorted in ascending |
| | 51 | | /// order. |
| | 52 | | /// </summary> |
| 55 | 53 | | protected static ISearch OrderedAscListSearch { get; } = new BinarySearch(); |
| | 54 | |
|
| | 55 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 56 | | /// <remarks> |
| | 57 | | /// Unlike <see cref="SortedBWT"/>, <see cref="BWT"/> is not required to perform Pattern Matching against |
| | 58 | | /// <see cref="Text"/>, and is not supported. |
| | 59 | | /// </remarks> |
| | 60 | | public RotatedTextWithTerminator BWT => |
| 1 | 61 | | throw new NotSupportedException($"{nameof(BWT)} is not defined for {nameof(SuffixArrayBasedMatcher)}"); |
| | 62 | |
|
| | 63 | | /// <inheritdoc/> |
| 31 | 64 | | public RotatedTextWithTerminator SortedBWT { get; } |
| | 65 | |
|
| | 66 | | /// <summary> |
| | 67 | | /// The <see cref="TextWithTerminator"/>, to do pattern matching against. |
| | 68 | | /// </summary> |
| | 69 | | /// <remarks> |
| | 70 | | /// Requires to get the actual suffix from the i-th element of <see cref="SuffixArray"/>, to be compared against |
| | 71 | | /// the pattern. |
| | 72 | | /// </remarks> |
| 54 | 73 | | public TextWithTerminator Text { get; } |
| | 74 | |
|
| | 75 | | /// <summary> |
| | 76 | | /// The <see cref="SuffixArrays.SuffixArray"/> of <see cref="Text"/>. |
| | 77 | | /// </summary> |
| 54 | 78 | | public SuffixArray SuffixArray { get; } |
| | 79 | |
|
| | 80 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 81 | | /// <remarks> |
| | 82 | | /// <inheritdoc cref="SuffixArrayBasedMatcher" path="/remarks"/> |
| | 83 | | /// </remarks> |
| 26 | 84 | | public SuffixArrayBasedMatcher(RotatedTextWithTerminator sbwt, TextWithTerminator text, SuffixArray suffixArray) |
| 26 | 85 | | { |
| 26 | 86 | | SortedBWT = sbwt; |
| 26 | 87 | | Text = text; |
| 26 | 88 | | SuffixArray = suffixArray; |
| 26 | 89 | | } |
| | 90 | |
|
| | 91 | | /// <inheritdoc path="//*[not(self::summary or self::remarks)]"/> |
| | 92 | | /// <summary> |
| | 93 | | /// Tries to match the provided <paramref name="pattern"/> against <see cref="Text"/>, via the |
| | 94 | | /// <see cref="SortedBWT"/> and the <see cref="SuffixArray"/> of <see cref="Text"/>. |
| | 95 | | /// </summary> |
| | 96 | | /// <remarks> |
| | 97 | | /// <inheritdoc cref="SuffixArrayBasedMatcher" path="/remarks"/> |
| | 98 | | /// </remarks> |
| | 99 | | public Match Match(IEnumerable<char> pattern) |
| 31 | 100 | | { |
| 31 | 101 | | var indexes = Enumerable.Range(0, SortedBWT.Length).ToList(); |
| 31 | 102 | | var againstPatternComparerStart = BuildComparer(pattern); |
| 31 | 103 | | var startIndex = OrderedAscListSearch.First(indexes, -1, againstPatternComparerStart); |
| 31 | 104 | | var longestMatchIndex1 = againstPatternComparerStart.LongestMatchFirstValue; |
| | 105 | |
|
| 31 | 106 | | if (startIndex >= 0) |
| 23 | 107 | | { |
| 23 | 108 | | var againstPatternComparerEnd = new SuffixArrayAgainstPatternComparer(Text, SuffixArray, pattern); |
| 23 | 109 | | var endIndex = OrderedAscListSearch.Last(indexes, -1, againstPatternComparerEnd, startIndex); |
| | 110 | |
|
| 23 | 111 | | return new Match(true, againstPatternComparerStart.LongestMatch, startIndex, endIndex); |
| | 112 | | } |
| | 113 | |
|
| 8 | 114 | | return new Match(false, againstPatternComparerStart.LongestMatch, longestMatchIndex1, longestMatchIndex1); |
| 31 | 115 | | } |
| | 116 | |
|
| | 117 | | /// <summary> |
| | 118 | | /// Builds a <see cref="SuffixArrayAgainstPatternComparer"/> instance, or a derivation of it, to be used in |
| | 119 | | /// <see cref="Match(IEnumerable{char})"/>, to find the start and end indexes in <see cref="SortedBWT"/>, |
| | 120 | | /// corresponding to first and last matches of the pattern in <see cref="Text"/>. |
| | 121 | | /// </summary> |
| | 122 | | /// <param name="pattern"><inheritdoc cref="Match(IEnumerable{char})" path="/param[@name='pattern']"/></param> |
| | 123 | | /// <returns>An instance of <see cref="SuffixArrayAgainstPatternComparer"/> instance, or a derivation of it.</return |
| | 124 | | protected virtual SuffixArrayAgainstPatternComparer BuildComparer(IEnumerable<char> pattern) => |
| 31 | 125 | | new(Text, SuffixArray, pattern); |
| | 126 | | } |