| | 1 | | using MoreStructures.RecImmTrees; |
| | 2 | | using MoreStructures.RecImmTrees.Paths; |
| | 3 | | using MoreStructures.RecImmTrees.Visitor; |
| | 4 | | using MoreStructures.SuffixStructures.Builders; |
| | 5 | | using MoreStructures.SuffixTries; |
| | 6 | | using MoreStructures.SuffixTries.Builders; |
| | 7 | | using MoreStructures.Utilities; |
| | 8 | |
|
| | 9 | | namespace MoreStructures.SuffixStructures.Matching; |
| | 10 | |
|
| | 11 | | /// <summary> |
| | 12 | | /// A <see cref="ISnssFinder"/> implementation and <see cref="SuffixStructureBasedSnssFinder"/> concretion which uses |
| | 13 | | /// a <see cref="SuffixTrieNode"/> structure to implement |
| | 14 | | /// <see cref="ISnssFinder.Find(IEnumerable{char}, IEnumerable{char})"/>. |
| | 15 | | /// </summary> |
| | 16 | | /// <remarks> |
| | 17 | | /// <para id="advantages"> |
| | 18 | | /// ADVANTAGES AND DISADVANTAGES |
| | 19 | | /// <br/> |
| | 20 | | /// - Compared to the naive implementation of <see cref="NaiveSnssFinder"/>, has better aberage runtime, at the |
| | 21 | | /// cost of space used (which is O(1) for the naive implementation and at least quadratic here). |
| | 22 | | /// <br/> |
| | 23 | | /// - Compared to the SuffixTree-based implementation, it has way worse Time and Space Complexity, but it's easier |
| | 24 | | /// to implement, visualize and debug step-by-step. |
| | 25 | | /// </para> |
| | 26 | | /// <para id="algo"> |
| | 27 | | /// ALGORITHM |
| | 28 | | /// <br/> |
| | 29 | | /// - First uses the <see cref="NaivePartiallyRecursiveSuffixTrieBuilder"/> to build a suffix trie of the |
| | 30 | | /// concatenation of first text, <see cref="SuffixStructureBasedSnssFinder.Terminator1"/>, second text and |
| | 31 | | /// <see cref="SuffixStructureBasedSnssFinder.Terminator2"/>, a.k.a. generalized suffix trie. |
| | 32 | | /// <br/> |
| | 33 | | /// - Visit the suffix trie breadth-first, stopping at the first node such that the root-to-node prefix is |
| | 34 | | /// substring of text1 but not of text2. |
| | 35 | | /// <br/> |
| | 36 | | /// - The root-to-node prefix is a substring of text1 when there is a path-to-leaf which contains an edge including |
| | 37 | | /// <see cref="SuffixStructureBasedSnssFinder.Terminator1"/>. |
| | 38 | | /// <br/> |
| | 39 | | /// - The root-to-node prefix is NOT a substring of text2 when there is no path-to-leaf which doesn't contain |
| | 40 | | /// <see cref="SuffixStructureBasedSnssFinder.Terminator1"/>. |
| | 41 | | /// <br/> |
| | 42 | | /// - Such substring of text1 is guaranteed to be the shortest by the visit order imposed by the breadth-first |
| | 43 | | /// search. This is true because trie edges are labelled by a single chars, so root-to-node paths which are |
| | 44 | | /// longer by the number of edges correspond to longer prefixes. This is not true in general for Suffix Trees, |
| | 45 | | /// where edge labels can vary in length, and a path which is composed of less edges may be longer in chars than |
| | 46 | | /// a path with more edges but shorter labels in average. |
| | 47 | | /// </para> |
| | 48 | | /// <para id="complexity"> |
| | 49 | | /// COMPLEXITY |
| | 50 | | /// <br/> |
| | 51 | | /// - Validating the input requires going through text1 and text2, and takes linear time in the the number of chars |
| | 52 | | /// n of the concatenated text "text1 ## separator1 ## text2 ## separator2", and constant space. |
| | 53 | | /// <br/> |
| | 54 | | /// - Building the Generalized Suffix Trie takes time and space at least proportional to the number of nodes of the |
| | 55 | | /// trie, which is quadratic with n (unlike for Suffix Trees). |
| | 56 | | /// <br/> |
| | 57 | | /// - For each level of the breadth-first traversal of the trie, all node-to-leaf paths are checked (in the worst |
| | 58 | | /// case). |
| | 59 | | /// <br/> |
| | 60 | | /// - There are at most n levels in the trie, since there can't be a path longer than a suffix of the concatenated |
| | 61 | | /// text, and all suffixes should be covered by the trie. The higher is the level, the shorter are node-to-leaf |
| | 62 | | /// paths. However, their number is always the same or lower. |
| | 63 | | /// <br/> |
| | 64 | | /// - For each node there are as many node-to-leaf paths as leaves, and there are at most n leaves in the trie |
| | 65 | | /// (since each suffix can add potentially multiple intermediate nodes, but always a single leaf, having |
| | 66 | | /// terminator2 as incoming edge). |
| | 67 | | /// <br/> |
| | 68 | | /// - Checking whether a path contains terminator1 takes constant space and a time proportional to the number of |
| | 69 | | /// nodes in the path, which is O(n). |
| | 70 | | /// <br/> |
| | 71 | | /// - Rebuilding the string from the identified path takes O(n) time and space. |
| | 72 | | /// <br/> |
| | 73 | | /// - So in conclusion, Time Complexity is O(n^3) and Space Complexity is O(n^2). |
| | 74 | | /// </para> |
| | 75 | | /// </remarks> |
| | 76 | | public class SuffixTrieBasedSnssFinder : SuffixStructureBasedSnssFinder |
| | 77 | | { |
| 1 | 78 | | private static readonly IBuilder<SuffixTrieEdge, SuffixTrieNode> SuffixTrieBuilder = |
| 1 | 79 | | new NaivePartiallyRecursiveSuffixTrieBuilder(); |
| | 80 | |
|
| 1 | 81 | | private static readonly INodeToLeafPathsBuilder NodeToLeafPathsBuilder = |
| 1 | 82 | | new FullyIterativeNodeToLeafPathsBuilder(); |
| | 83 | |
|
| | 84 | | /// <inheritdoc/> |
| 18 | 85 | | public SuffixTrieBasedSnssFinder(char terminator1, char terminator2) : base(terminator1, terminator2) |
| 16 | 86 | | { |
| 16 | 87 | | } |
| | 88 | |
|
| | 89 | | /// <inheritdoc cref="SuffixStructureBasedSnssFinder" path="//*[not(self::summary or self::remarks)]"/> |
| | 90 | | /// <summary> |
| | 91 | | /// <inheritdoc cref="SuffixStructureBasedSnssFinder.Find(IEnumerable{char}, IEnumerable{char})"/> |
| | 92 | | /// <br/> |
| | 93 | | /// This implementation builds and uses a <see cref="SuffixTrieNode"/> structure to perform the search. |
| | 94 | | /// </summary> |
| | 95 | | /// <remarks> |
| | 96 | | /// <inheritdoc cref="SuffixTrieBasedSnssFinder" path="/remarks"/> |
| | 97 | | /// </remarks> |
| | 98 | | public override IEnumerable<string> Find(IEnumerable<char> text1, IEnumerable<char> text2) |
| 20 | 99 | | { |
| 20 | 100 | | if (ValidateInput) |
| 20 | 101 | | ValidateTexts(text1, text2); |
| | 102 | |
|
| | 103 | | // Build Generalized Suffix Trie |
| 14 | 104 | | var text1WithTerminator = new TextWithTerminator(text1, Terminator1); |
| 14 | 105 | | var text2WithTerminator = new TextWithTerminator(text2, Terminator2); |
| 14 | 106 | | var text1And2 = new TextWithTerminator(text1.Append(Terminator1).Concat(text2), Terminator2); |
| 14 | 107 | | var suffixTrieRoot = SuffixTrieBuilder.BuildTree(text1WithTerminator, text2WithTerminator); |
| | 108 | |
|
| | 109 | | // Breadth First Visit of the Trie |
| 14 | 110 | | var terminator1Index = text1WithTerminator.TerminatorIndex; |
| 14 | 111 | | var terminator2Index = text1And2.TerminatorIndex; |
| 14 | 112 | | var breadthFirstTraversal = new FullyIterativeBreadthFirstTraversal<SuffixTrieEdge, SuffixTrieNode>() |
| 14 | 113 | | { |
| 438 | 114 | | ChildrenSorter = visit => visit.Node.Children.Where( |
| 1174 | 115 | | child => child.Key.Index != terminator1Index && child.Key.Index != terminator2Index), |
| 14 | 116 | | TraversalOrder = TreeTraversalOrder.ParentFirst, |
| 14 | 117 | | }; |
| 14 | 118 | | var cachedVisits = new Dictionary<SuffixTrieNode, TreeTraversalVisit<SuffixTrieEdge, SuffixTrieNode>> { }; |
| 14 | 119 | | var visits = breadthFirstTraversal |
| 14 | 120 | | .Visit(suffixTrieRoot) |
| 14 | 121 | | .Select(visit => |
| 290 | 122 | | { |
| 290 | 123 | | cachedVisits[visit.Node] = visit; |
| 290 | 124 | | return visit; |
| 304 | 125 | | }); |
| | 126 | |
|
| 14 | 127 | | var results = |
| 14 | 128 | | from visit in visits |
| 290 | 129 | | let pathsToLeaf = NodeToLeafPathsBuilder.GetAllNodeToLeafPaths<SuffixTrieEdge, SuffixTrieNode>(visit.Node) |
| 14 | 130 | | // All path-to-leaf contain Terminator1 => root-to-node prefix is not substring of text2 |
| 787 | 131 | | where pathsToLeaf.All(path => path.ContainsIndex(terminator1Index)) |
| 53 | 132 | | let prefix = string.Concat(CollectPrefixChars(text1And2, visit.Node, cachedVisits).Reverse()) |
| 67 | 133 | | select prefix; |
| | 134 | |
|
| 14 | 135 | | var (firstOrEmpty, _) = results.EnumerateAtMostFirst(1); |
| 14 | 136 | | if (!firstOrEmpty.Any()) |
| 3 | 137 | | return firstOrEmpty; |
| | 138 | |
|
| 11 | 139 | | var first = firstOrEmpty.Single(); |
| 53 | 140 | | return results.TakeWhile(s => s.Length == first.Length).Prepend(first); |
| 14 | 141 | | } |
| | 142 | | } |