| | 1 | | using MoreStructures.RecImmTrees.Paths; |
| | 2 | | using MoreStructures.RecImmTrees.Visitor; |
| | 3 | | using MoreStructures.SuffixStructures.Builders; |
| | 4 | | using MoreStructures.SuffixTrees; |
| | 5 | | using MoreStructures.SuffixTrees.Builders; |
| | 6 | | using MoreStructures.Utilities; |
| | 7 | |
|
| | 8 | | namespace MoreStructures.SuffixStructures.Matching; |
| | 9 | |
|
| | 10 | | /// <summary> |
| | 11 | | /// A <see cref="ISnssFinder"/> implementation and <see cref="SuffixStructureBasedSnssFinder"/> concretion which uses |
| | 12 | | /// a <see cref="SuffixTreeNode"/> structure to implement |
| | 13 | | /// <see cref="ISnssFinder.Find(IEnumerable{char}, IEnumerable{char})"/>. |
| | 14 | | /// </summary> |
| | 15 | | /// <remarks> |
| | 16 | | /// <para id="advantages"> |
| | 17 | | /// ADVANTAGES AND DISADVANTAGES |
| | 18 | | /// <br/> |
| | 19 | | /// - Compared to the naive implementation of <see cref="NaiveSnssFinder"/>, has both average and worst case better |
| | 20 | | /// runtime, at the cost of space used (which is O(1) for the naive implementation). |
| | 21 | | /// <br/> |
| | 22 | | /// - Compared to <see cref="SuffixTrieBasedSnssFinder"/>, it has better Time and Space Complexity, due to Branches |
| | 23 | | /// Coaleascing combined with Path Compression. It's, however, more complex to implement, visualize and debug |
| | 24 | | /// step-by-step. |
| | 25 | | /// </para> |
| | 26 | | /// <para id="algo"> |
| | 27 | | /// ALGORITHM |
| | 28 | | /// <br/> |
| | 29 | | /// - First uses the <see cref="UkkonenSuffixTreeBuilder"/> to build a suffix tree of the |
| | 30 | | /// concatenation of first text, <see cref="SuffixStructureBasedSnssFinder.Terminator1"/>, second text and |
| | 31 | | /// <see cref="SuffixStructureBasedSnssFinder.Terminator2"/>, stopping branches at the first terminator |
| | 32 | | /// encountered. This structure is also known as Generalized Suffix Tree. |
| | 33 | | /// <br/> |
| | 34 | | /// - Visit the suffix tree breadth-first, stopping at the first node such that the root-to-node prefix is |
| | 35 | | /// substring of text1 but not of text2. |
| | 36 | | /// <br/> |
| | 37 | | /// - The root-to-node prefix is a substring of text1 when there is a path-to-leaf which contains an edge including |
| | 38 | | /// <see cref="SuffixStructureBasedSnssFinder.Terminator1"/>. |
| | 39 | | /// <br/> |
| | 40 | | /// - The root-to-node prefix is NOT a substring of text2 when there is no path-to-leaf which doesn't contain |
| | 41 | | /// <see cref="SuffixStructureBasedSnssFinder.Terminator1"/>. |
| | 42 | | /// <br/> |
| | 43 | | /// - Such substring of text1 is guaranteed to be the shortest in number of edges by the visit order imposed by the |
| | 44 | | /// breadth-first search. However, unlike for Suffix Tries, in Suffix Trees the number of chars per edges varies. |
| | 45 | | /// <br/> |
| | 46 | | /// - Therefore, all pontential results have to be collected, then sorted by actual prefix length. |
| | 47 | | /// </para> |
| | 48 | | /// <para id="complexity"> |
| | 49 | | /// COMPLEXITY |
| | 50 | | /// <br/> |
| | 51 | | /// - Validating the input requires going through text1 and text2, and takes linear time in the the number of chars |
| | 52 | | /// n of the concatenated text "text1 ## separator1 ## text2 ## separator2", and constant space. |
| | 53 | | /// <br/> |
| | 54 | | /// - Building the Generalized Suffix Tree via the Ukkonen algorithm takes time and space at least proportional to |
| | 55 | | /// the number of nodes of the tree, which is linear with n. |
| | 56 | | /// <br/> |
| | 57 | | /// - For each level of the breadth-first traversal of the trie, all node-to-leaf paths are checked (in the worst |
| | 58 | | /// case). |
| | 59 | | /// <br/> |
| | 60 | | /// - There are at most n levels in the tree, since there can't be a path longer than a suffix of the concatenated |
| | 61 | | /// text. The higher is the level, the shorter are node-to-leaf paths. However, their number is always the same |
| | 62 | | /// or lower. |
| | 63 | | /// <br/> |
| | 64 | | /// - For each node there are as many node-to-leaf paths as leaves, and there are at most n leaves in the tree |
| | 65 | | /// (since each suffix can add at most a single intermediate node and a single leaf, having terminator 1 or |
| | 66 | | /// terminator2 as incoming edge). |
| | 67 | | /// <br/> |
| | 68 | | /// - Checking whether a path contains terminator1 takes constant space and a time proportional to the number of |
| | 69 | | /// nodes in the path, which is O(n). |
| | 70 | | /// <br/> |
| | 71 | | /// - The following optimization is implemented: if a path P1 starting from a node N1 identifies a prefix p1 which |
| | 72 | | /// is a potential SNSS, all paths starting from nodes Pi which are descendants of N1 would identify prefixes pi |
| | 73 | | /// which would be longer than p1, so they can be excluded. |
| | 74 | | /// <br/> |
| | 75 | | /// - Rebuilding the string from each identified path takes O(n) time and space. Sorting would take time |
| | 76 | | /// O(m * log(m) * n) where m is the number of potential SNSS (which is in average much smaller than n). |
| | 77 | | /// <br/> |
| | 78 | | /// - So in conclusion, Time Complexity is O(n^2) and Space Complexity is O(n). |
| | 79 | | /// </para> |
| | 80 | | /// </remarks> |
| | 81 | | public class SuffixTreeBasedSnssFinder : SuffixStructureBasedSnssFinder |
| | 82 | | { |
| 1 | 83 | | private static readonly IBuilder<SuffixTreeEdge, SuffixTreeNode> SuffixTreeBuilder = |
| 1 | 84 | | new UkkonenSuffixTreeBuilder(); |
| | 85 | |
|
| 1 | 86 | | private static readonly INodeToLeafPathsBuilder NodeToLeafPathsBuilder = |
| 1 | 87 | | new FullyIterativeNodeToLeafPathsBuilder(); |
| | 88 | |
|
| | 89 | | /// <inheritdoc/> |
| 18 | 90 | | public SuffixTreeBasedSnssFinder(char terminator1, char terminator2) : base(terminator1, terminator2) |
| 16 | 91 | | { |
| 16 | 92 | | } |
| | 93 | |
|
| | 94 | | /// <inheritdoc cref="SuffixStructureBasedSnssFinder" path="//*[not(self::summary or self::remarks)]"/> |
| | 95 | | /// <summary> |
| | 96 | | /// <inheritdoc cref="SuffixStructureBasedSnssFinder.Find(IEnumerable{char}, IEnumerable{char})"/> |
| | 97 | | /// <br/> |
| | 98 | | /// This implementation builds and uses a <see cref="SuffixTreeNode"/> structure to perform the search. |
| | 99 | | /// </summary> |
| | 100 | | /// <remarks> |
| | 101 | | /// <inheritdoc cref="SuffixTreeBasedSnssFinder" path="/remarks"/> |
| | 102 | | /// </remarks> |
| | 103 | | public override IEnumerable<string> Find(IEnumerable<char> text1, IEnumerable<char> text2) |
| 20 | 104 | | { |
| 20 | 105 | | if (ValidateInput) |
| 20 | 106 | | ValidateTexts(text1, text2); |
| | 107 | |
|
| | 108 | | // Build Generalized Suffix Tree |
| 14 | 109 | | var text1WithTerminator = new TextWithTerminator(text1, Terminator1); |
| 14 | 110 | | var text2WithTerminator = new TextWithTerminator(text2, Terminator2); |
| 14 | 111 | | var text1And2 = new TextWithTerminator(text1.Append(Terminator1).Concat(text2), Terminator2); |
| 14 | 112 | | var suffixTree = SuffixTreeBuilder.BuildTree(text1WithTerminator, text2WithTerminator); |
| | 113 | |
|
| | 114 | | // Breadth First Visit of the Tree |
| 14 | 115 | | var terminator1Index = text1WithTerminator.TerminatorIndex; |
| 14 | 116 | | var terminator2Index = text1And2.TerminatorIndex; |
| 14 | 117 | | var breadthFirstTraversal = new FullyIterativeBreadthFirstTraversal<SuffixTreeEdge, SuffixTreeNode>() |
| 14 | 118 | | { |
| 112 | 119 | | ChildrenSorter = visit => visit.Node.Children |
| 181 | 120 | | .Where(child => child.Key.Start != terminator1Index && child.Key.Start != terminator2Index) |
| 210 | 121 | | .OrderBy(child => child.Key.Length), |
| 14 | 122 | | TraversalOrder = TreeTraversalOrder.ParentFirst, |
| 14 | 123 | | }; |
| 14 | 124 | | var cachedVisits = new Dictionary<SuffixTreeNode, TreeTraversalVisit<SuffixTreeEdge, SuffixTreeNode>>(); |
| 14 | 125 | | var visits = breadthFirstTraversal |
| 14 | 126 | | .Visit(suffixTree) |
| 14 | 127 | | .Select(visit => |
| 112 | 128 | | { |
| 112 | 129 | | cachedVisits[visit.Node] = visit; |
| 112 | 130 | | return visit; |
| 126 | 131 | | }); |
| | 132 | |
|
| 14 | 133 | | var shortestSubstrNodes = new HashSet<SuffixTreeNode>(EqualityComparer<object>.Default); // Just compare refs |
| | 134 | |
|
| 266 | 135 | | foreach (var visit in visits) |
| 112 | 136 | | { |
| | 137 | | // If the parent of this node has already identified a SNSS => this node would make a longer string => skip |
| 112 | 138 | | if (visit.ParentNode != null && shortestSubstrNodes.Contains(visit.ParentNode)) |
| 1 | 139 | | continue; |
| | 140 | |
|
| | 141 | | // If the incoming edge contains Terminator2 => it's a leaf substring of text2 => skip |
| 111 | 142 | | if (visit.IncomingEdge != null && visit.IncomingEdge.ContainsIndex(terminator2Index)) |
| 27 | 143 | | continue; |
| | 144 | |
|
| 84 | 145 | | var pathsToLeaf = NodeToLeafPathsBuilder.GetAllNodeToLeafPaths<SuffixTreeEdge, SuffixTreeNode>(visit.Node); |
| | 146 | |
|
| | 147 | | // Any path-to-leaf contains Terminator2 => root-to-node prefix is substring of text2 => skip |
| 151 | 148 | | if (pathsToLeaf.Any(path => path.ContainsIndex(terminator2Index))) |
| 64 | 149 | | continue; |
| | 150 | |
|
| 20 | 151 | | shortestSubstrNodes.Add(visit.Node); |
| 20 | 152 | | } |
| | 153 | |
|
| | 154 | | // Collect result, iterating up to the root (take only 1st char of last edge) and find the shortest. |
| 14 | 155 | | var results = |
| 14 | 156 | | from shortestSubstrNode in shortestSubstrNodes |
| 60 | 157 | | let prefix = GetPrefix(shortestSubstrNode, text1And2, cachedVisits) |
| 60 | 158 | | orderby prefix.Length ascending |
| 63 | 159 | | select prefix; |
| | 160 | |
|
| 14 | 161 | | var (firstOrEmpty, _) = results.EnumerateAtMostFirst(1); |
| 14 | 162 | | if (!firstOrEmpty.Any()) |
| 3 | 163 | | return firstOrEmpty; |
| | 164 | |
|
| 11 | 165 | | var first = firstOrEmpty.Single(); |
| 49 | 166 | | return results.TakeWhile(s => s.Length == first.Length).Prepend(first); |
| 14 | 167 | | } |
| | 168 | |
|
| | 169 | | private static string GetPrefix( |
| | 170 | | SuffixTreeNode node, |
| | 171 | | TextWithTerminator text1And2, |
| | 172 | | Dictionary<SuffixTreeNode, TreeTraversalVisit<SuffixTreeEdge, SuffixTreeNode>> cachedVisits) |
| 60 | 173 | | { |
| 60 | 174 | | var edges = CollectPrefixChars(text1And2, node, cachedVisits); |
| 60 | 175 | | var firstCharOfLastEdge = edges.First()[0]; |
| 60 | 176 | | return string.Concat(edges.Skip(1).Reverse()) + firstCharOfLastEdge; |
| 60 | 177 | | } |
| | 178 | | } |