| | 1 | | using MoreStructures.SuffixStructures.Builders; |
| | 2 | | using static MoreStructures.Utilities.StringUtilities; |
| | 3 | | using static MoreStructures.MutableTrees.MutableTree; |
| | 4 | |
|
| | 5 | | namespace MoreStructures.SuffixTrees.Builders; |
| | 6 | |
|
| | 7 | | /// <summary> |
| | 8 | | /// Builds objects, such as edges and nodes, for <see cref="SuffixTreeNode"/> structures. |
| | 9 | | /// </summary> |
| | 10 | | /// <remarks> |
| | 11 | | /// <para id="advantages"> |
| | 12 | | /// ADVANTAGES AND DISADVANTAGES |
| | 13 | | /// <br/> |
| | 14 | | /// - Implemented as an iteration of recursive visit of the tree being built, with as many iterations as the number |
| | 15 | | /// of suffixes of the input (where the longest suffix is the text itself) and one level of recursion per char of |
| | 16 | | /// each suffix. |
| | 17 | | /// <br/> |
| | 18 | | /// - Limited by call stack depth and usable with input text of a "reasonable" length (i.e. string having a length |
| | 19 | | /// < ~1K chars). |
| | 20 | | /// </para> |
| | 21 | | /// <para id="algorithm"> |
| | 22 | | /// ALGORITHM |
| | 23 | | /// <br/> |
| | 24 | | /// - For each suffix S of the input T, start from the root node N of the tree, initialized as a mutable tree made |
| | 25 | | /// of a simple leaf. |
| | 26 | | /// <br/> |
| | 27 | | /// - Compare the first char of S with the first char of each of the edges coming from N. |
| | 28 | | /// <br/> |
| | 29 | | /// - If there is no edge e such that <c>label(e)[0] == S[0]</c>, it means that there is no descendant of N sharing |
| | 30 | | /// a path with S. So, create a new leaf under N, attached by an edge with a label equal to S. |
| | 31 | | /// <br/> |
| | 32 | | /// - Otherwise, it means that there is a path to be shared between the new leaf and at least one of the |
| | 33 | | /// descendants of N. |
| | 34 | | /// <br/> |
| | 35 | | /// - In this case, compare the current suffix and the label of such edge e, for the LCP. |
| | 36 | | /// <br/> |
| | 37 | | /// - If the prefix in common is shorter than the length of the label of the edge e, create an intermediate node, |
| | 38 | | /// push down the child pointed by the edge in the current node and add a new node for the reminder of the suffix |
| | 39 | | /// (up to next terminator) as second child of the intermediate. |
| | 40 | | /// <br/> |
| | 41 | | /// - Otherwise, eat prefixLength chars from the edge, move to the child pointed by the edge entirely matching the |
| | 42 | | /// beginning of the current suffix and repeat the same operation. |
| | 43 | | /// <br/> |
| | 44 | | /// - Finally, after all iterations have been executed and all suffixes of the text have been included in the |
| | 45 | | /// mutable tree, build the final <see cref="SuffixTreeNode"/> structure from the mutable tree, using a |
| | 46 | | /// <see cref="MutableTrees.Conversions.FullyIterativeConversion"/>. |
| | 47 | | /// </para> |
| | 48 | | /// <para id="complexity"> |
| | 49 | | /// COMPLEXITY |
| | 50 | | /// <br/> |
| | 51 | | /// - The generation of the full text, done by |
| | 52 | | /// <see cref="TextWithTerminatorExtensions.GenerateFullText(MoreStructures.TextWithTerminator[])"/>, is a O(n) |
| | 53 | | /// operation, where n is the sum of the length of each <see cref="TextWithTerminator"/> in the input |
| | 54 | | /// (terminators included). |
| | 55 | | /// <br/> |
| | 56 | | /// - The initial tree, just a root-leaf, is created in constant time. |
| | 57 | | /// <br/> |
| | 58 | | /// - The top level loop is executed n times, once per char of the full text. |
| | 59 | | /// <br/> |
| | 60 | | /// - Finding the edge with the same first char is O(avgEdges), where avgEdges is the average number of edges |
| | 61 | | /// coming out of a node of the tree. This operation is done for all cases (leaf creation alone or leaf + |
| | 62 | | /// intermediate node). |
| | 63 | | /// <br/> |
| | 64 | | /// - Over all the n nodes of the tree, taking into account the worst case where avgEdges is O(n), Time Complexity |
| | 65 | | /// would become O(n^2). However, the total number of edges in a tree is O(n), so the overall cost of these |
| | 66 | | /// operations for the entire tree is O(n), and not O(n^2). |
| | 67 | | /// <br/> |
| | 68 | | /// - Finding the LCP between the suffix and the label of the current edge are O(n) operations, since the length |
| | 69 | | /// of a generic suffix of the text is O(n) and they require iterating over all the chars. |
| | 70 | | /// <br/> |
| | 71 | | /// - Such operations are only done when an intermediate node is required. However, in the worst case there are as |
| | 72 | | /// many intermediate as leaves in the tree (i.e. each iteration adds an intermediate node and a leaf). |
| | 73 | | /// Therefore, the number of intermediate nodes is O(n), and the two O(n) operations above are to be repeated |
| | 74 | | /// O(n) times. |
| | 75 | | /// <br/> |
| | 76 | | /// - Final step is building the final tree from the mutable tree, which requires the traversal of the O(n) nodes |
| | 77 | | /// of the tree. <see cref="MutableTrees.Conversions.FullyIterativeConversion"/> does that in O(n) time and |
| | 78 | | /// space, via a stack and a precomputed data structure to find the number of terminators per edge in O(1) time. |
| | 79 | | /// <br/> |
| | 80 | | /// - In conclusion, Time Complexity = O(n^2) and Space Complexity = O(n) where n = length of the text to match. |
| | 81 | | /// <br/> |
| | 82 | | /// - Compared to tries, trees are more compact due to edge coalescing and edge label compression (i.e. edge |
| | 83 | | /// strings stored as pair (start, length), rather than as a substring of length chars). Each recursion add a |
| | 84 | | /// leaf and at most one intermediate node, so Space Complexity ~ 2 * n = O(n). |
| | 85 | | /// </para> |
| | 86 | | /// </remarks> |
| | 87 | | public class NaivePartiallyRecursiveSuffixTreeBuilder |
| | 88 | | : IBuilder<SuffixTreeEdge, SuffixTreeNode> |
| | 89 | | { |
| 1 | 90 | | private static readonly MutableTrees.Conversions.IConversion MutableTreeConversion = |
| 1 | 91 | | new MutableTrees.Conversions.FullyIterativeConversion(); |
| | 92 | |
|
| | 93 | | /// <inheritdoc path="//*[not self::summary or self::remarks]"/> |
| | 94 | | /// <summary> |
| | 95 | | /// <inheritdoc/> |
| | 96 | | /// </summary> |
| | 97 | | /// <remarks> |
| | 98 | | /// <inheritdoc cref="NaivePartiallyRecursiveSuffixTreeBuilder"/> |
| | 99 | | /// </remarks> |
| | 100 | | public SuffixTreeNode BuildTree(params TextWithTerminator[] texts) |
| 32 | 101 | | { |
| 32 | 102 | | var (fullText, terminators) = texts.GenerateFullText(); |
| | 103 | |
|
| 32 | 104 | | var root = Node.BuildRoot(); |
| | 105 | |
|
| 556 | 106 | | for (var suffixIndex = 0; suffixIndex < fullText.Length; suffixIndex++) |
| 246 | 107 | | IncludeSuffixIntoTree(fullText, terminators, suffixIndex, suffixIndex, root); |
| | 108 | |
|
| 32 | 109 | | return MutableTreeConversion.ConvertToSuffixTree(root, fullText, terminators); |
| 32 | 110 | | } |
| | 111 | |
|
| | 112 | | private static void IncludeSuffixIntoTree( |
| | 113 | | TextWithTerminator text, ISet<char> terminators, int suffixCurrentIndex, int suffixIndex, Node node) |
| 311 | 114 | | { |
| 311 | 115 | | var currentChar = text[suffixCurrentIndex]; |
| 1066 | 116 | | var edgeSame1stChar = node.Children.Keys.SingleOrDefault(edge => text[edge.Start] == currentChar); |
| | 117 | |
|
| 311 | 118 | | if (edgeSame1stChar == null) |
| 156 | 119 | | { |
| 156 | 120 | | var edgeToNewLeaf = Edge.Build(suffixCurrentIndex, text.Length - suffixCurrentIndex); |
| 156 | 121 | | var newLeaf = Node.Build(node, edgeToNewLeaf, suffixIndex); |
| | 122 | |
|
| 156 | 123 | | node.Children[edgeToNewLeaf] = newLeaf; |
| 156 | 124 | | } |
| | 125 | | else |
| 155 | 126 | | { |
| 155 | 127 | | var prefixLength = LongestCommonPrefix( |
| 155 | 128 | | text[suffixCurrentIndex..], |
| 155 | 129 | | text[edgeSame1stChar.Start..(edgeSame1stChar.Start + edgeSame1stChar.Length)]); |
| | 130 | |
|
| 155 | 131 | | var oldChild = node.Children[edgeSame1stChar]; |
| 155 | 132 | | if (prefixLength < edgeSame1stChar.Length) |
| 90 | 133 | | { |
| 90 | 134 | | var edgeToNewIntermediate = Edge.Build(edgeSame1stChar.Start, prefixLength); |
| 90 | 135 | | var newIntermediate = Node.Build(node, edgeToNewIntermediate, null); |
| | 136 | |
|
| 90 | 137 | | var edgeToOldChild = Edge.Build( |
| 90 | 138 | | edgeSame1stChar.Start + prefixLength, edgeSame1stChar.Length - prefixLength); |
| 90 | 139 | | var edgeToNewLeaf = Edge.Build( |
| 90 | 140 | | suffixCurrentIndex + prefixLength, text.Length - suffixCurrentIndex - prefixLength); |
| 90 | 141 | | var newLeaf = Node.Build( |
| 90 | 142 | | newIntermediate, edgeToNewLeaf, suffixIndex); |
| | 143 | |
|
| 90 | 144 | | newIntermediate.Children[edgeToOldChild] = oldChild; |
| 90 | 145 | | newIntermediate.Children[edgeToNewLeaf] = newLeaf; |
| | 146 | |
|
| 90 | 147 | | node.Children.Remove(edgeSame1stChar); |
| 90 | 148 | | node.Children[edgeToNewIntermediate] = newIntermediate; |
| 90 | 149 | | } |
| | 150 | | else |
| 65 | 151 | | { |
| 65 | 152 | | IncludeSuffixIntoTree(text, terminators, suffixCurrentIndex + prefixLength, suffixIndex, oldChild); |
| 65 | 153 | | } |
| 155 | 154 | | } |
| 311 | 155 | | } |
| | 156 | | } |