| | 1 | | using MoreStructures.MutableTrees; |
| | 2 | | using MoreStructures.SuffixArrays; |
| | 3 | | using MoreStructures.SuffixArrays.LongestCommonPrefix; |
| | 4 | | using MoreStructures.SuffixStructures.Builders; |
| | 5 | |
|
| | 6 | | namespace MoreStructures.SuffixTrees.Builders; |
| | 7 | |
|
| | 8 | | /// <summary> |
| | 9 | | /// A <see cref="IBuilder{TEdge, TNode}"/> implementation of <see cref="SuffixTreeNode"/> structures, using the |
| | 10 | | /// <see cref="SuffixArray"/> and the <see cref="LcpArray"/> of the full text, to build the tree efficiently. |
| | 11 | | /// </summary> |
| | 12 | | /// <remarks> |
| | 13 | | /// <para id="advantages"> |
| | 14 | | /// ADVANTAGES AND DISADVANTAGES |
| | 15 | | /// <br/> |
| | 16 | | /// - Like <see cref="NaivePartiallyRecursiveSuffixTreeBuilder"/> and <see cref="UkkonenSuffixTreeBuilder"/>, |
| | 17 | | /// it builds the tree one suffix of the text at a time, adding at least a leaf and potentially an intermediate |
| | 18 | | /// node at every iteration. |
| | 19 | | /// <br/> |
| | 20 | | /// - Unlike them, the suffixes are not added to the tree in the order in which they appear in the text, i.e. they |
| | 21 | | /// are not processed from the longest to the shortest, rather from the smallest to the biggest in lexicographic |
| | 22 | | /// ascending order. |
| | 23 | | /// <br/> |
| | 24 | | /// - It also requires the construction of the <see cref="SuffixArray"/> and the <see cref="LcpArray"/> of the |
| | 25 | | /// input, whereas the naive and the Ukkonen implementations don't require any auxiliary structure. |
| | 26 | | /// <br/> |
| | 27 | | /// - Compared to the naive implementation, it has better runtime, even including the cost of building auxiliary |
| | 28 | | /// structures: all-round performance is less than the quadratic performance of the naive approach, but worse |
| | 29 | | /// than the linear performance of the Ukkonen algorithm. |
| | 30 | | /// </para> |
| | 31 | | /// <para id="algorithm"> |
| | 32 | | /// ALGORITHM |
| | 33 | | /// <br/> |
| | 34 | | /// - First, build full text, from the provided <see cref="TextWithTerminator"/> instances, and the corresponding |
| | 35 | | /// <see cref="SuffixArray"/> SA and <see cref="LcpArray"/> LCPA, which are both fully enumerated in order to be |
| | 36 | | /// able to direct access them by index. |
| | 37 | | /// <br/> |
| | 38 | | /// - Then, build the initial mutable tree: just the root node with a single leaf linked by an edge labelled with |
| | 39 | | /// the 1st suffix in the <see cref="SuffixArray"/>: <c>SA[0]</c>. |
| | 40 | | /// <br/> |
| | 41 | | /// - This represents the initial state of the mutable tree, which will be modified iteration by iteration going |
| | 42 | | /// through the Suffix Array from the 2nd to the last suffix (in lexicographic ascending order). |
| | 43 | | /// <br/> |
| | 44 | | /// - The item of the LCP Array in position i - 1, gives the LCP of such suffix with the previous suffix: |
| | 45 | | /// <c>LCPA[i - 1] = LCP(SA[i - 1], SA[i])</c>. Such LCP value is compared with the LCP value calculated at the |
| | 46 | | /// previous iteration. |
| | 47 | | /// <br/> |
| | 48 | | /// - Different insertion strategies are applied, depending on the result of the comparison. At each iteration the |
| | 49 | | /// reference to the last inserted leaf and to the previous LCP value are kept. |
| | 50 | | /// <br/> |
| | 51 | | /// - <b>Case 1</b>: if the current LCP is smaller than the previous, the root-to-leaf path of the leaf about to be |
| | 52 | | /// added shares a shorter path with the previous leaf. So move up in the tree from the last inserted leaf, up |
| | 53 | | /// until the first node which makes current LCP value non smaller than the previous LCP value. Then check |
| | 54 | | /// whether it's fallbacking to Case 2 or 3. |
| | 55 | | /// <br/> |
| | 56 | | /// - <b>Case 2</b>: if the current LCP is the same as the previous, the root-to-leaf path of the leaf about to be |
| | 57 | | /// added shares exactly the same path with the previous leaf. Therefore the new leaf can be attached directly |
| | 58 | | /// to the parent of the previous leaf, which becomes a sibling of the new leaf. No new intermediate node. |
| | 59 | | /// <br/> |
| | 60 | | /// - <b>Case 3</b>: if the current LCP is bigger than the previous, a part of the previous edge is in common |
| | 61 | | /// between the previous leaf and the leaf about to be created in this iteration. Therefore, build new |
| | 62 | | /// intermediate, detach old edge going to previous leaf, attach new intermediate edge. |
| | 63 | | /// <br/> |
| | 64 | | /// - Finally, after all iterations have been executed and all suffixes of the text have been included in the |
| | 65 | | /// mutable tree, build the final <see cref="SuffixTreeNode"/> structure from the mutable tree, using a |
| | 66 | | /// <see cref="MutableTrees.Conversions.FullyIterativeConversion"/>. |
| | 67 | | /// </para> |
| | 68 | | /// <para id="complexity"> |
| | 69 | | /// COMPLEXITY |
| | 70 | | /// <br/> |
| | 71 | | /// - The complexity of building the <see cref="SuffixArray"/> and the <see cref="LcpArray"/>, via the provided |
| | 72 | | /// <see cref="SuffixAndLcpArraysBuilder"/> is not included in this cost analysis. This has to be taken into |
| | 73 | | /// account when comparing this implementation of <see cref="IBuilder{TEdge, TNode}"/> of |
| | 74 | | /// <see cref="SuffixTreeNode"/> structures with other implementations, such as |
| | 75 | | /// <see cref="NaivePartiallyRecursiveSuffixTreeBuilder"/> or <see cref="UkkonenSuffixTreeBuilder"/>. |
| | 76 | | /// <br/> |
| | 77 | | /// - Building the initial mutable tree requires building two nodes and an edge, which can both be done in constant |
| | 78 | | /// time. |
| | 79 | | /// <br/> |
| | 80 | | /// - Then, n - 1 iterations are performed, where n is the length of the input, executing optionally Case 1 and |
| | 81 | | /// then either Case 2 or 3. |
| | 82 | | /// <br/> |
| | 83 | | /// - Cases 2 and 3 perform the insertion of a leaf and potentially of an intermediate node, both O(1) operations. |
| | 84 | | /// Case 1 may seem an O(n) operation. However, due to LCP Array property, moving up in the tree will happen at |
| | 85 | | /// most once per iteration. |
| | 86 | | /// <br/> |
| | 87 | | /// - Final step is building the final tree from the mutable tree, which requires the traversal of the O(n) nodes |
| | 88 | | /// of the tree. <see cref="MutableTrees.Conversions.FullyIterativeConversion"/> does that in O(n) time and |
| | 89 | | /// space, via a stack and a precomputed data structure to find the number of terminators per edge in O(1) time. |
| | 90 | | /// <br/> |
| | 91 | | /// - In conclusion, both Time and Space Complexity are O(n). |
| | 92 | | /// </para> |
| | 93 | | /// </remarks> |
| | 94 | | public class SuffixAndLcpArraysBasedSuffixTreeBuilder : IBuilder<SuffixTreeEdge, SuffixTreeNode> |
| | 95 | | { |
| 1 | 96 | | private static readonly MutableTrees.Conversions.IConversion MutableTreeConversion = |
| 1 | 97 | | new MutableTrees.Conversions.FullyIterativeConversion(); |
| | 98 | |
|
| | 99 | | /// <summary> |
| | 100 | | /// The builder to be used to construct the <see cref="SuffixArray"/> and the <see cref="LcpArray"/> of the full |
| | 101 | | /// text, both required by this algorithm to build the <see cref="SuffixTreeNode"/> structure. |
| | 102 | | /// </summary> |
| 27 | 103 | | public Func<TextWithTerminator, (SuffixArray, LcpArray)> SuffixAndLcpArraysBuilder { get; } |
| | 104 | |
|
| | 105 | | /// <summary> |
| | 106 | | /// <inheritdoc cref="SuffixAndLcpArraysBasedSuffixTreeBuilder"/> |
| | 107 | | /// </summary> |
| | 108 | | /// <remarks> |
| | 109 | | /// <inheritdoc cref="SuffixAndLcpArraysBasedSuffixTreeBuilder"/> |
| | 110 | | /// </remarks> |
| | 111 | | /// <param name="suffixAndLcpArraysBuilder"> |
| | 112 | | /// <inheritdoc cref="SuffixAndLcpArraysBuilder" path="/summary"/> |
| | 113 | | /// </param> |
| 28 | 114 | | public SuffixAndLcpArraysBasedSuffixTreeBuilder( |
| 28 | 115 | | Func<TextWithTerminator, (SuffixArray, LcpArray)> suffixAndLcpArraysBuilder) |
| 28 | 116 | | { |
| 28 | 117 | | SuffixAndLcpArraysBuilder = suffixAndLcpArraysBuilder; |
| 28 | 118 | | } |
| | 119 | |
|
| | 120 | | /// <inheritdoc path="//*[not self::summary or self::remarks]"/> |
| | 121 | | /// <summary> |
| | 122 | | /// <inheritdoc/> |
| | 123 | | /// </summary> |
| | 124 | | /// <remarks> |
| | 125 | | /// <inheritdoc cref="SuffixAndLcpArraysBasedSuffixTreeBuilder"/> |
| | 126 | | /// </remarks> |
| | 127 | | public SuffixTreeNode BuildTree(params TextWithTerminator[] texts) |
| 27 | 128 | | { |
| 27 | 129 | | var (fullText, terminators) = texts.GenerateFullText(); |
| 27 | 130 | | var n = fullText.Length; |
| 27 | 131 | | var (suffixArray, lcpArray) = SuffixAndLcpArraysBuilder(fullText); |
| 27 | 132 | | var (suffixArrayValues, lcpArrayValues) = (suffixArray.Indexes.ToList(), lcpArray.Lengths.ToList()); |
| | 133 | |
|
| 27 | 134 | | var firstSuffix = suffixArrayValues[0]; |
| 27 | 135 | | MutableTree.Node root = MutableTree.Node.BuildRoot(); |
| 27 | 136 | | MutableTree.Node previousLeaf = MutableTree.Node.Build( |
| 27 | 137 | | root, MutableTree.Edge.Build(firstSuffix, n - firstSuffix), firstSuffix); |
| 27 | 138 | | root.Children[previousLeaf.IncomingEdge] = previousLeaf; |
| | 139 | |
|
| 27 | 140 | | var previousLcp = 0; |
| 338 | 141 | | for (var i = 1; i < suffixArrayValues.Count; i++) |
| 142 | 142 | | { |
| 142 | 143 | | var currentSuffix = suffixArrayValues[i]; |
| 142 | 144 | | var currentSuffixLength = n - currentSuffix; |
| 142 | 145 | | var currentLcp = lcpArrayValues[i - 1]; |
| | 146 | |
|
| | 147 | | MutableTree.Node currentLeaf; |
| | 148 | |
|
| | 149 | | // Case 1: move up in the tree |
| 185 | 150 | | while (currentLcp < previousLcp) |
| 43 | 151 | | { |
| 43 | 152 | | previousLcp -= previousLeaf.ParentNode.IncomingEdge.Length; |
| 43 | 153 | | previousLeaf = previousLeaf.ParentNode; |
| 43 | 154 | | } |
| | 155 | |
|
| 142 | 156 | | if (currentLcp == previousLcp) |
| 86 | 157 | | { |
| | 158 | | // Case 2: new leaf sibling of previous leaf, no intermediate node |
| | 159 | |
|
| | 160 | | // Build current edge and leaf, for the reminder of the currentLcp. |
| 86 | 161 | | var currentEdge = MutableTree.Edge.Build(currentSuffix + currentLcp, currentSuffixLength - currentLcp); |
| 86 | 162 | | currentLeaf = MutableTree.Node.Build(previousLeaf.ParentNode, currentEdge, currentSuffix); |
| | 163 | |
|
| | 164 | | // Attach to the parent of the previous leaf. Nothing happens to the previous leaf itself. |
| 86 | 165 | | previousLeaf.ParentNode.Children[currentEdge] = currentLeaf; |
| 86 | 166 | | } |
| | 167 | | else |
| 56 | 168 | | { |
| | 169 | | // Case 3: intermediate node required |
| | 170 | |
|
| 56 | 171 | | var matchingChars = currentLcp - previousLcp; |
| | 172 | |
|
| | 173 | | // However, the edge to the new leaf can never includes the entire previous edge, because each suffix |
| | 174 | | // in this context is terminated by a unique terminator, which makes each suffix never a prefix of |
| | 175 | | // another suffix (which was exactly the point of introducing terminators in the first place. |
| | 176 | | // Therefore, remaining chars has to be bigger than 0. |
| | 177 | |
|
| | 178 | | // Build new intermediate, detach old edge going to previous leaf, attach new intermediate edge |
| 56 | 179 | | var intermediateEdge = MutableTree.Edge.Build(previousLeaf.IncomingEdge.Start, matchingChars); |
| 56 | 180 | | var intermediateNode = MutableTree.Node.Build(previousLeaf.ParentNode, intermediateEdge, null); |
| 56 | 181 | | previousLeaf.ParentNode.Children.Remove(previousLeaf.IncomingEdge); |
| 56 | 182 | | previousLeaf.ParentNode.Children[intermediateEdge] = intermediateNode; |
| | 183 | |
|
| | 184 | | // Build current edge and leaf |
| 56 | 185 | | var currentEdge = MutableTree.Edge.Build( |
| 56 | 186 | | currentSuffix + currentLcp, |
| 56 | 187 | | currentSuffixLength - currentLcp); |
| 56 | 188 | | currentLeaf = MutableTree.Node.Build(intermediateNode, currentEdge, currentSuffix); |
| | 189 | |
|
| | 190 | | // Build new edge for the reminder of the previous leaf |
| 56 | 191 | | var remainingEdge = MutableTree.Edge.Build( |
| 56 | 192 | | previousLeaf.IncomingEdge.Start + matchingChars, |
| 56 | 193 | | previousLeaf.IncomingEdge.Length - matchingChars); |
| | 194 | |
|
| | 195 | | // Attach previous leaf and current leaf to the new intermediate |
| 56 | 196 | | intermediateNode.Children[remainingEdge] = previousLeaf; |
| 56 | 197 | | intermediateNode.Children[currentEdge] = currentLeaf; |
| 56 | 198 | | } |
| | 199 | |
|
| 142 | 200 | | previousLcp = currentLcp; |
| 142 | 201 | | previousLeaf = currentLeaf; |
| 142 | 202 | | } |
| | 203 | |
|
| 27 | 204 | | return MutableTreeConversion.ConvertToSuffixTree(root, fullText, terminators); |
| 27 | 205 | | } |
| | 206 | | } |