| | 1 | | using MoreStructures.SuffixStructures.Builders; |
| | 2 | | using MoreStructures.SuffixTrees.Builders.Ukkonen; |
| | 3 | |
|
| | 4 | | namespace MoreStructures.SuffixTrees.Builders; |
| | 5 | |
|
| | 6 | | /// <summary> |
| | 7 | | /// Builds objects, such as edges and nodes, for <see cref="SuffixTreeNode"/> structures, using the Ukkonen algorithm. |
| | 8 | | /// </summary> |
| | 9 | | /// <remarks> |
| | 10 | | /// <para id="intro"> |
| | 11 | | /// Iterative implementation of the Ukkonen algorithm: see |
| | 12 | | /// <see href = "https://en.wikipedia.org/wiki/Ukkonen%27s_algorithm" /> for an introduction and |
| | 13 | | /// <see href="https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf"/> for the original paper. |
| | 14 | | /// </para> |
| | 15 | | /// <para id="algo-optimizations"> |
| | 16 | | /// ALGORITHM OPTIMIZATIONS |
| | 17 | | /// <br/> |
| | 18 | | /// The algorithm applies some optimizations to achieve linear complexity: |
| | 19 | | /// <br/> |
| | 20 | | /// - <b>Edge Labels compression</b>: strings on edges are stored as (start, end) indexes. This makes the space |
| | 21 | | /// used by a single edge constant, i.e. O(1), because it always consists of two integers, regardless of |
| | 22 | | /// how many chars of the text it represents. |
| | 23 | | /// <br/> |
| | 24 | | /// - <b>Global End</b>: all leafs have a dynamic end index, autoincremented every new phase. This makes the time |
| | 25 | | /// used to apply Rule 1 Extension constant, i.e. O(1), because incrementing the global end also increments |
| | 26 | | /// all leaf implicitely. |
| | 27 | | /// <br/> |
| | 28 | | /// - <b>Suffix Links</b>: all internal nodes point to another internal node sharing the suffix. This makes more |
| | 29 | | /// efficient traversal, because storing and jumping the suffix link when needed means traversal doesn't |
| | 30 | | /// have to be done again. |
| | 31 | | /// <br/> |
| | 32 | | /// - <b>Terminators Index Map</b>: a function is used, mapping each index i of the full input text (consolidated |
| | 33 | | /// <see cref="TextWithTerminator"/> including all <see cref="TextWithTerminator"/> instances passed in the |
| | 34 | | /// input) to the index of the next terminator appearing in the full text not before i. Such function is used |
| | 35 | | /// to perform <b>edge trimming</b> of the mutable tree structure built by the Ukkonen algorithm in linear time. |
| | 36 | | /// </para> |
| | 37 | | /// <para id="algo"> |
| | 38 | | /// ALGORITHM |
| | 39 | | /// <br/> |
| | 40 | | /// The algorithm is structured in phases, as many as the number of chars in the text. |
| | 41 | | /// <br/> |
| | 42 | | /// At the beginning of a new phase, both the number of remaining suffixes to take care of, and the global end |
| | 43 | | /// pointer are both increased by 1. |
| | 44 | | /// <br/> |
| | 45 | | /// Each phase is composed of at least 1 iteration, each one taking care of remaining suffixes. |
| | 46 | | /// At the beginning |
| | 47 | | /// </para> |
| | 48 | | /// <para id="usecases"> |
| | 49 | | /// USECASES |
| | 50 | | /// <br/> |
| | 51 | | /// Not limited by call stack depth. Convenient with long input text (i.e. string having a length < ~1K chars). |
| | 52 | | /// </para> |
| | 53 | | /// <para id="complexity"> |
| | 54 | | /// COMPLEXITY |
| | 55 | | /// <br/> |
| | 56 | | /// - The Time Complexity of the Ukkonen algorithm, building the mutable tree from the input texts, is O(t). |
| | 57 | | /// Space Complexity is O(2 * t) where t = length of the text to match. |
| | 58 | | /// <br/> |
| | 59 | | /// - While there are as many phases as number of chars in text (t), and there can be multiple iterations per |
| | 60 | | /// phase (as many as the number of remaining suffixes to process), the complexity is still linear, ~ 2t. |
| | 61 | | /// <br/> |
| | 62 | | /// - Each node of the resulting mutable tree (there are O(n) of them, and at most 2 * n), is processed by the |
| | 63 | | /// mutable tree conversion method at most twice: once for leaves and twice for intermediate nodes, which are |
| | 64 | | /// re-pushed onto the stack before its children, so that they can be processed after all their children have |
| | 65 | | /// already been converted. |
| | 66 | | /// <br/> |
| | 67 | | /// - Each stack frame processing by the mutable tree conversion method performs constant-time operations only: |
| | 68 | | /// <see cref="SuffixTreeNode"/> and <see cref="SuffixTreeEdge"/> instances creation, children dictionary |
| | 69 | | /// instantiation and edge trimming (trimming the label of edges in generalized Suffix Trees, to the first |
| | 70 | | /// terminator appearing in the label, when multiple are present - i.e. when the label spans multiple |
| | 71 | | /// concatenated texts). |
| | 72 | | /// <br/> |
| | 73 | | /// - Edge trimming is a constant-time operation once the cost of calculating the Terminators Index Map is paid, |
| | 74 | | /// which is a linear cost in time and space (linear in t). |
| | 75 | | /// <br/> |
| | 76 | | /// - Therefore, mutable tree conversion, and the overall algorithm, is linear in time and space over t. |
| | 77 | | /// </para> |
| | 78 | | /// </remarks> |
| | 79 | | public class UkkonenSuffixTreeBuilder |
| | 80 | | : IBuilder<SuffixTreeEdge, SuffixTreeNode> |
| | 81 | | { |
| | 82 | | /// <inheritdoc path="//*[not(self::summary or self::remarks)]"/> |
| | 83 | | /// <summary> |
| | 84 | | /// <inheritdoc/> |
| | 85 | | /// </summary> |
| | 86 | | /// <remarks> |
| | 87 | | /// <inheritdoc cref="UkkonenSuffixTreeBuilder" path="/remarks"/> |
| | 88 | | /// </remarks> |
| | 89 | | public SuffixTreeNode BuildTree(params TextWithTerminator[] texts) |
| 48 | 90 | | { |
| 48 | 91 | | var (fullText, terminators) = texts.GenerateFullText(); |
| | 92 | |
|
| 48 | 93 | | var state = new IterationState(fullText); |
| | 94 | |
|
| 390 | 95 | | while (state.IsThereANextPhase()) |
| 342 | 96 | | { |
| 342 | 97 | | state.StartPhaseIncreasingRemainingAndGlobalEnd(); |
| | 98 | |
|
| 342 | 99 | | ProcessCurrentPhase(state); |
| | 100 | |
|
| 342 | 101 | | Trace.WriteLine("End of phase\n"); |
| 342 | 102 | | } |
| | 103 | |
|
| 48 | 104 | | return BuildResult(state.Root, fullText, terminators); |
| 48 | 105 | | } |
| | 106 | |
|
| | 107 | | private static void ProcessCurrentPhase(IterationState state) |
| 342 | 108 | | { |
| 684 | 109 | | while (state.StillRemainingSuffixesInCurrentPhase()) |
| 496 | 110 | | { |
| | 111 | | bool endCurrentPhase; |
| 496 | 112 | | if (state.NoActivePointAndEdgeStartingFromActiveNodeWithCurrentChar() is MutableEdge edge) |
| 111 | 113 | | { |
| 111 | 114 | | state.InitializeActiveEdgeAndLength(edge); |
| 111 | 115 | | endCurrentPhase = true; |
| 111 | 116 | | } |
| 385 | 117 | | else if (state.ActivePointFollowedByCurrentChar()) |
| 43 | 118 | | { |
| 43 | 119 | | state.IncrementActiveLength(); |
| 43 | 120 | | endCurrentPhase = true; |
| 43 | 121 | | } |
| | 122 | | else // No active point nor edge, or active point not followed by current char => Rule 2 |
| 342 | 123 | | { |
| 342 | 124 | | state.CreateLeafAndPossiblyIntermediateAndDecrementRemainingSuffixes(); |
| 342 | 125 | | endCurrentPhase = false; |
| 342 | 126 | | } |
| | 127 | |
|
| 496 | 128 | | Trace.WriteLine($"State at end of iteration: {state}"); |
| 496 | 129 | | Trace.WriteLine(""); |
| | 130 | |
|
| 496 | 131 | | if (endCurrentPhase) |
| 154 | 132 | | return; |
| 342 | 133 | | } |
| 342 | 134 | | } |
| | 135 | |
|
| | 136 | | private record struct BuildResultStackFrame( |
| 1356 | 137 | | SuffixTreeEdge SuffixTreeEdge, |
| 1356 | 138 | | MutableNode MutableNode, |
| 1356 | 139 | | IDictionary<SuffixTreeEdge, SuffixTreeNode> ParentChildren, |
| 1356 | 140 | | IDictionary<SuffixTreeEdge, SuffixTreeNode>? NodeChildrenMaybe); |
| | 141 | |
|
| | 142 | | private static SuffixTreeNode BuildResult( |
| | 143 | | MutableNode root, TextWithTerminator fullText, ISet<char> terminators) |
| 48 | 144 | | { |
| 48 | 145 | | var stack = new Stack<BuildResultStackFrame>(); |
| 48 | 146 | | var rootParentChildren = new Dictionary<SuffixTreeEdge, SuffixTreeNode>() { }; |
| 48 | 147 | | var suffixTreeEdge = new SuffixTreeEdge(0, 1); |
| 48 | 148 | | var terminatorsIndexMap = TextWithTerminatorExtensions |
| 48 | 149 | | .BuildTerminatorsIndexMap(fullText, terminators) |
| 48 | 150 | | .ToList(); |
| | 151 | |
|
| 48 | 152 | | stack.Push(new(suffixTreeEdge, root, rootParentChildren, null)); |
| 726 | 153 | | while (stack.Count > 0) |
| 678 | 154 | | ProcessStack(stack, terminatorsIndexMap); |
| 48 | 155 | | return rootParentChildren[suffixTreeEdge]; |
| 48 | 156 | | } |
| | 157 | |
|
| | 158 | | private static void ProcessStack( |
| | 159 | | Stack<BuildResultStackFrame> stack, IList<int> terminatorsIndexMap) |
| 678 | 160 | | { |
| 678 | 161 | | var (suffixTreeEdge, mutableNode, parentChildren, nodeChildrenMaybe) = stack.Pop(); |
| | 162 | |
|
| 678 | 163 | | if (mutableNode.Children.Count == 0 || nodeChildrenMaybe != null) |
| 510 | 164 | | { |
| 510 | 165 | | parentChildren[suffixTreeEdge] = |
| 510 | 166 | | nodeChildrenMaybe is IDictionary<SuffixTreeEdge, SuffixTreeNode> nodeChildren |
| 510 | 167 | | ? new SuffixTreeNode.Intermediate(nodeChildren) |
| 510 | 168 | | : new SuffixTreeNode.Leaf(mutableNode.LeafStart!.Value); |
| 510 | 169 | | } |
| | 170 | | else |
| 168 | 171 | | { |
| 168 | 172 | | var nodeChildren = new Dictionary<SuffixTreeEdge, SuffixTreeNode>() { }; |
| 168 | 173 | | stack.Push(new(suffixTreeEdge, mutableNode, parentChildren, nodeChildren)); |
| 1428 | 174 | | foreach (var (childMutableEdge, childMutableNode) in mutableNode.Children) |
| 462 | 175 | | { |
| | 176 | | // If more than a terminator is included in the label of the edge, in principle we should reach |
| | 177 | | // the leaf and replace childMutableNode with it. |
| | 178 | | // However, such leaf must be unique, since each terminator appears a single time in fullText). |
| | 179 | | // Therefore, if the Suffix Tree is correctly built, childMutableNode has to be a leaf already |
| | 180 | | // (because, unlike in Suffix Tries, there can't be a non coalesced branch in a Suffix Tree). |
| 462 | 181 | | var edgeStart = childMutableEdge.Start; |
| 462 | 182 | | var childSuffixTreeEdge = new SuffixTreeEdge( |
| 462 | 183 | | edgeStart, |
| 462 | 184 | | Math.Min(childMutableEdge.End.Value, terminatorsIndexMap[edgeStart]) - edgeStart + 1); |
| | 185 | |
|
| 462 | 186 | | stack.Push(new(childSuffixTreeEdge, childMutableNode, nodeChildren, null)); |
| 462 | 187 | | } |
| 168 | 188 | | } |
| 678 | 189 | | } |
| | 190 | | } |