| | 1 | | using MoreStructures.RecImmTrees; |
| | 2 | | using MoreStructures.RecImmTrees.Conversions; |
| | 3 | | using MoreStructures.SuffixStructures; |
| | 4 | | using MoreStructures.Utilities; |
| | 5 | |
|
| | 6 | | namespace MoreStructures.SuffixTrees; |
| | 7 | |
|
| | 8 | | /// <summary> |
| | 9 | | /// An immutable node of an immutable Suffix Tree, recursively pointing to its children nodes via |
| | 10 | | /// <see cref="SuffixTreeEdge"/> instances, associated with selector strings. |
| | 11 | | /// </summary> |
| | 12 | | /// <param name="Children">The collection of children for the node, indexed by string edges.</param> |
| | 13 | | /// <param name="Start"> |
| | 14 | | /// <inheritdoc cref="ISuffixStructureNode{TEdge, TNode}.Start" path="/summary"/> |
| | 15 | | /// </param> |
| | 16 | | /// <remarks> |
| | 17 | | /// <para id="advantages"> |
| | 18 | | /// ADVANTAGES AND DISADVANTAGES |
| | 19 | | /// <br/> |
| | 20 | | /// Suffix Trees are more space-efficient than Suffix Tries due to the reduced number of |
| | 21 | | /// <see cref="SuffixTreeEdge"/> and their <see cref="SuffixTreeNode"/>, compare to the corresponding |
| | 22 | | /// <see cref="SuffixTries.SuffixTrieEdge"/> and their <see cref="SuffixTries.SuffixTrieNode"/>, due to entire |
| | 23 | | /// paths of single chains of chars in Suffix Tries being coalesced into a single selector string, which is stored |
| | 24 | | /// on the edge with path label compression, i.e. using two fixed sized numbers (<see cref="SuffixTreeEdge.Start"/> |
| | 25 | | /// and <see cref="SuffixTreeEdge.Length"/>) instead of a variable-length string of characters. |
| | 26 | | /// <br/> |
| | 27 | | /// Furthermore, suffix trees, unlike suffix tries, can be constructed in linear time, for example via the |
| | 28 | | /// <see cref="Builders.UkkonenSuffixTreeBuilder"/>. |
| | 29 | | /// </para> |
| | 30 | | /// <para id="immutability"> |
| | 31 | | /// IMMUTABILITY |
| | 32 | | /// <br/> |
| | 33 | | /// Immutability is guaranteed by using <see cref="ValueReadOnlyCollection{T}"/>. |
| | 34 | | /// </para> |
| | 35 | | /// </remarks> |
| 21613 | 36 | | public abstract record SuffixTreeNode(IDictionary<SuffixTreeEdge, SuffixTreeNode> Children, int? Start) |
| 21613 | 37 | | : ISuffixStructureNode<SuffixTreeEdge, SuffixTreeNode> |
| 21613 | 38 | | { |
| 21613 | 39 | | /// <summary> |
| 21613 | 40 | | /// Builds an intermediate node, i.e. a node with children and their corresponding incoming edges. |
| 21613 | 41 | | /// </summary> |
| 21613 | 42 | | public record Intermediate(IDictionary<SuffixTreeEdge, SuffixTreeNode> Children) |
| 10558 | 43 | | : SuffixTreeNode(Children, null) |
| 21613 | 44 | | { |
| 21613 | 45 | | /// <inheritdoc/> |
| 21613 | 46 | | public override bool IsEquivalentTo(SuffixTreeNode other, params TextWithTerminator[] texts) => |
| 165 | 47 | | other is Intermediate(var otherChildren) && |
| 165 | 48 | | SuffixTreeNode.HaveEquivalentChildren(texts.GenerateFullText().fullText, Children, otherChildren); |
| 21613 | 49 | | } |
| 21613 | 50 | |
|
| 21613 | 51 | | /// <summary> |
| 21613 | 52 | | /// Builds a leaf, i.e. a node with no children and the start index of the suffix in the text. |
| 21613 | 53 | | /// </summary> |
| 11875 | 54 | | public record Leaf(int LeafStart) |
| 11050 | 55 | | : SuffixTreeNode(new Dictionary<SuffixTreeEdge, SuffixTreeNode> { }, LeafStart) |
| 21613 | 56 | | { |
| 21613 | 57 | | /// <inheritdoc/> |
| 21613 | 58 | | public override bool IsEquivalentTo(SuffixTreeNode other, params TextWithTerminator[] texts) => |
| 386 | 59 | | other is Leaf(var otherLeafStart) && |
| 386 | 60 | | LeafStart == otherLeafStart; |
| 21613 | 61 | | } |
| 21613 | 62 | |
|
| 21613 | 63 | | /// <inheritdoc/> |
| 43659 | 64 | | public IDictionary<SuffixTreeEdge, SuffixTreeNode> Children { get; } |
| 21616 | 65 | | = (Children.Any() == (Start == null)) && Start.GetValueOrDefault(0) >= 0 |
| 21616 | 66 | | ? Children.ToValueReadOnlyDictionary() |
| 21616 | 67 | | : throw new ArgumentException($"Leafs needs to specificy a non-negative {nameof(Start)}.", nameof(Children)); |
| 21613 | 68 | |
|
| 21613 | 69 | | /// <inheritdoc/> |
| 22539 | 70 | | public int? Start { get; } = Start; |
| 21613 | 71 | |
|
| 21613 | 72 | | /// <summary> |
| 21613 | 73 | | /// Indexes into the children of this node, by edge. |
| 21613 | 74 | | /// </summary> |
| 7 | 75 | | public SuffixTreeNode this[SuffixTreeEdge edge] => Children[edge]; |
| 21613 | 76 | |
|
| 1 | 77 | | private static readonly IStringifier<SuffixTreeEdge, SuffixTreeNode> Stringifier = |
| 1 | 78 | | new FullyIterativeStringifier<SuffixTreeEdge, SuffixTreeNode>( |
| 141 | 79 | | r => r.IsLeaf() ? $"R from {r.Start}" : "R", |
| 942 | 80 | | (e, n) => $"{e} -> {(n.IsLeaf() ? $"L from {n.Start}" : "I")}") |
| 1 | 81 | | { |
| 1 | 82 | | StopIndentingLevel = 10, |
| 1 | 83 | | }; |
| 21613 | 84 | |
|
| 21613 | 85 | | /// <inheritdoc path="//*[not(self::summary or self::remarks)]"/> |
| 21613 | 86 | | /// <summary> |
| 21613 | 87 | | /// <inheritdoc/> |
| 21613 | 88 | | /// <br/> |
| 21613 | 89 | | /// Uses a <see cref="IStringifier{TEdge, TNode}"/> to generate the string which show the node and its |
| 21613 | 90 | | /// underlying structure. |
| 21613 | 91 | | /// </summary> |
| 21613 | 92 | | /// <remarks> |
| 21613 | 93 | | /// Sealed to prevent compiler from superceding <see cref="ToString()"/> in derived record. |
| 21613 | 94 | | /// </remarks> |
| 141 | 95 | | public override sealed string ToString() => Stringifier.Stringify(this); |
| 21613 | 96 | |
|
| 21613 | 97 | | /// <summary> |
| 21613 | 98 | | /// Determines whether this <see cref="SuffixTreeNode"/> structure is equivalent to the provided |
| 21613 | 99 | | /// <paramref name="other"/> structure, w.r.t. the provided <paramref name="texts"/>. |
| 21613 | 100 | | /// </summary> |
| 21613 | 101 | | /// <param name="other">The other tree, to compare this tree against.</param> |
| 21613 | 102 | | /// <param name="texts">The text, based on which to evaluate the tree equivalence.</param> |
| 21613 | 103 | | /// <returns>True if the two trees are equivalent for the provided text, false otherwise.</returns> |
| 21613 | 104 | | /// <remarks> |
| 21613 | 105 | | /// <para id="definition"> |
| 21613 | 106 | | /// DEFINITION |
| 21613 | 107 | | /// <br/> |
| 21613 | 108 | | /// The definition of <see cref="SuffixTreeNode"/> structures equivalence depends on (a list of) |
| 21613 | 109 | | /// <see cref="TextWithTerminator"/>: two trees can be equivalent for some texts, and not equivalent |
| 21613 | 110 | | /// for others. This is because of the <b>Edge Compression</b> technique used by |
| 21613 | 111 | | /// <see cref="SuffixTreeEdge"/>, to store edges in O(1) space, instead of O(n). |
| 21613 | 112 | | /// <br/> |
| 21613 | 113 | | /// Moreover, equivalence is based on the equivalence of edges and of the specific type of node. |
| 21613 | 114 | | /// <br/> |
| 21613 | 115 | | /// - Two <see cref="SuffixTreeEdge"/> E1 and E2 are equivalent w.r.t. a <see cref="TextWithTerminator"/> T, |
| 21613 | 116 | | /// if the string identified by each edge is the same: <c>T[E1] == T[E2]</c>. |
| 21613 | 117 | | /// <br/> |
| 21613 | 118 | | /// - Two <see cref="Leaf"/> L1 and L2 are equivalent if their <see cref="Leaf.LeafStart"/> is the same: |
| 21613 | 119 | | /// <c>L1.LeafStart == L2.LeafStart</c>. |
| 21613 | 120 | | /// <br/> |
| 21613 | 121 | | /// - Two <see cref="Intermediate"/> I1 and I2 are equivalent if there is a 1-to-1 correspondence between |
| 21613 | 122 | | /// children edges and child nodes pointed by such edges are equivalent: |
| 21613 | 123 | | /// <c>for each edge e1 of I1.Children there exists exactly one equivalent edge e2 of I2.Children and |
| 21613 | 124 | | /// I1.Children[e1] is equivalent to I2.Children[e2] w.r.t. T</c>. |
| 21613 | 125 | | /// </para> |
| 21613 | 126 | | /// <para id="complexity"> |
| 21613 | 127 | | /// COMPLEXITY |
| 21613 | 128 | | /// <br/> |
| 21613 | 129 | | /// - Checking for edges equivalence is a O(n) operation, where n is the number of nodes in the tree, since it |
| 21613 | 130 | | /// requires decompressing the labels of each of the edges being compare and compare for equality the two |
| 21613 | 131 | | /// resulting O(n) strings. |
| 21613 | 132 | | /// <br/> |
| 21613 | 133 | | /// - Checking for leaves equivalence is a O(1) operation, since it only requires comparing |
| 21613 | 134 | | /// <see cref="Leaf.LeafStart"/>, which has constant size. |
| 21613 | 135 | | /// <br/> |
| 21613 | 136 | | /// - Checking for intermediate nodes equivalence is a O(n) operation, amortized cost over the total number of |
| 21613 | 137 | | /// edges in the tree, since there are O(n) edges in total in the tree, an edge is visited multiple times |
| 21613 | 138 | | /// but always in the context of a single node, and the node equivalence check requires looking for the |
| 21613 | 139 | | /// equivalent edge. |
| 21613 | 140 | | /// <br/> |
| 21613 | 141 | | /// - In conclusion Time Complexity is O(n^2) and Space Complexity is O(n). |
| 21613 | 142 | | /// </para> |
| 21613 | 143 | | /// </remarks> |
| 21613 | 144 | | public abstract bool IsEquivalentTo(SuffixTreeNode other, params TextWithTerminator[] texts); |
| 21613 | 145 | |
|
| 21613 | 146 | | private static bool HaveEquivalentChildren( |
| 21613 | 147 | | TextWithTerminator fullText, |
| 21613 | 148 | | IDictionary<SuffixTreeEdge, SuffixTreeNode> thisChildren, |
| 21613 | 149 | | IDictionary<SuffixTreeEdge, SuffixTreeNode> otherChildren) |
| 165 | 150 | | { |
| 1434 | 151 | | foreach (var (childEdge, childNode) in thisChildren) |
| 472 | 152 | | { |
| 472 | 153 | | var correspondingEdgeAndNode = otherChildren |
| 1686 | 154 | | .Where(edgeAndNode => fullText[childEdge] == fullText[edgeAndNode.Key]) |
| 472 | 155 | | .ToList(); |
| 472 | 156 | | if (correspondingEdgeAndNode.Count != 1) |
| 3 | 157 | | return false; |
| 469 | 158 | | if (!childNode.IsEquivalentTo(correspondingEdgeAndNode[0].Value, fullText)) |
| 2 | 159 | | return false; |
| 467 | 160 | | } |
| 21613 | 161 | |
|
| 160 | 162 | | return true; |
| 165 | 163 | | } |
| 21613 | 164 | | } |