| | 1 | | namespace MoreStructures; |
| | 2 | |
|
| | 3 | | /// <summary> |
| | 4 | | /// Extension methods for <see cref="TextWithTerminator"/>. |
| | 5 | | /// </summary> |
| | 6 | | public static class TextWithTerminatorExtensions |
| | 7 | | { |
| | 8 | | /// <summary> |
| | 9 | | /// Builds a virtual rotation of the provided <see cref="TextWithTerminator"/>, by a number of chars defined by |
| | 10 | | /// <paramref name="rotation"/>, in constant time. |
| | 11 | | /// </summary> |
| | 12 | | /// <param name="text">The text which has to be rotated.</param> |
| | 13 | | /// <param name="rotation">The number of chars to virtually rotate <paramref name="text"/>.</param> |
| | 14 | | /// <returns> |
| | 15 | | /// An object constructed in constant time and behaving like a rotation of the provided text. |
| | 16 | | /// </returns> |
| | 17 | | /// <remarks> |
| | 18 | | /// <para id="complexity"> |
| | 19 | | /// COMPLEXITY |
| | 20 | | /// <br/> |
| | 21 | | /// - The rotation is "virtual" because no new string of length n is computed (which would make the constructor |
| | 22 | | /// take linear time in the number of chars of <paramref name="text"/>). |
| | 23 | | /// <br/> |
| | 24 | | /// - Instead, a new object storing the rotation and keeping the reference to <paramref name="text"/> is |
| | 25 | | /// created in O(1) time and space. |
| | 26 | | /// <br/> |
| | 27 | | /// - Such an object is able to appear as if the underlying string was recomputed, taking into account the |
| | 28 | | /// rotation in all its exposed functionalities. |
| | 29 | | /// </para> |
| | 30 | | /// </remarks> |
| | 31 | | public static VirtuallyRotatedTextWithTerminator ToVirtuallyRotated(this TextWithTerminator text, int rotation) => |
| | 32 | | // TODO: avoid building RotatedTextWithTerminator and have VirtuallyRotatedTextWithTerminator access directly |
| | 33 | | // into the underlying TextWithTerminator |
| 10403 | 34 | | new(new(text, text.Terminator, false), rotation); |
| | 35 | |
|
| | 36 | | /// <summary> |
| | 37 | | /// Builds a single <see cref="TextWithTerminator"/>, concatenating the <see cref="TextWithTerminator"/> instances |
| | 38 | | /// in <paramref name="texts"/>. Returns as well the <see cref="ISet{T}"/> of all the terminators and their |
| | 39 | | /// indexes. |
| | 40 | | /// </summary> |
| | 41 | | /// <param name="texts">The text instances to join into a single text.</param> |
| | 42 | | /// <returns> |
| | 43 | | /// A couple. |
| | 44 | | /// <br/> |
| | 45 | | /// The first item of the couple is a text with the <see cref="TextWithTerminator.Text"/> of all items of |
| | 46 | | /// <paramref name="texts"/> concatenated, each followed by its own <see cref="TextWithTerminator.Terminator"/> |
| | 47 | | /// except the last, and the <see cref="TextWithTerminator.Terminator"/> of the last item. |
| | 48 | | /// <br/> |
| | 49 | | /// The second item of the couple is a set of all the terminators included in the resulting text. |
| | 50 | | /// </returns> |
| | 51 | | /// <remarks> |
| | 52 | | /// <para id="complexity"> |
| | 53 | | /// COMPLEXITY |
| | 54 | | /// <br/> |
| | 55 | | /// Requires iterating over the <see cref="TextWithTerminator"/> items of <paramref name="texts"/>, but not on |
| | 56 | | /// their content. |
| | 57 | | /// <br/> |
| | 58 | | /// Therefore, Time Complexity is O(n), where n is the number of items of <paramref name="texts"/>, and not |
| | 59 | | /// O(t), where t is the length of the concatenated text. |
| | 60 | | /// <br/> |
| | 61 | | /// Space Complexity is also O(n), since the <see cref="ISet{T}"/> of terminators contains n items and the |
| | 62 | | /// generated full text receives a lazy evaluated <see cref="IEnumerable{T}"/> of the <see cref="char"/> in |
| | 63 | | /// each <see cref="TextWithTerminator"/> of <paramref name="texts"/>. |
| | 64 | | /// </para> |
| | 65 | | /// </remarks> |
| | 66 | | public static (TextWithTerminator fullText, ISet<char> terminators) GenerateFullText( |
| | 67 | | this TextWithTerminator[] texts) |
| 409 | 68 | | { |
| 409 | 69 | | if (texts.Length == 0) |
| 1 | 70 | | { |
| 1 | 71 | | return (new TextWithTerminator(""), new HashSet<char> { TextWithTerminator.DefaultTerminator }); |
| | 72 | | } |
| | 73 | |
|
| 408 | 74 | | if (texts.Length == 1) |
| 283 | 75 | | { |
| 283 | 76 | | var singleText = texts.Single(); |
| 283 | 77 | | return (singleText, new HashSet<char> { singleText.Terminator }); |
| | 78 | | } |
| | 79 | |
|
| 125 | 80 | | var fullTextContent = Enumerable.Empty<char>(); |
| 125 | 81 | | var terminators = new HashSet<char>(); |
| | 82 | |
|
| 125 | 83 | | var queue = new Queue<TextWithTerminator>(); |
| 983 | 84 | | foreach (var text in texts) |
| 305 | 85 | | { |
| 305 | 86 | | if (!terminators.Add(text.Terminator)) |
| 2 | 87 | | throw new ArgumentException("Terminators should be unique.", nameof(texts)); |
| | 88 | |
|
| 303 | 89 | | queue.Enqueue(text); |
| | 90 | |
|
| 303 | 91 | | if (queue.Count > 1) |
| 178 | 92 | | { |
| 178 | 93 | | var previousText = queue.Dequeue(); |
| 178 | 94 | | fullTextContent = fullTextContent.Concat(previousText); |
| 178 | 95 | | } |
| 303 | 96 | | } |
| | 97 | |
|
| 123 | 98 | | var lastText = queue.Dequeue(); |
| 123 | 99 | | fullTextContent = string.Concat(fullTextContent.Concat(lastText.Text)); |
| 123 | 100 | | var fullText = new TextWithTerminator(fullTextContent, lastText.Terminator); |
| | 101 | |
|
| 123 | 102 | | return (fullText, terminators); |
| 407 | 103 | | } |
| | 104 | |
|
| | 105 | | /// <summary> |
| | 106 | | /// Builds the Cumulative Distribution Function (CDF) of the provided <paramref name="terminators"/> in the |
| | 107 | | /// provided <paramref name="fullText"/>. |
| | 108 | | /// </summary> |
| | 109 | | /// <param name="fullText"> |
| | 110 | | /// The text, composed of a single or multiple concatenated <see cref="TextWithTerminator"/>. |
| | 111 | | /// </param> |
| | 112 | | /// <param name="terminators"> |
| | 113 | | /// The set of terminators of <paramref name="fullText"/>. |
| | 114 | | /// </param> |
| | 115 | | /// <returns> |
| | 116 | | /// A lazy-generated sequence of integers, representing the values of the CDF, indexed by char index in |
| | 117 | | /// <paramref name="fullText"/>. |
| | 118 | | /// </returns> |
| | 119 | | /// <remarks> |
| | 120 | | /// <para id="definition"> |
| | 121 | | /// DEFINITION |
| | 122 | | /// <br/> |
| | 123 | | /// - The Cumulative Distribution Function of a text T with terminators t is a function CDF such that CDF at |
| | 124 | | /// index i is the number of chars in T up to index i included which are in t. |
| | 125 | | /// <br/> |
| | 126 | | /// - In other terms, <c>CDF[i] = sum(j = 0 to i, isTerminator(T[j]))</c>, where |
| | 127 | | /// <c>isTerminator(c) = 1 if c is in t, 0 otherwise</c>. |
| | 128 | | /// </para> |
| | 129 | | /// <para id="algorithm"> |
| | 130 | | /// ALGORITHM |
| | 131 | | /// <br/> |
| | 132 | | /// - The definition is applied, iterating over the chars of <paramref name="fullText"/> and yielding item by |
| | 133 | | /// item. |
| | 134 | | /// <br/> |
| | 135 | | /// - The algorithm is online. |
| | 136 | | /// </para> |
| | 137 | | /// <para id="complexity"> |
| | 138 | | /// COMPLEXITY |
| | 139 | | /// <br/> |
| | 140 | | /// - Time Complexity is O(n * Ttc), where n is the length of <paramref name="fullText"/> and Ttc is the time |
| | 141 | | /// required to check whether a char of the text is a terminator or not. |
| | 142 | | /// <br/> |
| | 143 | | /// - The "terminator checking" performance depends on the implementation of <see cref="ISet{T}"/>, |
| | 144 | | /// <paramref name="terminators"/> is an instance of. |
| | 145 | | /// <br/> |
| | 146 | | /// - If <paramref name="terminators"/> is an <see cref="HashSet{T}"/>, |
| | 147 | | /// <see cref="ICollection{T}.Contains(T)"/> is executed in constant time, and Time Complexity becomes O(n). |
| | 148 | | /// <br/> |
| | 149 | | /// - Space Complexity is O(n) in all cases, since the only data structure instantiated by the algorithm is the |
| | 150 | | /// output, which has as many items as the input text. |
| | 151 | | /// </para> |
| | 152 | | /// </remarks> |
| | 153 | | public static IEnumerable<int> BuildTerminatorsCDF(TextWithTerminator fullText, ISet<char> terminators) |
| 14 | 154 | | { |
| 14 | 155 | | var numberOfTerminators = 0; |
| 172 | 156 | | for (var i = 0; i < fullText.Length; i++) |
| 72 | 157 | | { |
| 72 | 158 | | if (terminators.Contains(fullText[i])) |
| 34 | 159 | | numberOfTerminators++; |
| 72 | 160 | | yield return numberOfTerminators; |
| 72 | 161 | | } |
| 14 | 162 | | } |
| | 163 | |
|
| | 164 | | /// <summary> |
| | 165 | | /// Builds the Terminators Index Map of the provided <paramref name="terminators"/> in the provided |
| | 166 | | /// <paramref name="fullText"/>. |
| | 167 | | /// </summary> |
| | 168 | | /// <param name="fullText"> |
| | 169 | | /// The text, composed of a single or multiple concatenated <see cref="TextWithTerminator"/>. |
| | 170 | | /// </param> |
| | 171 | | /// <param name="terminators"> |
| | 172 | | /// The set of terminators of <paramref name="fullText"/>. |
| | 173 | | /// </param> |
| | 174 | | /// <returns> |
| | 175 | | /// A lazy-generated sequence of integers, representing the values of the index map, indexed by char index in |
| | 176 | | /// <paramref name="fullText"/>. |
| | 177 | | /// </returns> |
| | 178 | | /// <remarks> |
| | 179 | | /// <para id="definition"> |
| | 180 | | /// DEFINITION |
| | 181 | | /// <br/> |
| | 182 | | /// - The terminators index map of a text T with terminators t is a function TIM such that TIM at |
| | 183 | | /// index i is the index in T of the last encountered terminator of T up to index i included. |
| | 184 | | /// <br/> |
| | 185 | | /// - In other terms, <c>TIM[i] = max(j = 0 to i, isTerminator(T[j]))</c>, where |
| | 186 | | /// <c>isTerminator(c) = 1 if c is in t, 0 otherwise</c>. |
| | 187 | | /// </para> |
| | 188 | | /// <para id="algorithm"> |
| | 189 | | /// ALGORITHM |
| | 190 | | /// <br/> |
| | 191 | | /// - The definition is applied, iterating over the chars of <paramref name="fullText"/> and yielding the index |
| | 192 | | /// of a new terminator each time one is encountered, for all indexes of the text from the one following the |
| | 193 | | /// last emitted, to the current one, included. |
| | 194 | | /// <br/> |
| | 195 | | /// - The algorithm keeps track of the last emitted index and assumes the full text terminates with a |
| | 196 | | /// terminator (which is the case for every well-formed <see cref="TextWithTerminator"/> issued by |
| | 197 | | /// <see cref="GenerateFullText(TextWithTerminator[])"/>). |
| | 198 | | /// <br/> |
| | 199 | | /// - The algorithm is online. However, it requires consuming the input up to the next terminator, in order to |
| | 200 | | /// emit values for indexes of chars in the text which refer to such terminator. |
| | 201 | | /// <br/> |
| | 202 | | /// - For example, if the text is <c>"ab1cde2fghilm3"</c>, where terminators are <c>new[] {'1', '2', '3'}</c>, |
| | 203 | | /// emitting the first item of the output sequence requires consuming the input up to the terminator '1', |
| | 204 | | /// emitting the second item of the output sequence requires consuming the input up to the terminator '2', |
| | 205 | | /// etc. |
| | 206 | | /// </para> |
| | 207 | | /// <para id="complexity"> |
| | 208 | | /// COMPLEXITY |
| | 209 | | /// <br/> |
| | 210 | | /// - Time Complexity is O(n * Ttc), where n is the length of <paramref name="fullText"/> and Ttc is the time |
| | 211 | | /// required to check whether a char of the text is a terminator or not. |
| | 212 | | /// <br/> |
| | 213 | | /// - The "terminator checking" performance depends on the implementation of <see cref="ISet{T}"/>, |
| | 214 | | /// <paramref name="terminators"/> is an instance of. |
| | 215 | | /// <br/> |
| | 216 | | /// - If <paramref name="terminators"/> is an <see cref="HashSet{T}"/>, |
| | 217 | | /// <see cref="ICollection{T}.Contains(T)"/> is executed in constant time, and Time Complexity becomes O(n). |
| | 218 | | /// <br/> |
| | 219 | | /// - Space Complexity is O(n) in all cases, since the only data structure instantiated by the algorithm is the |
| | 220 | | /// output, which has as many items as the input text. |
| | 221 | | /// </para> |
| | 222 | | /// </remarks> |
| | 223 | | public static IEnumerable<int> BuildTerminatorsIndexMap(TextWithTerminator fullText, ISet<char> terminators) |
| 121 | 224 | | { |
| 121 | 225 | | var lastIndexProcessed = -1; |
| 1900 | 226 | | for (var i = 0; i < fullText.Length; i++) |
| 829 | 227 | | { |
| 829 | 228 | | if (terminators.Contains(fullText[i])) |
| 206 | 229 | | { |
| 2070 | 230 | | for (var j = lastIndexProcessed + 1; j <= i; j++) |
| 829 | 231 | | { |
| 829 | 232 | | yield return i; |
| 829 | 233 | | } |
| 206 | 234 | | lastIndexProcessed = i; |
| 206 | 235 | | } |
| 829 | 236 | | } |
| 121 | 237 | | } |
| | 238 | | } |