| | 1 | | using MoreStructures.KnuthMorrisPratt.PrefixFunction; |
| | 2 | |
|
| | 3 | | namespace MoreStructures.KnuthMorrisPratt.Borders; |
| | 4 | |
|
| | 5 | | /// <inheritdoc cref="IBordersExtraction" path="//*[not(self::summary or self::remarks)]"/> |
| | 6 | | /// <summary> |
| | 7 | | /// An implementation of <see cref="IBordersExtraction"/> which first calculate the prefix function of the text, and |
| | 8 | | /// then uses it to calculate the borders efficiently. |
| | 9 | | /// </summary> |
| | 10 | | /// <remarks> |
| | 11 | | /// <para id="algo"> |
| | 12 | | /// ALGORITHM |
| | 13 | | /// <br/> |
| | 14 | | /// - The algorithm takes advantage of the following property of borders: any non-longest border of the text is a |
| | 15 | | /// also a border of the longest border of the text. |
| | 16 | | /// <br/> |
| | 17 | | /// - That is, if T[0..w] is a border of T of length w, and T has longest border T[0..lb] of length lb, where |
| | 18 | | /// 0 < w < lb, then T[0..w] is also border of T[0..lb]. That is: any non-longest border of T is also |
| | 19 | | /// border of the longest border of T. |
| | 20 | | /// <br/> |
| | 21 | | /// - Given the Prefix Function s of T, the length of the longest border of T can be calculated as s(n - 1), where |
| | 22 | | /// n is the length of T. The longest border of the longest border can be calculated as s(s(n - 1) - 1). And so |
| | 23 | | /// on... |
| | 24 | | /// <br/> |
| | 25 | | /// - Therefore, borders are found iteratively: the next border (shorter than the longest) can be calculated as the |
| | 26 | | /// longest border of the border found at the previous iteration, by using the Prefix Function on the predecessor |
| | 27 | | /// of the previous result: w(i) = s(w(i - 1) - 1); w(i + 1) = s(w(i) - 1) = s(s(w(i - 1) - 1) - 1), etc. |
| | 28 | | /// <br/> |
| | 29 | | /// - The iteration terminates when the Prefix Function returns 0, i.e. there is no border. |
| | 30 | | /// </para> |
| | 31 | | /// <para id="complexity"> |
| | 32 | | /// COMPLEXITY |
| | 33 | | /// <br/> |
| | 34 | | /// - Values of the Prefix Function have to be iterated up to the last, since the longest border of the provided |
| | 35 | | /// text T has a length equal to the value of the Prefix Function in n - 1, where n is the length of T. |
| | 36 | | /// <br/> |
| | 37 | | /// - Then there are as many iterations as the number of borders of T. In the worst case, T can have a number of |
| | 38 | | /// borders comparable to the chars in it. For example, the string <c>new string('a', n)</c> has n - 1 borders: |
| | 39 | | /// <c>new string('a', n - 1)</c>, <c>new string('a', n - 2)</c>, ..., "a". |
| | 40 | | /// <br/> |
| | 41 | | /// - For each iteration, the border is returned as new string, and that requires iterating over each char of the |
| | 42 | | /// border. In the worst case, borders of T can have a length comparable to the length of T. For example the |
| | 43 | | /// string <c>new string('a', n)</c> has a border of length <c>n - 1</c>. |
| | 44 | | /// <br/> |
| | 45 | | /// - Direct accessing of the list of values of the Prefix Function is done in constant time. |
| | 46 | | /// <br/> |
| | 47 | | /// - Storing the Prefix Function values requires storing n integers, each of constant size. |
| | 48 | | /// <br/> |
| | 49 | | /// - Every iteration the string of a single border is built and returned. |
| | 50 | | /// <br/> |
| | 51 | | /// - Therefore, Time Complexity is O(Tpf * n + n ^ 2) and Space Complexity is O(Spf * n + n), where Tpf and Spf |
| | 52 | | /// are the amortized cost of the Prefix Function values over the n chars of T. |
| | 53 | | /// </para> |
| | 54 | | /// </remarks> |
| | 55 | | public class PrefixFunctionBasedBorderExtraction : IBordersExtraction |
| | 56 | | { |
| | 57 | | /// <summary> |
| | 58 | | /// The <see cref="IPrefixFunctionCalculator"/> implementation to be used, to calculate the Prefix Function of |
| | 59 | | /// the text. |
| | 60 | | /// </summary> |
| 16 | 61 | | protected IPrefixFunctionCalculator PrefixFunctionCalculator { get; } |
| | 62 | |
|
| | 63 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 64 | | /// <remarks> |
| | 65 | | /// <inheritdoc cref="PrefixFunctionBasedBorderExtraction" path="/remarks"/> |
| | 66 | | /// </remarks> |
| 9 | 67 | | public PrefixFunctionBasedBorderExtraction(IPrefixFunctionCalculator prefixFunctionCalculator) |
| 9 | 68 | | { |
| 9 | 69 | | PrefixFunctionCalculator = prefixFunctionCalculator; |
| 9 | 70 | | } |
| | 71 | |
|
| | 72 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 73 | | /// <remarks> |
| | 74 | | /// <inheritdoc cref="PrefixFunctionBasedBorderExtraction" path="/remarks"/> |
| | 75 | | /// </remarks> |
| | 76 | | public IEnumerable<string> GetAllBordersByDescLength(string text) |
| 18 | 77 | | { |
| 18 | 78 | | if (text == string.Empty) |
| 2 | 79 | | yield break; |
| | 80 | |
|
| 16 | 81 | | var prefixFunctionValues = PrefixFunctionCalculator.GetValues(text).ToList(); |
| | 82 | |
|
| 16 | 83 | | var i = prefixFunctionValues[text.Length - 1]; |
| 40 | 84 | | while (i > 0) |
| 24 | 85 | | { |
| 24 | 86 | | yield return text[0..i]; |
| 24 | 87 | | i = prefixFunctionValues[i - 1]; |
| 24 | 88 | | } |
| 16 | 89 | | } |
| | 90 | | } |