| | 1 | | using MoreLinq; |
| | 2 | | using MoreStructures.Utilities; |
| | 3 | |
|
| | 4 | | namespace MoreStructures.BurrowsWheelerTransform.Builders; |
| | 5 | |
|
| | 6 | | /// <summary> |
| | 7 | | /// <inheritdoc cref="IBuilder"/> |
| | 8 | | /// This implementation adopts the simplest approach at <see cref="BWMatrix"/> building, which results in a more than |
| | 9 | | /// quadratic time and space. <see cref="BWTransform"/> is calculated via the <see cref="BWMatrix"/>, therefore same |
| | 10 | | /// level of Time and Space Complexity. |
| | 11 | | /// </summary> |
| | 12 | | /// <remarks> |
| | 13 | | /// Check specific builder methods, such as <see cref="BuildMatrix(MoreStructures.TextWithTerminator)"/>, for further |
| | 14 | | /// information about the complexity of each operation. |
| | 15 | | /// </remarks> |
| | 16 | | public class NaiveBuilder : IBuilder |
| | 17 | | { |
| | 18 | | /// <inheritdoc cref="IBuilder" path="//*[not(self::remarks)]"/> |
| | 19 | | /// <remarks> |
| | 20 | | /// <para id="complexity"> |
| | 21 | | /// COMPLEXITY |
| | 22 | | /// <br/> |
| | 23 | | /// Since this operation requires computing a n * n matrix, where n is the |
| | 24 | | /// <see cref="TextWithTerminator.Length"/> of <paramref name="text"/>, it can be intensive operation, both in |
| | 25 | | /// time. |
| | 26 | | /// <br/> |
| | 27 | | /// Time Complexity: |
| | 28 | | /// <br/> |
| | 29 | | /// - Sorting a large number of strings on a large non-constant alphabet takes n * log(n) * m, where m is |
| | 30 | | /// the cost of a comparison of two n-sized strings, which is O(n). |
| | 31 | | /// <br/> |
| | 32 | | /// - Therefore Time Complexity is O(n^2 * log(n)). |
| | 33 | | /// <br/> |
| | 34 | | /// - If the alphabet can be considered of constant size and comparison between two strings happens in |
| | 35 | | /// constant time, the complexity is O(n * log(n)). |
| | 36 | | /// <br/> |
| | 37 | | /// Space Complexity: |
| | 38 | | /// <br/> |
| | 39 | | /// - The output is a n * n matrix of chars (all cyclic rotations of a n-sized string). |
| | 40 | | /// <br/> |
| | 41 | | /// - Therefore Space Complexity is O(n^2 * m), when no assumption is made on the size of a char being |
| | 42 | | /// constant, where m = log(w, M), with w = size of a word in memory and M = size of the alphabet. |
| | 43 | | /// <br/> |
| | 44 | | /// - If the alphabet can be considered of constant size, the complexity is O(n^2). |
| | 45 | | /// </para> |
| | 46 | | /// </remarks> |
| | 47 | | public virtual BWMatrix BuildMatrix(TextWithTerminator text) |
| 20 | 48 | | { |
| 20 | 49 | | var stringsComparer = StringIncludingTerminatorComparer.Build(text.Terminator); |
| 20 | 50 | | var content = Enumerable |
| 20 | 51 | | .Range(0, text.Length) |
| 188 | 52 | | .Select(i => new string(text.Skip(i).Take(text.Length - i).Concat(text.Take(i)).ToArray())) |
| 188 | 53 | | .OrderBy(i => i, stringsComparer) |
| 20 | 54 | | .ToValueReadOnlyCollection(); |
| | 55 | |
|
| 20 | 56 | | return new(text, content); |
| 20 | 57 | | } |
| | 58 | |
|
| | 59 | | /// <inheritdoc/> |
| | 60 | | public BWMatrix BuildMatrix(BWTransform lastBWMColumn) |
| 20 | 61 | | { |
| 20 | 62 | | var allBWMColumnsExceptLast = BuildAllBWMColumnsExceptLast(lastBWMColumn.Content); |
| 20 | 63 | | var allBWMRows = ( |
| 20 | 64 | | from rowIndex in Enumerable.Range(0, lastBWMColumn.Length) |
| 264 | 65 | | let rowExceptLastChar = string.Concat( |
| 264 | 66 | | from columnIndex in Enumerable.Range(0, allBWMColumnsExceptLast.Count) |
| 4456 | 67 | | select allBWMColumnsExceptLast[columnIndex][rowIndex]) |
| 264 | 68 | | select rowExceptLastChar + lastBWMColumn.Content[rowIndex]) |
| 20 | 69 | | .ToList(); // The Burrows-Wheeler Matrix requires direct random access to any of its items |
| 20 | 70 | | return new(lastBWMColumn.Text, allBWMRows); |
| 20 | 71 | | } |
| | 72 | |
|
| | 73 | | /// <inheritdoc/> |
| 20 | 74 | | public virtual BWTransform BuildTransform(BWMatrix matrix) => matrix.Transform; |
| | 75 | |
|
| | 76 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 77 | | /// <remarks> |
| | 78 | | /// <inheritdoc/> |
| | 79 | | /// <para id="complexity"> |
| | 80 | | /// COMPLEXITY |
| | 81 | | /// <br/> |
| | 82 | | /// - Done without constructing the <see cref="BWMatrix"/> of <paramref name="text"/>, which would requires |
| | 83 | | /// O(n^2) space. |
| | 84 | | /// <br/> |
| | 85 | | /// - Instead, n <see cref="VirtuallyRotatedTextWithTerminator"/> objects are created (one per char of |
| | 86 | | /// <paramref name="text"/>), mapping a specific rotation of the original <paramref name="text"/> and taking |
| | 87 | | /// into account the rotation in its all its char-position dependent functionalities, such as |
| | 88 | | /// <see cref="VirtuallyRotatedTextWithTerminator.CompareTo(VirtuallyRotatedTextWithTerminator?)"/>, |
| | 89 | | /// <see cref="VirtuallyRotatedTextWithTerminator.GetEnumerator"/> etc. |
| | 90 | | /// </para> |
| | 91 | | /// </remarks> |
| | 92 | | public virtual BWTransform BuildTransform(TextWithTerminator text) |
| 61 | 93 | | { |
| 61 | 94 | | var content = Enumerable |
| 61 | 95 | | .Range(0, text.Length) |
| 10401 | 96 | | .Select(i => text.ToVirtuallyRotated(i)) |
| 10401 | 97 | | .OrderBy(i => i) |
| 9304 | 98 | | .Select(vrtext => vrtext[^1]); |
| 61 | 99 | | return new(text, new(content, text.Terminator)); |
| 61 | 100 | | } |
| | 101 | |
|
| | 102 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 103 | | /// <remarks> |
| | 104 | | /// <para id="complexity"> |
| | 105 | | /// COMPLEXITY |
| | 106 | | /// <br/> |
| | 107 | | /// - No computation to be done, except for building the string of the <see cref="TextWithTerminator"/>. |
| | 108 | | /// <br/> |
| | 109 | | /// - Time Complexity = O(n), Space Complexity = O(n), where n = edge of <paramref name="matrix"/>. |
| | 110 | | /// </para> |
| | 111 | | /// </remarks> |
| | 112 | | public virtual TextWithTerminator InvertMatrix(BWMatrix matrix) |
| 20 | 113 | | { |
| 20 | 114 | | var firstBWMRow = matrix.Content[0]; // In the form "$..." where $ is separator |
| 20 | 115 | | return new TextWithTerminator(firstBWMRow[1..].AsValue(), firstBWMRow[0]); |
| 20 | 116 | | } |
| | 117 | |
|
| | 118 | | /// <inheritdoc path="//*[not(self::remarks)]"/> |
| | 119 | | /// <remarks> |
| | 120 | | /// <inheritdoc cref="IBuilder.InvertTransform(RotatedTextWithTerminator)" |
| | 121 | | /// path="/remarks/para[@id='terminator-required']"/> |
| | 122 | | /// <para id="algo"> |
| | 123 | | /// ALGORITHM |
| | 124 | | /// <br/> |
| | 125 | | /// This implementation inverts the BWT by iteratively building n+1-mers from n-mers. |
| | 126 | | /// <br/> |
| | 127 | | /// - 1-mers (first column of the matrix) is just the last column (BWT), sorted. |
| | 128 | | /// That gives a matrix M0 of 1 columns and n rows (where n = length of |
| | 129 | | /// <paramref name="lastBWMColumn"/>). |
| | 130 | | /// <br/> |
| | 131 | | /// - 2-mers are derived from 1-mers, by juxtaposing side-by-side last column (BWT) and M0, sorted. |
| | 132 | | /// That gives a matrix M1 of 2 columns and n rows. |
| | 133 | | /// <br/> |
| | 134 | | /// - 3-mers are derived from 2-mers, by juxtaposing side-by-side last column (BWT) and M1, sorted. |
| | 135 | | /// That gives a matrix M2 of 3 columns and n rows. |
| | 136 | | /// <br/> |
| | 137 | | /// - And so on, up to (n - 1)-mers and matrix M(n - 2) of n - 1 columns and n rows. |
| | 138 | | /// <br/> |
| | 139 | | /// - The last column is already known (BWT), so the text can be extracted from the first line: the first |
| | 140 | | /// char is the separator, the rest is the text without separator. |
| | 141 | | /// </para> |
| | 142 | | /// <para id="complexity"> |
| | 143 | | /// COMPLEXITY |
| | 144 | | /// <br/> |
| | 145 | | /// - There are n top-level iterations, where n is the length of <paramref name="lastBWMColumn"/>. |
| | 146 | | /// <br/> |
| | 147 | | /// - Each iteration takes n * log(n) * m time to sort, where m is the length of strings to compare = n. |
| | 148 | | /// <br/> |
| | 149 | | /// - So total Time Complexity is O(n^3 * log(n)) and Space Complexity is O(n^2). |
| | 150 | | /// </para> |
| | 151 | | /// </remarks> |
| | 152 | | public virtual TextWithTerminator InvertTransform(RotatedTextWithTerminator lastBWMColumn) |
| 5 | 153 | | { |
| 5 | 154 | | var allBWMColumnsExceptLast = BuildAllBWMColumnsExceptLast(lastBWMColumn); |
| | 155 | |
|
| | 156 | | // The text is in the first row of the matrix |
| 5 | 157 | | var text = |
| 5 | 158 | | Enumerable |
| 5 | 159 | | .Range(1, lastBWMColumn.Length - 2) // # columns in the BWM - 1 |
| 168 | 160 | | .Select(i => allBWMColumnsExceptLast[i][0]) |
| 5 | 161 | | .Concat(new char[] { lastBWMColumn[0] }); |
| 5 | 162 | | return new TextWithTerminator(text, lastBWMColumn.Terminator); |
| 5 | 163 | | } |
| | 164 | |
|
| | 165 | | private static IList<IList<char>> BuildAllBWMColumnsExceptLast(RotatedTextWithTerminator lastBWMColumn) |
| 25 | 166 | | { |
| 25 | 167 | | var stringsComparer = StringIncludingTerminatorComparer.Build(lastBWMColumn.Terminator); |
| | 168 | |
|
| 25 | 169 | | var allBWMColumnsExceptLast = new List<IList<char>> { }; |
| 685 | 170 | | foreach (var columnIndex in Enumerable.Range(0, lastBWMColumn.Length - 1)) // # columns in the BWM - 1 |
| 305 | 171 | | { |
| 305 | 172 | | var nextBWMColumn = ( |
| 305 | 173 | | from rowIndex in Enumerable.Range(0, lastBWMColumn.Length) // # rows in the BWM |
| 5240 | 174 | | let lastElementOfRow = Enumerable.Repeat(lastBWMColumn[rowIndex], 1) |
| 5240 | 175 | | let ithFirstElementsOfRow = ( |
| 5240 | 176 | | from j in Enumerable.Range(0, columnIndex) |
| 54050 | 177 | | select allBWMColumnsExceptLast[j][rowIndex]) |
| 305 | 178 | | let lastAndIthFirstElementsOfRow = |
| 5240 | 179 | | string.Concat(lastElementOfRow.Concat(ithFirstElementsOfRow)) |
| 5240 | 180 | | select lastAndIthFirstElementsOfRow) |
| 5240 | 181 | | .OrderBy(lastAndIthFirstElementsOfRow => lastAndIthFirstElementsOfRow, stringsComparer) |
| 5545 | 182 | | .Select(lastAndIthFirstElementsOfRow => lastAndIthFirstElementsOfRow[^1]); |
| 305 | 183 | | allBWMColumnsExceptLast.Add(nextBWMColumn.ToList()); |
| 305 | 184 | | } |
| | 185 | |
|
| 25 | 186 | | return allBWMColumnsExceptLast; |
| 25 | 187 | | } |
| | 188 | | } |