@@ -186,13 +186,38 @@ type BWT struct {
186186 // represented as a list of skipEntries because the first column of
187187 // the BWT is always lexicographically ordered. This saves time and memory.
188188 firstColumnSkipList []skipEntry
189- // Column last column of the BWT- the actual textual representation
190- // of the BWT.
191- lastColumn waveletTree
192189 // suffixArray an array that allows us to map a position in the first
193190 // column to a position in the original sequence. This is needed to be
194191 // able to extract text from the BWT.
195192 suffixArray []int
193+ // runLengthCompressedBWT is the compressed version of the BWT. The compression
194+ // is for each run. For Example:
195+ // the sequence "banana" has BWT "annb$aa"
196+ // the run length compression of "annb$aa" is "anb$a"
197+ // This helps us save a lot of memory while still having a search index we can
198+ // use to align the original sequence. This allows us to understand how many
199+ // runs of a certain character there are and where a run of a certain rank exists.
200+ runBWTCompression waveletTree
201+ // runStartPositions are the starting position of each run in the original sequence
202+ // For example:
203+ // "annb$aa" will have the runStartPositions [0, 1, 3, 4, 5]
204+ // This helps us map our search range from "uncompressed BWT Space" to its
205+ // "compressed BWT Run Space". With this, we can understand which runs we need
206+ // to consider during LF mapping.
207+ runStartPositions runInfo
208+ // runCumulativeCounts is the cumulative count of characters for each run.
209+ // This helps us efficiently lookup the number of occurrences of a given
210+ // character before a given offset in "uncompressed BWT Space"
211+ // For Example:
212+ // "annb$aa" will have the runCumulativeCounts:
213+ // "a": [0, 1, 3],
214+ // "n": [0, 2],
215+ // "b": [0, 1],
216+ // "$": [0, 1],
217+ runCumulativeCounts map [string ]runInfo
218+
219+ // flag for turning on BWT debugging
220+ debug bool
196221}
197222
198223// Count represents the number of times the provided pattern
@@ -269,7 +294,34 @@ func (bwt BWT) Len() int {
269294
270295// GetTransform returns the last column of the BWT transform of the original sequence.
271296func (bwt BWT ) GetTransform () string {
272- return bwt .lastColumn .reconstruct ()
297+ lastColumn := strings.Builder {}
298+ lastColumn .Grow (bwt .getLenOfOriginalStringWithNullChar ())
299+ for i := 0 ; i < bwt .runBWTCompression .length ; i ++ {
300+ currChar := bwt .runBWTCompression .Access (i )
301+ var currCharEnd int
302+ if i + 1 >= len (bwt .runStartPositions ) {
303+ currCharEnd = bwt .getLenOfOriginalStringWithNullChar ()
304+ } else {
305+ currCharEnd = bwt .runStartPositions [i + 1 ]
306+ }
307+ for lastColumn .Len () < currCharEnd {
308+ lastColumn .WriteByte (currChar )
309+ }
310+ }
311+ return lastColumn .String ()
312+ }
313+
314+ //lint:ignore U1000 Ignore unused function. This is valuable for future debugging
315+ func (bwt BWT ) getFirstColumnStr () string {
316+ firstColumn := strings.Builder {}
317+ firstColumn .Grow (bwt .getLenOfOriginalStringWithNullChar ())
318+ for i := 0 ; i < len (bwt .firstColumnSkipList ); i ++ {
319+ e := bwt .firstColumnSkipList [i ]
320+ for j := e .openEndedInterval .start ; j < e .openEndedInterval .end ; j ++ {
321+ firstColumn .WriteByte (e .char )
322+ }
323+ }
324+ return firstColumn .String ()
273325}
274326
275327// getFCharPosFromOriginalSequenceCharPos looks up mapping from the original position
@@ -291,21 +343,55 @@ func (bwt BWT) getFCharPosFromOriginalSequenceCharPos(originalPos int) int {
291343func (bwt BWT ) lfSearch (pattern string ) interval {
292344 searchRange := interval {start : 0 , end : bwt .getLenOfOriginalStringWithNullChar ()}
293345 for i := 0 ; i < len (pattern ); i ++ {
346+ if bwt .debug {
347+ printLFDebug (bwt , searchRange , i )
348+ }
294349 if searchRange .end - searchRange .start <= 0 {
295350 return interval {}
296351 }
297352
298353 c := pattern [len (pattern )- 1 - i ]
299- skip , ok := bwt .lookupSkipByChar (c )
300- if ! ok {
301- return interval {}
302- }
303- searchRange .start = skip .openEndedInterval .start + bwt .lastColumn .Rank (c , searchRange .start )
304- searchRange .end = skip .openEndedInterval .start + bwt .lastColumn .Rank (c , searchRange .end )
354+ nextStart := bwt .getNextLfSearchOffset (c , searchRange .start )
355+ nextEnd := bwt .getNextLfSearchOffset (c , searchRange .end )
356+ searchRange .start = nextStart
357+ searchRange .end = nextEnd
305358 }
306359 return searchRange
307360}
308361
362+ func (bwt BWT ) getNextLfSearchOffset (c byte , offset int ) int {
363+ nearestRunStart := bwt .runStartPositions .FindNearestRunStartPosition (offset + 1 )
364+ maxRunInCompressedSpace := bwt .runBWTCompression .Rank (c , nearestRunStart )
365+
366+ skip , ok := bwt .lookupSkipByChar (c )
367+ if ! ok {
368+ return 0
369+ }
370+
371+ cumulativeCounts , ok := bwt .runCumulativeCounts [string (c )]
372+ if ! ok {
373+ return 0
374+ }
375+
376+ cumulativeCountBeforeMaxRun := cumulativeCounts [maxRunInCompressedSpace ]
377+
378+ currRunStart := bwt .runStartPositions .FindNearestRunStartPosition (offset )
379+ currentRunChar := string (bwt .runBWTCompression .Access (currRunStart ))
380+ extraOffset := 0
381+ // It is possible that an offset currently lies within a run of the same
382+ // character we are inspecting. In this case, cumulativeCountBeforeMaxRun
383+ // is not enough since the Max Run in this case does not include the run
384+ // the offset is currently in. To adjust for this, we must count the number
385+ // of character occurrences since the beginning of the run that the offset
386+ // is currently in.
387+ if c == currentRunChar [0 ] {
388+ o := bwt .runStartPositions [nearestRunStart ]
389+ extraOffset += offset - o
390+ }
391+
392+ return skip .openEndedInterval .start + cumulativeCountBeforeMaxRun + extraOffset
393+ }
394+
309395// lookupSkipByChar looks up a skipEntry by its character in the First Column
310396func (bwt BWT ) lookupSkipByChar (c byte ) (entry skipEntry , ok bool ) {
311397 for i := range bwt .firstColumnSkipList {
@@ -372,30 +458,64 @@ func New(sequence string) (BWT, error) {
372458 sortPrefixArray (prefixArray )
373459
374460 suffixArray := make ([]int , len (sequence ))
375- lastColBuilder := strings.Builder {}
461+ charCount := 0
462+ runBWTCompressionBuilder := strings.Builder {}
463+ var runStartPositions runInfo
464+ runCumulativeCounts := make (map [string ]runInfo )
465+
466+ var prevChar * byte
376467 for i := 0 ; i < len (prefixArray ); i ++ {
377468 currChar := sequence [getBWTIndex (len (sequence ), len (prefixArray [i ]))]
378- lastColBuilder .WriteByte (currChar )
469+ if prevChar == nil {
470+ prevChar = & currChar
471+ }
472+
473+ if currChar != * prevChar {
474+ runBWTCompressionBuilder .WriteByte (* prevChar )
475+ runStartPositions = append (runStartPositions , i - charCount )
476+ addRunCumulativeCountEntry (runCumulativeCounts , * prevChar , charCount )
379477
478+ charCount = 0
479+ prevChar = & currChar
480+ }
481+
482+ charCount ++
380483 suffixArray [i ] = len (sequence ) - len (prefixArray [i ])
381484 }
485+ runBWTCompressionBuilder .WriteByte (* prevChar )
486+ runStartPositions = append (runStartPositions , len (prefixArray )- charCount )
487+ addRunCumulativeCountEntry (runCumulativeCounts , * prevChar , charCount )
488+
382489 fb := strings.Builder {}
383490 for i := 0 ; i < len (prefixArray ); i ++ {
384491 fb .WriteByte (prefixArray [i ][0 ])
385492 }
386493
387- wt , err := newWaveletTreeFromString (lastColBuilder .String ())
494+ skipList := buildSkipList (prefixArray )
495+
496+ wt , err := newWaveletTreeFromString (runBWTCompressionBuilder .String ())
388497 if err != nil {
389498 return BWT {}, err
390499 }
391-
392500 return BWT {
393- firstColumnSkipList : buildSkipList (prefixArray ),
394- lastColumn : wt ,
501+ firstColumnSkipList : skipList ,
395502 suffixArray : suffixArray ,
503+ runBWTCompression : wt ,
504+ runStartPositions : runStartPositions ,
505+ runCumulativeCounts : runCumulativeCounts ,
396506 }, nil
397507}
398508
509+ func addRunCumulativeCountEntry (rumCumulativeCounts map [string ]runInfo , char byte , charCount int ) {
510+ cumulativeCountsOfChar , ok := rumCumulativeCounts [string (char )]
511+ if ok {
512+ cumulativeCountsOfChar = append (cumulativeCountsOfChar , charCount + cumulativeCountsOfChar [len (cumulativeCountsOfChar )- 1 ])
513+ } else {
514+ cumulativeCountsOfChar = runInfo {0 , charCount }
515+ }
516+ rumCumulativeCounts [string (char )] = cumulativeCountsOfChar
517+ }
518+
399519// buildSkipList compressed the First Column of the BWT into a skip list
400520func buildSkipList (prefixArray []string ) []skipEntry {
401521 prevChar := prefixArray [0 ][0 ]
@@ -457,6 +577,38 @@ func bwtRecovery(operation string, err *error) {
457577 }
458578}
459579
580+ // runInfo each element of runInfo should represent an offset i where i
581+ // corresponds to the start of a run in a given sequence. For example,
582+ // aaaabbccc would have the run info [0, 4, 6]
583+ type runInfo []int
584+
585+ // FindNearestRunStartPosition given some offset, find the nearest starting position for the.
586+ // beginning of a run. Another way of saying this is give me the max i where runStartPositions[i] <= offset.
587+ // This is needed so we can understand which run an offset is a part of.
588+ func (r runInfo ) FindNearestRunStartPosition (offset int ) int {
589+ start := 0
590+ end := len (r ) - 1
591+ for start < end {
592+ mid := start + (end - start )/ 2
593+ if r [mid ] < offset {
594+ start = mid + 1
595+ continue
596+ }
597+ if r [mid ] > offset {
598+ end = mid - 1
599+ continue
600+ }
601+
602+ return mid
603+ }
604+
605+ if r [start ] > offset {
606+ return start - 1
607+ }
608+
609+ return start
610+ }
611+
460612func isValidPattern (s string ) (err error ) {
461613 if len (s ) == 0 {
462614 return errors .New ("Pattern can not be empty" )
@@ -480,3 +632,58 @@ func validateSequenceBeforeTransforming(sequence *string) (err error) {
480632 }
481633 return nil
482634}
635+
636+ // printLFDebug this will print the first column and last column of the BWT along with some ascii visualizations.
637+ // This is very helpful for debugging the LF mapping. For example, lets say you're in the middle of making some changes to the LF
638+ // mapping and the test for counting starts to fail. To understand where the LF search is going wrong, you
639+ // can do something like the below to outline which parts of the BWT are being searched some given iteration.
640+ //
641+ // For Example, if you had the BWT of:
642+ // "rowrowrowyourboat"
643+ // and wanted to Count the number of occurrences of "row"
644+ // Then the iterations of the LF search would look something like:
645+ //
646+ // BWT Debug Begin Iteration: 0
647+ // torbyrrru$wwaoooow
648+ // $abooooorrrrtuwwwy
649+ // ^^^^^^^^^^^^^^^^^^X
650+ //
651+ // BWT Debug Begin Iteration: 1
652+ // torbyrrru$wwaoooow
653+ // $abooooorrrrtuwwwy
654+ // ______________^^^X
655+ //
656+ // BWT Debug Begin Iteration: 2
657+ // torbyrrru$wwaoooow
658+ // $abooooorrrrtuwwwy
659+ // _____^^^X
660+ //
661+ // Where:
662+ // * '^' denotes the active search range
663+ // * 'X' denotes one character after the end of the active search searchRange
664+ // * '_' is visual padding to help align the active search range
665+ //
666+ // NOTE: It can also be helpful to include the other auxiliary data structures. For example, it can be very helpful to include
667+ // a similar visualization for the run length compression to help debug and understand which run were used to compute the active
668+ // search window during each iteration.
669+ func printLFDebug (bwt BWT , searchRange interval , iteration int ) {
670+ first := bwt .getFirstColumnStr ()
671+ last := bwt .GetTransform ()
672+ lastRunCompression := bwt .runBWTCompression .reconstruct ()
673+
674+ fullASCIIRange := strings.Builder {}
675+ fullASCIIRange .Grow (searchRange .end + 1 )
676+ for i := 0 ; i < searchRange .start ; i ++ {
677+ fullASCIIRange .WriteRune ('_' )
678+ }
679+ for i := searchRange .start ; i < searchRange .end ; i ++ {
680+ fullASCIIRange .WriteRune ('^' )
681+ }
682+ fullASCIIRange .WriteRune ('X' )
683+
684+ fmt .Println ("BWT Debug Begin Iteration:" , iteration )
685+ fmt .Println (last )
686+ fmt .Println (first )
687+ fmt .Println (fullASCIIRange .String ())
688+ fmt .Println (lastRunCompression )
689+ }
0 commit comments