Skip to content

Commit 05169d2

Browse files
Run length bwt (#440)
* run length bwt * better clarify mapping from original sequnce space to run space --------- Co-authored-by: Timothy Stiles <[email protected]>
1 parent 3248776 commit 05169d2

File tree

5 files changed

+409
-44
lines changed

5 files changed

+409
-44
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
### Added
1111
- Basic BWT for sub-sequence count and offset for sequence alignment. Only supports exact matches for now.
1212
- Moved `BWT`, `align`, and `mash` packages to new `search` sub-directory.
13+
- Implemented Run-Length Burrows Wheeler Transform.
1314

1415

1516
## [0.30.0] - 2023-12-18

search/bwt/bwt.go

Lines changed: 223 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -186,13 +186,38 @@ type BWT struct {
186186
// represented as a list of skipEntries because the first column of
187187
// the BWT is always lexicographically ordered. This saves time and memory.
188188
firstColumnSkipList []skipEntry
189-
// Column last column of the BWT- the actual textual representation
190-
// of the BWT.
191-
lastColumn waveletTree
192189
// suffixArray an array that allows us to map a position in the first
193190
// column to a position in the original sequence. This is needed to be
194191
// able to extract text from the BWT.
195192
suffixArray []int
193+
// runLengthCompressedBWT is the compressed version of the BWT. The compression
194+
// is for each run. For Example:
195+
// the sequence "banana" has BWT "annb$aa"
196+
// the run length compression of "annb$aa" is "anb$a"
197+
// This helps us save a lot of memory while still having a search index we can
198+
// use to align the original sequence. This allows us to understand how many
199+
// runs of a certain character there are and where a run of a certain rank exists.
200+
runBWTCompression waveletTree
201+
// runStartPositions are the starting position of each run in the original sequence
202+
// For example:
203+
// "annb$aa" will have the runStartPositions [0, 1, 3, 4, 5]
204+
// This helps us map our search range from "uncompressed BWT Space" to its
205+
// "compressed BWT Run Space". With this, we can understand which runs we need
206+
// to consider during LF mapping.
207+
runStartPositions runInfo
208+
// runCumulativeCounts is the cumulative count of characters for each run.
209+
// This helps us efficiently lookup the number of occurrences of a given
210+
// character before a given offset in "uncompressed BWT Space"
211+
// For Example:
212+
// "annb$aa" will have the runCumulativeCounts:
213+
// "a": [0, 1, 3],
214+
// "n": [0, 2],
215+
// "b": [0, 1],
216+
// "$": [0, 1],
217+
runCumulativeCounts map[string]runInfo
218+
219+
// flag for turning on BWT debugging
220+
debug bool
196221
}
197222

198223
// Count represents the number of times the provided pattern
@@ -269,7 +294,34 @@ func (bwt BWT) Len() int {
269294

270295
// GetTransform returns the last column of the BWT transform of the original sequence.
271296
func (bwt BWT) GetTransform() string {
272-
return bwt.lastColumn.reconstruct()
297+
lastColumn := strings.Builder{}
298+
lastColumn.Grow(bwt.getLenOfOriginalStringWithNullChar())
299+
for i := 0; i < bwt.runBWTCompression.length; i++ {
300+
currChar := bwt.runBWTCompression.Access(i)
301+
var currCharEnd int
302+
if i+1 >= len(bwt.runStartPositions) {
303+
currCharEnd = bwt.getLenOfOriginalStringWithNullChar()
304+
} else {
305+
currCharEnd = bwt.runStartPositions[i+1]
306+
}
307+
for lastColumn.Len() < currCharEnd {
308+
lastColumn.WriteByte(currChar)
309+
}
310+
}
311+
return lastColumn.String()
312+
}
313+
314+
//lint:ignore U1000 Ignore unused function. This is valuable for future debugging
315+
func (bwt BWT) getFirstColumnStr() string {
316+
firstColumn := strings.Builder{}
317+
firstColumn.Grow(bwt.getLenOfOriginalStringWithNullChar())
318+
for i := 0; i < len(bwt.firstColumnSkipList); i++ {
319+
e := bwt.firstColumnSkipList[i]
320+
for j := e.openEndedInterval.start; j < e.openEndedInterval.end; j++ {
321+
firstColumn.WriteByte(e.char)
322+
}
323+
}
324+
return firstColumn.String()
273325
}
274326

275327
// getFCharPosFromOriginalSequenceCharPos looks up mapping from the original position
@@ -291,21 +343,55 @@ func (bwt BWT) getFCharPosFromOriginalSequenceCharPos(originalPos int) int {
291343
func (bwt BWT) lfSearch(pattern string) interval {
292344
searchRange := interval{start: 0, end: bwt.getLenOfOriginalStringWithNullChar()}
293345
for i := 0; i < len(pattern); i++ {
346+
if bwt.debug {
347+
printLFDebug(bwt, searchRange, i)
348+
}
294349
if searchRange.end-searchRange.start <= 0 {
295350
return interval{}
296351
}
297352

298353
c := pattern[len(pattern)-1-i]
299-
skip, ok := bwt.lookupSkipByChar(c)
300-
if !ok {
301-
return interval{}
302-
}
303-
searchRange.start = skip.openEndedInterval.start + bwt.lastColumn.Rank(c, searchRange.start)
304-
searchRange.end = skip.openEndedInterval.start + bwt.lastColumn.Rank(c, searchRange.end)
354+
nextStart := bwt.getNextLfSearchOffset(c, searchRange.start)
355+
nextEnd := bwt.getNextLfSearchOffset(c, searchRange.end)
356+
searchRange.start = nextStart
357+
searchRange.end = nextEnd
305358
}
306359
return searchRange
307360
}
308361

362+
func (bwt BWT) getNextLfSearchOffset(c byte, offset int) int {
363+
nearestRunStart := bwt.runStartPositions.FindNearestRunStartPosition(offset + 1)
364+
maxRunInCompressedSpace := bwt.runBWTCompression.Rank(c, nearestRunStart)
365+
366+
skip, ok := bwt.lookupSkipByChar(c)
367+
if !ok {
368+
return 0
369+
}
370+
371+
cumulativeCounts, ok := bwt.runCumulativeCounts[string(c)]
372+
if !ok {
373+
return 0
374+
}
375+
376+
cumulativeCountBeforeMaxRun := cumulativeCounts[maxRunInCompressedSpace]
377+
378+
currRunStart := bwt.runStartPositions.FindNearestRunStartPosition(offset)
379+
currentRunChar := string(bwt.runBWTCompression.Access(currRunStart))
380+
extraOffset := 0
381+
// It is possible that an offset currently lies within a run of the same
382+
// character we are inspecting. In this case, cumulativeCountBeforeMaxRun
383+
// is not enough since the Max Run in this case does not include the run
384+
// the offset is currently in. To adjust for this, we must count the number
385+
// of character occurrences since the beginning of the run that the offset
386+
// is currently in.
387+
if c == currentRunChar[0] {
388+
o := bwt.runStartPositions[nearestRunStart]
389+
extraOffset += offset - o
390+
}
391+
392+
return skip.openEndedInterval.start + cumulativeCountBeforeMaxRun + extraOffset
393+
}
394+
309395
// lookupSkipByChar looks up a skipEntry by its character in the First Column
310396
func (bwt BWT) lookupSkipByChar(c byte) (entry skipEntry, ok bool) {
311397
for i := range bwt.firstColumnSkipList {
@@ -372,30 +458,64 @@ func New(sequence string) (BWT, error) {
372458
sortPrefixArray(prefixArray)
373459

374460
suffixArray := make([]int, len(sequence))
375-
lastColBuilder := strings.Builder{}
461+
charCount := 0
462+
runBWTCompressionBuilder := strings.Builder{}
463+
var runStartPositions runInfo
464+
runCumulativeCounts := make(map[string]runInfo)
465+
466+
var prevChar *byte
376467
for i := 0; i < len(prefixArray); i++ {
377468
currChar := sequence[getBWTIndex(len(sequence), len(prefixArray[i]))]
378-
lastColBuilder.WriteByte(currChar)
469+
if prevChar == nil {
470+
prevChar = &currChar
471+
}
472+
473+
if currChar != *prevChar {
474+
runBWTCompressionBuilder.WriteByte(*prevChar)
475+
runStartPositions = append(runStartPositions, i-charCount)
476+
addRunCumulativeCountEntry(runCumulativeCounts, *prevChar, charCount)
379477

478+
charCount = 0
479+
prevChar = &currChar
480+
}
481+
482+
charCount++
380483
suffixArray[i] = len(sequence) - len(prefixArray[i])
381484
}
485+
runBWTCompressionBuilder.WriteByte(*prevChar)
486+
runStartPositions = append(runStartPositions, len(prefixArray)-charCount)
487+
addRunCumulativeCountEntry(runCumulativeCounts, *prevChar, charCount)
488+
382489
fb := strings.Builder{}
383490
for i := 0; i < len(prefixArray); i++ {
384491
fb.WriteByte(prefixArray[i][0])
385492
}
386493

387-
wt, err := newWaveletTreeFromString(lastColBuilder.String())
494+
skipList := buildSkipList(prefixArray)
495+
496+
wt, err := newWaveletTreeFromString(runBWTCompressionBuilder.String())
388497
if err != nil {
389498
return BWT{}, err
390499
}
391-
392500
return BWT{
393-
firstColumnSkipList: buildSkipList(prefixArray),
394-
lastColumn: wt,
501+
firstColumnSkipList: skipList,
395502
suffixArray: suffixArray,
503+
runBWTCompression: wt,
504+
runStartPositions: runStartPositions,
505+
runCumulativeCounts: runCumulativeCounts,
396506
}, nil
397507
}
398508

509+
func addRunCumulativeCountEntry(rumCumulativeCounts map[string]runInfo, char byte, charCount int) {
510+
cumulativeCountsOfChar, ok := rumCumulativeCounts[string(char)]
511+
if ok {
512+
cumulativeCountsOfChar = append(cumulativeCountsOfChar, charCount+cumulativeCountsOfChar[len(cumulativeCountsOfChar)-1])
513+
} else {
514+
cumulativeCountsOfChar = runInfo{0, charCount}
515+
}
516+
rumCumulativeCounts[string(char)] = cumulativeCountsOfChar
517+
}
518+
399519
// buildSkipList compressed the First Column of the BWT into a skip list
400520
func buildSkipList(prefixArray []string) []skipEntry {
401521
prevChar := prefixArray[0][0]
@@ -457,6 +577,38 @@ func bwtRecovery(operation string, err *error) {
457577
}
458578
}
459579

580+
// runInfo each element of runInfo should represent an offset i where i
581+
// corresponds to the start of a run in a given sequence. For example,
582+
// aaaabbccc would have the run info [0, 4, 6]
583+
type runInfo []int
584+
585+
// FindNearestRunStartPosition given some offset, find the nearest starting position for the.
586+
// beginning of a run. Another way of saying this is give me the max i where runStartPositions[i] <= offset.
587+
// This is needed so we can understand which run an offset is a part of.
588+
func (r runInfo) FindNearestRunStartPosition(offset int) int {
589+
start := 0
590+
end := len(r) - 1
591+
for start < end {
592+
mid := start + (end-start)/2
593+
if r[mid] < offset {
594+
start = mid + 1
595+
continue
596+
}
597+
if r[mid] > offset {
598+
end = mid - 1
599+
continue
600+
}
601+
602+
return mid
603+
}
604+
605+
if r[start] > offset {
606+
return start - 1
607+
}
608+
609+
return start
610+
}
611+
460612
func isValidPattern(s string) (err error) {
461613
if len(s) == 0 {
462614
return errors.New("Pattern can not be empty")
@@ -480,3 +632,58 @@ func validateSequenceBeforeTransforming(sequence *string) (err error) {
480632
}
481633
return nil
482634
}
635+
636+
// printLFDebug this will print the first column and last column of the BWT along with some ascii visualizations.
637+
// This is very helpful for debugging the LF mapping. For example, lets say you're in the middle of making some changes to the LF
638+
// mapping and the test for counting starts to fail. To understand where the LF search is going wrong, you
639+
// can do something like the below to outline which parts of the BWT are being searched some given iteration.
640+
//
641+
// For Example, if you had the BWT of:
642+
// "rowrowrowyourboat"
643+
// and wanted to Count the number of occurrences of "row"
644+
// Then the iterations of the LF search would look something like:
645+
//
646+
// BWT Debug Begin Iteration: 0
647+
// torbyrrru$wwaoooow
648+
// $abooooorrrrtuwwwy
649+
// ^^^^^^^^^^^^^^^^^^X
650+
//
651+
// BWT Debug Begin Iteration: 1
652+
// torbyrrru$wwaoooow
653+
// $abooooorrrrtuwwwy
654+
// ______________^^^X
655+
//
656+
// BWT Debug Begin Iteration: 2
657+
// torbyrrru$wwaoooow
658+
// $abooooorrrrtuwwwy
659+
// _____^^^X
660+
//
661+
// Where:
662+
// * '^' denotes the active search range
663+
// * 'X' denotes one character after the end of the active search searchRange
664+
// * '_' is visual padding to help align the active search range
665+
//
666+
// NOTE: It can also be helpful to include the other auxiliary data structures. For example, it can be very helpful to include
667+
// a similar visualization for the run length compression to help debug and understand which run were used to compute the active
668+
// search window during each iteration.
669+
func printLFDebug(bwt BWT, searchRange interval, iteration int) {
670+
first := bwt.getFirstColumnStr()
671+
last := bwt.GetTransform()
672+
lastRunCompression := bwt.runBWTCompression.reconstruct()
673+
674+
fullASCIIRange := strings.Builder{}
675+
fullASCIIRange.Grow(searchRange.end + 1)
676+
for i := 0; i < searchRange.start; i++ {
677+
fullASCIIRange.WriteRune('_')
678+
}
679+
for i := searchRange.start; i < searchRange.end; i++ {
680+
fullASCIIRange.WriteRune('^')
681+
}
682+
fullASCIIRange.WriteRune('X')
683+
684+
fmt.Println("BWT Debug Begin Iteration:", iteration)
685+
fmt.Println(last)
686+
fmt.Println(first)
687+
fmt.Println(fullASCIIRange.String())
688+
fmt.Println(lastRunCompression)
689+
}

0 commit comments

Comments
 (0)