Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# CHANGELOG

## 1.2.0

- Fix: Handle BAM records with empty SEQ fields (`*` in SAM format) in `base-depth`. Previously this would cause an out-of-bounds error. Now these reads still count toward depth, and their bases are counted as `N`. ([#92](https://github.com/sstadick/perbase/pull/92) by @ghuls)

## 1.1.0

- Fix/Feat: For both MapQualBaseQualN and BaseQualMapQualN only resolve to N when the bases are ambiguous, otherwise return the consensus base.
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "perbase"
version = "1.1.0"
version = "1.2.0"
authors = ["Seth Stadick <sstadick@gmail.com>"]
edition = "2024"
license = "MIT"
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ You can also download a binary from the [releases](https://github.com/sstadick/p

The `base-depth` tool walks over every position in the BAM/CRAM file and calculates the depth, as well as the number of each nucleotide at the given position. Additionally, it counts the numbers of Ins/Dels at each position.

**Note on empty SEQ records:** BAM records with empty SEQ fields (represented as `*` in SAM format) are handled gracefully. These reads still count toward depth, but since the actual nucleotide cannot be determined, they are counted as `N`.

The output columns are as follows:

| Column | Description |
Expand Down
233 changes: 232 additions & 1 deletion src/lib/position/pileup_position.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ impl PileupPosition {
// Check if we are checking the base quality score
// && Check if the base quality score is greater or equal to than the cutoff
if let Some(base_qual_filter) = base_filter
&& record.qual()[alignment.qpos().unwrap()] < base_qual_filter
&& (record.seq().is_empty() || record.qual()[alignment.qpos().unwrap()] < base_qual_filter)
{
self.n += 1
} else if let Some(b) = recommended_base {
Expand All @@ -124,6 +124,8 @@ impl PileupPosition {
Base::M => self.m += 1,
_ => self.n += 1,
}
} else if record.seq().is_empty() {
self.n += 1
} else {
match (record.seq()[alignment.qpos().unwrap()] as char).to_ascii_uppercase() {
'A' => self.a += 1,
Expand Down Expand Up @@ -517,6 +519,235 @@ mod tests {
}
}

/// Test that reads with empty SEQ (represented as `*` in SAM format) are handled correctly
/// in the pileup code path used by base-depth. Empty SEQ reads should still count toward
/// depth, but their bases should be counted as N. This tests the fix for issue #91.
#[test]
fn test_empty_seq_pileup() {
use rust_htslib::bam::{IndexedReader, Writer, index};
use tempfile::tempdir;

let tempdir = tempdir().unwrap();
let bam_path = tempdir.path().join("empty_seq.bam");

// Create header
let mut header = bam::header::Header::new();
let mut chr1 = bam::header::HeaderRecord::new(b"SQ");
chr1.push_tag(b"SN", &"chr1".to_owned());
chr1.push_tag(b"LN", &"100".to_owned());
header.push_record(&chr1);
let view = bam::HeaderView::from_header(&header);

// Create records including one with empty SEQ
// Normal read with 'A' bases at positions 1-25
let normal_record = Record::from_sam(
&view,
b"NORMAL\t0\tchr1\t1\t40\t25M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAA\t#########################",
)
.unwrap();

// Read with empty SEQ at positions 10-35 - should count toward depth and N count
let empty_seq_record = Record::from_sam(
&view,
b"EMPTY_SEQ\t0\tchr1\t10\t40\t25M\t*\t0\t0\t*\t*",
)
.unwrap();

// Another normal read with 'G' bases at positions 15-40
let normal_record2 = Record::from_sam(
&view,
b"NORMAL2\t0\tchr1\t15\t40\t25M\t*\t0\t0\tGGGGGGGGGGGGGGGGGGGGGGGGG\t#########################",
)
.unwrap();

// Write BAM file
let mut writer = Writer::from_path(&bam_path, &header, bam::Format::Bam).unwrap();
writer.write(&normal_record).unwrap();
writer.write(&empty_seq_record).unwrap();
writer.write(&normal_record2).unwrap();
drop(writer);

// Index the BAM file
index::build(&bam_path, None, index::Type::Bai, 1).unwrap();

// Read back and create pileup
let mut reader = IndexedReader::from_path(&bam_path).unwrap();
let header_view = reader.header().clone();

let read_filter = DefaultReadFilter::new(0, 0, 0);

// Test at position 14 (0-based) = position 15 (1-based)
// At this position: NORMAL (A), EMPTY_SEQ (*), NORMAL2 (G) = depth 3
reader.fetch(("chr1", 14, 15)).unwrap();
let pileup_iter = reader.pileup();

for pileup_result in pileup_iter {
let pileup = pileup_result.unwrap();
if pileup.pos() == 14 {
let position = PileupPosition::from_pileup(pileup, &header_view, &read_filter, None);

// Depth should include all 3 reads
assert_eq!(position.depth, 3, "Depth should be 3 (NORMAL + EMPTY_SEQ + NORMAL2)");

// 'A' count from NORMAL read
assert_eq!(position.a, 1, "Should have 1 A from NORMAL read");

// 'G' count from NORMAL2 read
assert_eq!(position.g, 1, "Should have 1 G from NORMAL2 read");

// 'N' count from EMPTY_SEQ read (empty seq should be counted as N)
assert_eq!(position.n, 1, "Empty SEQ read should be counted as N");

break;
}
}
}

/// Test that reads with empty SEQ work correctly with base quality filtering.
/// When a base quality filter is set and SEQ is empty, the read should be counted as N.
#[test]
fn test_empty_seq_with_base_qual_filter() {
use rust_htslib::bam::{IndexedReader, Writer, index};
use tempfile::tempdir;

let tempdir = tempdir().unwrap();
let bam_path = tempdir.path().join("empty_seq_bq.bam");

// Create header
let mut header = bam::header::Header::new();
let mut chr1 = bam::header::HeaderRecord::new(b"SQ");
chr1.push_tag(b"SN", &"chr1".to_owned());
chr1.push_tag(b"LN", &"100".to_owned());
header.push_record(&chr1);
let view = bam::HeaderView::from_header(&header);

// Normal read with high quality
let normal_record = Record::from_sam(
&view,
b"NORMAL\t0\tchr1\t1\t40\t25M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAA\tIIIIIIIIIIIIIIIIIIIIIIIII",
)
.unwrap();

// Read with empty SEQ
let empty_seq_record = Record::from_sam(
&view,
b"EMPTY_SEQ\t0\tchr1\t1\t40\t25M\t*\t0\t0\t*\t*",
)
.unwrap();

// Write BAM file
let mut writer = Writer::from_path(&bam_path, &header, bam::Format::Bam).unwrap();
writer.write(&normal_record).unwrap();
writer.write(&empty_seq_record).unwrap();
drop(writer);

// Index the BAM file
index::build(&bam_path, None, index::Type::Bai, 1).unwrap();

// Read back and create pileup with base quality filter
let mut reader = IndexedReader::from_path(&bam_path).unwrap();
let header_view = reader.header().clone();

let read_filter = DefaultReadFilter::new(0, 0, 0);

reader.fetch(("chr1", 0, 1)).unwrap();
let pileup_iter = reader.pileup();

for pileup_result in pileup_iter {
let pileup = pileup_result.unwrap();
if pileup.pos() == 0 {
// With base quality filter of 20
let position =
PileupPosition::from_pileup(pileup, &header_view, &read_filter, Some(20));

// Depth should include both reads
assert_eq!(position.depth, 2, "Depth should be 2");

// 'A' from NORMAL (quality 'I' = 40, passes filter)
assert_eq!(position.a, 1, "Should have 1 A from high-quality NORMAL read");

// Empty SEQ read should be counted as N (regardless of base qual filter)
assert_eq!(position.n, 1, "Empty SEQ read should be counted as N");

break;
}
}
}

/// Test that reads with empty SEQ work correctly in mate-aware pileup mode.
#[test]
fn test_empty_seq_mate_aware() {
use rust_htslib::bam::{IndexedReader, Writer, index};
use tempfile::tempdir;

let tempdir = tempdir().unwrap();
let bam_path = tempdir.path().join("empty_seq_mate.bam");

// Create header
let mut header = bam::header::Header::new();
let mut chr1 = bam::header::HeaderRecord::new(b"SQ");
chr1.push_tag(b"SN", &"chr1".to_owned());
chr1.push_tag(b"LN", &"100".to_owned());
header.push_record(&chr1);
let view = bam::HeaderView::from_header(&header);

// Normal read
let normal_record = Record::from_sam(
&view,
b"NORMAL\t0\tchr1\t1\t40\t25M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAA\t#########################",
)
.unwrap();

// Read with empty SEQ
let empty_seq_record = Record::from_sam(
&view,
b"EMPTY_SEQ\t0\tchr1\t1\t40\t25M\t*\t0\t0\t*\t*",
)
.unwrap();

// Write BAM file
let mut writer = Writer::from_path(&bam_path, &header, bam::Format::Bam).unwrap();
writer.write(&normal_record).unwrap();
writer.write(&empty_seq_record).unwrap();
drop(writer);

// Index the BAM file
index::build(&bam_path, None, index::Type::Bai, 1).unwrap();

// Read back and create pileup in mate-aware mode
let mut reader = IndexedReader::from_path(&bam_path).unwrap();
let header_view = reader.header().clone();

let read_filter = DefaultReadFilter::new(0, 0, 0);

reader.fetch(("chr1", 0, 1)).unwrap();
let pileup_iter = reader.pileup();

for pileup_result in pileup_iter {
let pileup = pileup_result.unwrap();
if pileup.pos() == 0 {
let position = PileupPosition::from_pileup_mate_aware(
pileup,
&header_view,
&read_filter,
None,
MateResolutionStrategy::Original,
);

// Depth should include both reads
assert_eq!(position.depth, 2, "Depth should be 2");

// 'A' from NORMAL read
assert_eq!(position.a, 1, "Should have 1 A from NORMAL read");

// Empty SEQ read should be counted as N
assert_eq!(position.n, 1, "Empty SEQ read should be counted as N");

break;
}
}
}

/// Test N strategy
#[test]
fn test_n_strategy() {
Expand Down