diff --git a/CHANGELOG.md b/CHANGELOG.md index f5dd1b4..0b355fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # CHANGELOG +## 1.2.0 + +- Fix: Handle BAM records with empty SEQ fields (`*` in SAM format) in `base-depth`. Previously this would cause an out-of-bounds error. Now these reads still count toward depth, and their bases are counted as `N`. ([#92](https://github.com/sstadick/perbase/pull/92) by @ghuls) + ## 1.1.0 - Fix/Feat: For both MapQualBaseQualN and BaseQualMapQualN only resolve to N when the bases are ambiguous, otherwise return the consensus base. diff --git a/Cargo.lock b/Cargo.lock index a1e1fc3..9f1f1a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1027,7 +1027,7 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "perbase" -version = "1.1.0" +version = "1.2.0" dependencies = [ "anyhow", "bio", diff --git a/Cargo.toml b/Cargo.toml index e1a0c95..bc8393b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perbase" -version = "1.1.0" +version = "1.2.0" authors = ["Seth Stadick "] edition = "2024" license = "MIT" diff --git a/README.md b/README.md index 57b8c7f..c74d762 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,8 @@ You can also download a binary from the [releases](https://github.com/sstadick/p The `base-depth` tool walks over every position in the BAM/CRAM file and calculates the depth, as well as the number of each nucleotide at the given position. Additionally, it counts the numbers of Ins/Dels at each position. +**Note on empty SEQ records:** BAM records with empty SEQ fields (represented as `*` in SAM format) are handled gracefully. These reads still count toward depth, but since the actual nucleotide cannot be determined, they are counted as `N`. + The output columns are as follows: | Column | Description | diff --git a/src/lib/position/pileup_position.rs b/src/lib/position/pileup_position.rs index d2613b1..742a7cb 100644 --- a/src/lib/position/pileup_position.rs +++ b/src/lib/position/pileup_position.rs @@ -107,7 +107,7 @@ impl PileupPosition { // Check if we are checking the base quality score // && Check if the base quality score is greater or equal to than the cutoff if let Some(base_qual_filter) = base_filter - && record.qual()[alignment.qpos().unwrap()] < base_qual_filter + && (record.seq().is_empty() || record.qual()[alignment.qpos().unwrap()] < base_qual_filter) { self.n += 1 } else if let Some(b) = recommended_base { @@ -124,6 +124,8 @@ impl PileupPosition { Base::M => self.m += 1, _ => self.n += 1, } + } else if record.seq().is_empty() { + self.n += 1 } else { match (record.seq()[alignment.qpos().unwrap()] as char).to_ascii_uppercase() { 'A' => self.a += 1, @@ -517,6 +519,235 @@ mod tests { } } + /// Test that reads with empty SEQ (represented as `*` in SAM format) are handled correctly + /// in the pileup code path used by base-depth. Empty SEQ reads should still count toward + /// depth, but their bases should be counted as N. This tests the fix for issue #91. + #[test] + fn test_empty_seq_pileup() { + use rust_htslib::bam::{IndexedReader, Writer, index}; + use tempfile::tempdir; + + let tempdir = tempdir().unwrap(); + let bam_path = tempdir.path().join("empty_seq.bam"); + + // Create header + let mut header = bam::header::Header::new(); + let mut chr1 = bam::header::HeaderRecord::new(b"SQ"); + chr1.push_tag(b"SN", &"chr1".to_owned()); + chr1.push_tag(b"LN", &"100".to_owned()); + header.push_record(&chr1); + let view = bam::HeaderView::from_header(&header); + + // Create records including one with empty SEQ + // Normal read with 'A' bases at positions 1-25 + let normal_record = Record::from_sam( + &view, + b"NORMAL\t0\tchr1\t1\t40\t25M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAA\t#########################", + ) + .unwrap(); + + // Read with empty SEQ at positions 10-35 - should count toward depth and N count + let empty_seq_record = Record::from_sam( + &view, + b"EMPTY_SEQ\t0\tchr1\t10\t40\t25M\t*\t0\t0\t*\t*", + ) + .unwrap(); + + // Another normal read with 'G' bases at positions 15-40 + let normal_record2 = Record::from_sam( + &view, + b"NORMAL2\t0\tchr1\t15\t40\t25M\t*\t0\t0\tGGGGGGGGGGGGGGGGGGGGGGGGG\t#########################", + ) + .unwrap(); + + // Write BAM file + let mut writer = Writer::from_path(&bam_path, &header, bam::Format::Bam).unwrap(); + writer.write(&normal_record).unwrap(); + writer.write(&empty_seq_record).unwrap(); + writer.write(&normal_record2).unwrap(); + drop(writer); + + // Index the BAM file + index::build(&bam_path, None, index::Type::Bai, 1).unwrap(); + + // Read back and create pileup + let mut reader = IndexedReader::from_path(&bam_path).unwrap(); + let header_view = reader.header().clone(); + + let read_filter = DefaultReadFilter::new(0, 0, 0); + + // Test at position 14 (0-based) = position 15 (1-based) + // At this position: NORMAL (A), EMPTY_SEQ (*), NORMAL2 (G) = depth 3 + reader.fetch(("chr1", 14, 15)).unwrap(); + let pileup_iter = reader.pileup(); + + for pileup_result in pileup_iter { + let pileup = pileup_result.unwrap(); + if pileup.pos() == 14 { + let position = PileupPosition::from_pileup(pileup, &header_view, &read_filter, None); + + // Depth should include all 3 reads + assert_eq!(position.depth, 3, "Depth should be 3 (NORMAL + EMPTY_SEQ + NORMAL2)"); + + // 'A' count from NORMAL read + assert_eq!(position.a, 1, "Should have 1 A from NORMAL read"); + + // 'G' count from NORMAL2 read + assert_eq!(position.g, 1, "Should have 1 G from NORMAL2 read"); + + // 'N' count from EMPTY_SEQ read (empty seq should be counted as N) + assert_eq!(position.n, 1, "Empty SEQ read should be counted as N"); + + break; + } + } + } + + /// Test that reads with empty SEQ work correctly with base quality filtering. + /// When a base quality filter is set and SEQ is empty, the read should be counted as N. + #[test] + fn test_empty_seq_with_base_qual_filter() { + use rust_htslib::bam::{IndexedReader, Writer, index}; + use tempfile::tempdir; + + let tempdir = tempdir().unwrap(); + let bam_path = tempdir.path().join("empty_seq_bq.bam"); + + // Create header + let mut header = bam::header::Header::new(); + let mut chr1 = bam::header::HeaderRecord::new(b"SQ"); + chr1.push_tag(b"SN", &"chr1".to_owned()); + chr1.push_tag(b"LN", &"100".to_owned()); + header.push_record(&chr1); + let view = bam::HeaderView::from_header(&header); + + // Normal read with high quality + let normal_record = Record::from_sam( + &view, + b"NORMAL\t0\tchr1\t1\t40\t25M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAA\tIIIIIIIIIIIIIIIIIIIIIIIII", + ) + .unwrap(); + + // Read with empty SEQ + let empty_seq_record = Record::from_sam( + &view, + b"EMPTY_SEQ\t0\tchr1\t1\t40\t25M\t*\t0\t0\t*\t*", + ) + .unwrap(); + + // Write BAM file + let mut writer = Writer::from_path(&bam_path, &header, bam::Format::Bam).unwrap(); + writer.write(&normal_record).unwrap(); + writer.write(&empty_seq_record).unwrap(); + drop(writer); + + // Index the BAM file + index::build(&bam_path, None, index::Type::Bai, 1).unwrap(); + + // Read back and create pileup with base quality filter + let mut reader = IndexedReader::from_path(&bam_path).unwrap(); + let header_view = reader.header().clone(); + + let read_filter = DefaultReadFilter::new(0, 0, 0); + + reader.fetch(("chr1", 0, 1)).unwrap(); + let pileup_iter = reader.pileup(); + + for pileup_result in pileup_iter { + let pileup = pileup_result.unwrap(); + if pileup.pos() == 0 { + // With base quality filter of 20 + let position = + PileupPosition::from_pileup(pileup, &header_view, &read_filter, Some(20)); + + // Depth should include both reads + assert_eq!(position.depth, 2, "Depth should be 2"); + + // 'A' from NORMAL (quality 'I' = 40, passes filter) + assert_eq!(position.a, 1, "Should have 1 A from high-quality NORMAL read"); + + // Empty SEQ read should be counted as N (regardless of base qual filter) + assert_eq!(position.n, 1, "Empty SEQ read should be counted as N"); + + break; + } + } + } + + /// Test that reads with empty SEQ work correctly in mate-aware pileup mode. + #[test] + fn test_empty_seq_mate_aware() { + use rust_htslib::bam::{IndexedReader, Writer, index}; + use tempfile::tempdir; + + let tempdir = tempdir().unwrap(); + let bam_path = tempdir.path().join("empty_seq_mate.bam"); + + // Create header + let mut header = bam::header::Header::new(); + let mut chr1 = bam::header::HeaderRecord::new(b"SQ"); + chr1.push_tag(b"SN", &"chr1".to_owned()); + chr1.push_tag(b"LN", &"100".to_owned()); + header.push_record(&chr1); + let view = bam::HeaderView::from_header(&header); + + // Normal read + let normal_record = Record::from_sam( + &view, + b"NORMAL\t0\tchr1\t1\t40\t25M\t*\t0\t0\tAAAAAAAAAAAAAAAAAAAAAAAAA\t#########################", + ) + .unwrap(); + + // Read with empty SEQ + let empty_seq_record = Record::from_sam( + &view, + b"EMPTY_SEQ\t0\tchr1\t1\t40\t25M\t*\t0\t0\t*\t*", + ) + .unwrap(); + + // Write BAM file + let mut writer = Writer::from_path(&bam_path, &header, bam::Format::Bam).unwrap(); + writer.write(&normal_record).unwrap(); + writer.write(&empty_seq_record).unwrap(); + drop(writer); + + // Index the BAM file + index::build(&bam_path, None, index::Type::Bai, 1).unwrap(); + + // Read back and create pileup in mate-aware mode + let mut reader = IndexedReader::from_path(&bam_path).unwrap(); + let header_view = reader.header().clone(); + + let read_filter = DefaultReadFilter::new(0, 0, 0); + + reader.fetch(("chr1", 0, 1)).unwrap(); + let pileup_iter = reader.pileup(); + + for pileup_result in pileup_iter { + let pileup = pileup_result.unwrap(); + if pileup.pos() == 0 { + let position = PileupPosition::from_pileup_mate_aware( + pileup, + &header_view, + &read_filter, + None, + MateResolutionStrategy::Original, + ); + + // Depth should include both reads + assert_eq!(position.depth, 2, "Depth should be 2"); + + // 'A' from NORMAL read + assert_eq!(position.a, 1, "Should have 1 A from NORMAL read"); + + // Empty SEQ read should be counted as N + assert_eq!(position.n, 1, "Empty SEQ read should be counted as N"); + + break; + } + } + } + /// Test N strategy #[test] fn test_n_strategy() {