diff --git a/crates/seqair/proptest-regressions/bam/cigar.txt b/crates/seqair/proptest-regressions/bam/cigar.txt new file mode 100644 index 00000000..4525848a --- /dev/null +++ b/crates/seqair/proptest-regressions/bam/cigar.txt @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc ce29820e7f85cf8d61f65ce2bb072a88ce00f7fbb17e0b7e3d3c94a1fdcc7076 # shrinks to ops = [CigarOp { op: Match, len: 1 }] diff --git a/crates/seqair/src/bam.rs b/crates/seqair/src/bam.rs index c304a055..2c475bf6 100644 --- a/crates/seqair/src/bam.rs +++ b/crates/seqair/src/bam.rs @@ -80,12 +80,16 @@ pub use header::{BamHeader, BamHeaderError, ContigInfo}; pub use index::{AlignmentIndex, BaiError, BamIndex}; pub use nm_md::NmMdError; pub use owned_record::{OwnedBamRecord, OwnedRecordError}; -pub use pileup::{AlignmentView, PileupColumn, PileupEngine, PileupGuard, PileupOp, RefSeq}; +pub use pileup::{ + AlignmentView, PileupAlignment, PileupColumn, PileupEngine, PileupGuard, PileupOp, RefSeq, +}; pub use reader::{BamError, BamShared, IndexedBamReader}; // `record` is intentionally not re-exported as a type. The production decode // path is `RecordStore::push_raw`; the `record` module exposes only shared // decode primitives (`parse_header`, `compute_end_pos_from_raw`, `DecodeError`). -pub use record_store::RecordStore; +pub use record_store::{ + CustomizeRecordStore, FilterRawFields, RecordStore, RejectUnmapped, Sequence, +}; pub use seqair_types::bam_flags as flags; pub use seqair_types::{Offset, One, Pos, Pos0, Pos1, Zero}; pub use writer::{BamWriteError, BamWriter, BamWriterBuilder, ToPath, ToWriter}; diff --git a/crates/seqair/src/bam/cigar.rs b/crates/seqair/src/bam/cigar.rs index 4feb5362..6fb11aee 100644 --- a/crates/seqair/src/bam/cigar.rs +++ b/crates/seqair/src/bam/cigar.rs @@ -195,6 +195,57 @@ impl CigarOp { } } } + + /// View raw BAM CIGAR bytes as `&[CigarOp]` without copying. + /// + /// Returns `None` if the length is not a multiple of 4, the pointer is + /// not 4-byte aligned, or the target is big-endian. On aligned LE input + /// this is a zero-cost transmute. BAM CIGAR data is **not** guaranteed + /// aligned — the read name length determines the start offset, and + /// samtools produces unaligned CIGAR data when `l_read_name` is not a + /// multiple of 4. Use [`extend_from_bam_bytes`] if alignment cannot be + /// guaranteed (it's always correct, just requires a copy). + #[inline] + pub fn slice_from_bam_bytes(bytes: &[u8]) -> Option<&[Self]> { + if bytes.is_empty() { + return Some(&[]); + } + if !bytes.len().is_multiple_of(4) { + return None; + } + #[cfg(target_endian = "little")] + { + // BAM data is packed sequentially: 32-byte fixed header, then + // qname (l_read_name bytes, including NUL), then CIGAR. If + // l_read_name is not a multiple of 4, the CIGAR pointer is + // unaligned. from_raw_parts requires CigarOp alignment (4). + #[allow( + clippy::arithmetic_side_effects, + clippy::manual_is_multiple_of, + reason = "pointer-to-usize cast is defined on all supported targets; align_of is non-zero so % is safe" + )] + if bytes.as_ptr() as usize % align_of::() != 0 { + return None; + } + let n = bytes.len() / 4; + // SAFETY: + // - Length: `n` is `bytes.len() / 4`, checked to be a multiple of + // 4 above. The pointer advance of `n * size_of::()` + // (== n * 4) stays within the allocation. + // - Alignment: checked via `is_aligned()` above. + // - Validity: every `u32` bit pattern is a valid `CigarOp` — the + // op code field is 4 bits (0..=15, all decodable) and the length + // field is an unconstrained 28-bit integer. + // - Endianness: guarded by `#[cfg(target_endian = "little")]`; + // on LE hosts BAM's on-disk LE `u32` byte order matches native + // in-memory layout. + Some(unsafe { std::slice::from_raw_parts(bytes.as_ptr() as *const CigarOp, n) }) + } + #[cfg(not(target_endian = "little"))] + { + None + } + } } impl std::fmt::Debug for CigarOp { @@ -316,13 +367,28 @@ pub enum CigarPosInfo { /// Compact CIGAR mapping for fast qpos lookup in the pileup engine. /// -/// For the ~90% of reads with simple CIGARs (clips + one match block), -/// `Linear` avoids any allocation and computes qpos with a single subtraction. -/// Complex CIGARs use a `SmallVec` of compact ops that stays inline for ≤6 ops. +/// Two variants: +/// +/// **`Linear`**: For the ~96% of reads whose CIGAR is a single contiguous +/// match block with optional leading/trailing clips (`[HS]* [M=X]+ [HS]*`). +/// Computes `qpos` with a single subtraction. 16 bytes. +/// +/// **`Complex`**: For reads with insertions, deletions, reference skips, +/// or multiple disjoint match blocks (~4% of reads). Stores a pre-computed +/// index of [`CompactOp`]s — 12 bytes each, 6 inline (72-byte buffer, no +/// heap allocation for 98.1% of complex CIGARs). +/// +/// Real-world data from NA12878 chr12 Illumina WGS (100k reads): +/// 96.1% Linear, 3.9% Complex. Of complex reads: 79% have 3 ops (e.g. `M I M`), +/// 98.1% fit in ≤6 ops (inline). pub enum CigarMapping { - /// `qpos = (pos - rec_pos) as usize + query_offset as usize` + /// `qpos = (pos - rec_pos) + query_offset`. No insertion/deletion support — + /// `try_linear` rejects CIGARs containing I, D, or N ops. Linear { rec_pos: Pos0, query_offset: u32, match_len: u32 }, - /// Pre-computed compact ops for linear/binary search. + /// Pre-computed compact ops for position lookup via linear scan (≤4 ops) + /// or binary search (>4 ops). The mapping is immutable — `pos_info_at` + /// is a pure function of `pos`, allowing the active set to be reordered + /// without invalidating state. Complex(SmallVec), } @@ -340,13 +406,46 @@ impl std::fmt::Debug for CigarMapping { } } -/// 16-byte CIGAR op for the compact index. +/// Compact CIGAR operation for the position-lookup index used by the pileup engine. +/// +/// 12 bytes (down from 16). Each op records the absolute reference start, absolute +/// query start, operation length, and CIGAR op code. The length and op code are +/// packed into a single `u32` (`len << 4 | op_code`), matching BAM's on-disk +/// packing — extraction is a single shift or mask. +/// +/// Stored inline in [`CigarMapping::Complex`] via `SmallVec` +/// (72-byte inline buffer, no heap allocation for 98.1% of complex CIGARs). #[derive(Clone, Copy)] pub struct CompactOp { - ref_start: i32, + /// 0-based reference position where this operation begins. + /// Range: `0..=i32::MAX` (BAM's position limit). + ref_start: u32, + /// 0-based query position where this operation begins. Equal to the + /// cumulative leading clip length plus all preceding query-consuming + /// op lengths. query_start: u32, - len: u32, - op_type: u8, + /// Packed: `len << 4 | op_code`. Same layout as BAM's on-disk CIGAR + /// encoding. Upper 28 bits = operation length, lower 4 bits = CIGAR + /// op code (0=M, 1=I, 2=D, 3=N, 4=S, 5=H, 6=P, 7==, 8=X). + len_op: u32, +} + +impl CompactOp { + /// CIGAR operation length (upper 28 bits of the BAM-packed `u32`). + #[inline] + #[allow( + clippy::len_without_is_empty, + reason = "CIGAR ops don't have a meaningful 'empty' state" + )] + pub fn len(self) -> u32 { + self.len_op >> 4 + } + /// BAM CIGAR op code (lower 4 bits). See module-level constants + /// `CIGAR_M` through `CIGAR_X`. + #[inline] + pub fn op_type(self) -> u8 { + (self.len_op & 0xF) as u8 + } } impl CigarMapping { @@ -431,8 +530,9 @@ fn build_compact_ops(rec_pos: Pos0, ops: &[CigarOp]) -> Option Option { let mut total = 0u32; let mut idx = op_idx.checked_add(1)?; while let Some(next) = ops.get(idx) { - if next.op_type == CIGAR_P { + if next.op_type() == CIGAR_P { idx = idx.checked_add(1)?; continue; } - if next.op_type != CIGAR_I { + if next.op_type() != CIGAR_I { break; } - total = total.saturating_add(next.len); + total = total.saturating_add(next.len()); idx = idx.checked_add(1)?; } if total > 0 { Some(total) } else { None } @@ -475,43 +575,40 @@ fn classify_op( ops: &[CompactOp], i: usize, op: &CompactOp, - pos32: i32, - ref_end: i32, + pos: u32, + ref_end: u32, ) -> Option { - if consumes_query(op.op_type) { + if consumes_query(op.op_type()) { debug_assert!( - matches!(op.op_type, CIGAR_M | CIGAR_EQ | CIGAR_X), + matches!(op.op_type(), CIGAR_M | CIGAR_EQ | CIGAR_X), "classify_op reached query-consuming branch with unexpected op type {}", - op.op_type + op.op_type() ); - let offset = pos32.wrapping_sub(op.ref_start) as u32; + let offset = pos.wrapping_sub(op.ref_start); let qpos = op.query_start.checked_add(offset).trace_err("qpos overflow")?; - if pos32 == ref_end.wrapping_sub(1) + if pos == ref_end.wrapping_sub(1) && let Some(insert_len) = next_insertion_len(ops, i) { return Some(CigarPosInfo::Insertion { qpos, insert_len }); } Some(CigarPosInfo::Match { qpos }) - } else if op.op_type == CIGAR_D { - // r[impl pileup_indel.op_enum] - // At the last position of a D op, check if a following I op exists (skipping P). - if pos32 == ref_end.wrapping_sub(1) + } else if op.op_type() == CIGAR_D { + if pos == ref_end.wrapping_sub(1) && let Some(insert_len) = next_insertion_len(ops, i) { return Some(CigarPosInfo::ComplexIndel { - del_len: op.len, + del_len: op.len(), insert_len, is_refskip: false, }); } - Some(CigarPosInfo::Deletion { del_len: op.len }) - } else if op.op_type == CIGAR_N { - // Same complex-indel check for N ops. - if pos32 == ref_end.wrapping_sub(1) + Some(CigarPosInfo::Deletion { del_len: op.len() }) + } else if op.op_type() == CIGAR_N { + if pos == ref_end.wrapping_sub(1) && let Some(insert_len) = next_insertion_len(ops, i) { return Some(CigarPosInfo::ComplexIndel { - del_len: op.len, + del_len: op.len(), insert_len, is_refskip: true, }); @@ -524,17 +621,16 @@ fn classify_op( #[inline] fn pos_info_linear(ops: &[CompactOp], pos: Pos0) -> Option { - let pos32 = pos.as_i32(); + let pos: u32 = *pos; for (i, op) in ops.iter().enumerate() { - if !consumes_ref(op.op_type) { + if !consumes_ref(op.op_type()) { continue; } - let ref_end = - op.ref_start.saturating_add(i32::try_from(op.len).trace_ok("op len exceeds i32")?); - if pos32 < op.ref_start || pos32 >= ref_end { + let ref_end = op.ref_start.saturating_add(op.len()); + if pos < op.ref_start || pos >= ref_end { continue; } - return classify_op(ops, i, op, pos32, ref_end); + return classify_op(ops, i, op, pos, ref_end); } None } @@ -542,20 +638,19 @@ fn pos_info_linear(ops: &[CompactOp], pos: Pos0) -> Option { // r[impl perf.cigar_binary_search] #[inline] fn pos_info_bsearch(ops: &[CompactOp], pos: Pos0) -> Option { - let pos32 = pos.as_i32(); - let idx = ops.partition_point(|op| op.ref_start <= pos32); + let pos: u32 = *pos; + let idx = ops.partition_point(|op| op.ref_start <= pos); if idx == 0 { return None; } for i in idx.saturating_sub(2)..ops.len().min(idx.saturating_add(1)) { let Some(op) = ops.get(i) else { continue }; - if !consumes_ref(op.op_type) { + if !consumes_ref(op.op_type()) { continue; } - let ref_end = - op.ref_start.saturating_add(i32::try_from(op.len).trace_ok("op len exceeds i32")?); - if pos32 >= op.ref_start && pos32 < ref_end { - return classify_op(ops, i, op, pos32, ref_end); + let ref_end = op.ref_start.saturating_add(op.len()); + if pos >= op.ref_start && pos < ref_end { + return classify_op(ops, i, op, pos, ref_end); } } None @@ -565,6 +660,7 @@ fn pos_info_bsearch(ops: &[CompactOp], pos: Pos0) -> Option { #[allow(clippy::arithmetic_side_effects, reason = "test arithmetic on known small values")] mod tests { use super::*; + use proptest::prelude::*; fn op(op_type: CigarOpType, len: u32) -> CigarOp { CigarOp::new(op_type, len) @@ -721,4 +817,68 @@ mod tests { "pos far before alignment must return None" ); } + + // r[verify cigar.slice_from_bam_bytes] + proptest::proptest! { + /// Round-trip: encode CigarOp → BAM LE bytes → slice_from_bam_bytes → same ops. + #[test] + fn roundtrip_slice_from_bam_bytes( + ops in proptest::collection::vec( + (0u32..=8u32, 1u32..1000u32), + 1..20, + ).prop_map(|pairs| { + pairs.into_iter() + .map(|(code, len)| CigarOp::from_bam_u32((len << 4) | code)) + .collect::>() + }) + ) { + let bytes: Vec = ops.iter() + .flat_map(|op| op.to_bam_u32().to_le_bytes()) + .collect(); + // 8 bytes padding ensures 8-byte alignment (≥ 4-byte for CigarOp) + let mut buf = vec![0u8; 8]; + buf.extend_from_slice(&bytes); + let aligned = &buf[8..]; + + let result = CigarOp::slice_from_bam_bytes(aligned); + prop_assert!(result.is_some(), "aligned buffer must succeed"); + prop_assert_eq!(result.unwrap(), ops.as_slice()); + } + } + + #[test] + fn slice_from_bam_bytes_rejects_odd_length() { + assert!(CigarOp::slice_from_bam_bytes(&[0u8; 7]).is_none()); + } + + #[test] + fn slice_from_bam_bytes_empty_ok() { + assert_eq!(CigarOp::slice_from_bam_bytes(&[]), Some(&[][..])); + } + + // r[verify cigar.slice_from_bam_bytes] + #[test] + fn slice_from_bam_bytes_rejects_unaligned_input() { + // BAM CIGAR data can be unaligned within the record: the read name + // sits between the 32-byte fixed header and the CIGAR data. If + // l_read_name is not a multiple of 4, the CIGAR bytes start at a + // non-4-byte-aligned offset. Samtools produces such records. + let cigar_op = CigarOp::new(CigarOpType::Match, 100); + let packed = cigar_op.to_bam_u32(); + let mut buf = vec![0u8]; + buf.extend_from_slice(&packed.to_le_bytes()); + buf.extend_from_slice(&packed.to_le_bytes()); + buf.extend_from_slice(&packed.to_le_bytes()); + + let mut any_unaligned = false; + for off in 0..=3 { + let sub = &buf[off..off + 4]; + let result = CigarOp::slice_from_bam_bytes(sub); + if sub.as_ptr() as usize % 4 != 0 { + assert!(result.is_none(), "unaligned slice at offset {off} must return None"); + any_unaligned = true; + } + } + assert!(any_unaligned, "at least one test offset must be unaligned"); + } } diff --git a/crates/seqair/src/bam/owned_record.rs b/crates/seqair/src/bam/owned_record.rs index b6a15254..6658aad9 100644 --- a/crates/seqair/src/bam/owned_record.rs +++ b/crates/seqair/src/bam/owned_record.rs @@ -325,8 +325,9 @@ impl OwnedBamRecord { pub fn to_bam_bytes(&self, buf: &mut Vec) -> Result<(), OwnedRecordError> { // Validate field limits. `pos`/`next_pos` are constructively bounded // by `Option` (Pos0 ≤ i32::MAX) so no overflow check is needed. - if self.qname.len() > 254 { - return Err(OwnedRecordError::QnameTooLong { len: self.qname.len() }); + let name_len = self.qname.len(); + if name_len > 254 { + return Err(OwnedRecordError::QnameTooLong { len: name_len }); } if self.cigar.len() > 65535 { return Err(OwnedRecordError::CigarCountOverflow { count: self.cigar.len() }); diff --git a/crates/seqair/src/bam/pileup.rs b/crates/seqair/src/bam/pileup.rs index 49e3748e..1c942f4b 100644 --- a/crates/seqair/src/bam/pileup.rs +++ b/crates/seqair/src/bam/pileup.rs @@ -118,6 +118,8 @@ impl RefSeq { // r[impl pileup.extras.generic_param] pub struct PileupEngine { store: RecordStore, + /// Alignment buffer reused across columns — cleared each call. + buf: Vec, current_pos: Pos0, region_end: Pos0, next_entry: usize, @@ -169,17 +171,22 @@ fn base_qual_at( // r[impl pileup.column_contents] // r[impl pileup.htslib_compat] // r[impl pileup.lending_iterator] -/// A single pileup column, borrowing the record store for the duration of its use. +/// A single pileup column, borrowing the record store and alignment +/// buffer for the duration of its use. /// -/// Returned by [`PileupEngine::pileups`]. Valid until the next call to `pileups` -/// on the same engine. Access per-record extras or slab data (qname, aux) via -/// [`alignments`](Self::alignments), which yields [`AlignmentView`] wrappers, or -/// via [`store`](Self::store) directly. -pub struct PileupColumn<'store, U = ()> { +/// Returned by [`PileupEngine::pileups`]. Valid until the next call to +/// `pileups` on the same engine. Both the per-alignment data and the +/// record store borrow from the engine, so the column cannot outlive +/// the `pileups` call that produced it. +/// +/// Access per-record extras or slab data (qname, aux, seq, qualities) via +/// [`alignments`](Self::alignments), which yields [`AlignmentView`] +/// wrappers, or via [`store`](Self::store) directly. +pub struct PileupColumn<'eng, U = ()> { pos: Pos0, reference_base: Base, - alignments: Vec, - store: &'store RecordStore, + alignments: &'eng [PileupAlignment], + store: &'eng RecordStore, } impl std::fmt::Debug for PileupColumn<'_, U> { @@ -192,7 +199,7 @@ impl std::fmt::Debug for PileupColumn<'_, U> { } } -impl<'store, U> PileupColumn<'store, U> { +impl<'eng, U> PileupColumn<'eng, U> { #[must_use] pub fn pos(&self) -> Pos0 { self.pos @@ -204,17 +211,19 @@ impl<'store, U> PileupColumn<'store, U> { self.alignments.len() } - /// Iterate alignments with store access for per-record extras, qname, and aux. + /// Iterate alignments with store access for per-record extras, qname, aux, + /// seq, and qual. /// /// Each yielded [`AlignmentView`] derefs to [`PileupAlignment`] for the flat /// per-position fields, and exposes [`extra`](AlignmentView::extra), - /// [`qname`](AlignmentView::qname), and [`aux`](AlignmentView::aux) for slab data. - pub fn alignments(&self) -> impl Iterator> + '_ { + /// [`qname`](AlignmentView::qname), [`aux`](AlignmentView::aux), + /// [`seq`](AlignmentView::seq), and [`qualities`](AlignmentView::qualities) for slab data. + pub fn alignments(&self) -> impl Iterator> + '_ { let store = self.store; self.alignments.iter().map(move |aln| AlignmentView { aln, store }) } - /// Iterate the raw alignments without store access (equivalent to the old API). + /// Iterate the raw alignments without store access. pub fn raw_alignments(&self) -> impl Iterator + '_ { self.alignments.iter() } @@ -224,7 +233,7 @@ impl<'store, U> PileupColumn<'store, U> { } /// Borrow the record store for custom slab access (e.g., record fields by index). - pub fn store(&self) -> &'store RecordStore { + pub fn store(&self) -> &'eng RecordStore { self.store } @@ -272,6 +281,16 @@ impl<'a, 'store, U> AlignmentView<'a, 'store, U> { self.store.aux(self.aln.record_idx()) } + /// The read's full decoded sequence (all bases, not just the pileup position). + pub fn seq(&self) -> &'store [Base] { + self.store.seq(self.aln.record_idx()) + } + + /// The read's full per-base quality scores (all positions, not just the pileup position). + pub fn qualities(&self) -> &'store [BaseQuality] { + self.store.qual(self.aln.record_idx()) + } + /// The underlying [`PileupAlignment`] — use when you want to call /// [`Clone::clone`] or otherwise escape the deref coercion. pub fn alignment(&self) -> &'a PileupAlignment { @@ -436,6 +455,7 @@ impl PileupEngine { pub fn new(store: RecordStore, region_start: Pos0, region_end: Pos0) -> Self { PileupEngine { store, + buf: Vec::new(), current_pos: region_start, region_end, next_entry: 0, @@ -483,6 +503,11 @@ impl PileupEngine { /// /// Call this after iteration is complete. The returned store retains its /// allocated capacity but is cleared. + /// Take the `RecordStore` out for reuse. Returns `None` if already taken. + /// + /// Call this after iteration is complete when using `PileupEngine` directly + /// (without the [`PileupGuard`] wrapper). The guard recovers the store + /// automatically on drop — prefer that path. pub fn take_store(&mut self) -> Option> { if self.store.is_empty() && self.store.records_capacity() == 0 { return None; @@ -519,17 +544,17 @@ impl std::fmt::Debug for PileupEngine { /// retaining its allocated capacity for the next pileup. Callers no /// longer need an explicit `recover_store` step. /// -/// # Footgun: don't drain the engine via `Deref` +/// The escape hatch is [`Self::into_inner`] — it consumes the guard +/// without recovering the store, so the next +/// [`Readers::pileup`](crate::reader::Readers::pileup) allocates a fresh +/// store. Prefer letting the guard drop normally. /// -/// Because the guard `Deref`s to [`PileupEngine`], the engine's +/// ⚠️ Because the guard [`Deref`](std::ops::Deref)s to [`PileupEngine`], /// [`PileupEngine::take_store`] is reachable as `guard.take_store()`. -/// Calling it leaves the engine's store empty, and then the guard's -/// `Drop` finds nothing to recover, so the next call to -/// [`Readers::pileup`](crate::reader::Readers::pileup) allocates a fresh -/// ~39 MB store. **Do not call `take_store` on a guard.** If you need -/// the underlying engine without recovery, use [`Self::into_inner`] — -/// that path is documented to disable recovery and at least makes the -/// intent explicit. +/// Calling it leaves the engine's store empty, and the guard's `Drop` +/// finds nothing to recover — the next pileup allocates a fresh store. +/// Use [`into_inner`](Self::into_inner) if you genuinely need to skip +/// recovery. pub struct PileupGuard<'a, U = ()> { /// `Some` for the lifetime of the guard, then taken to `None` by /// [`Self::into_inner`] (so the `Drop` impl knows to skip recovery). @@ -598,9 +623,9 @@ impl Drop for PileupGuard<'_, U> { // Recover the store into the caller's slot. The engine is `None` // only after [`PileupGuard::into_inner`], in which case recovery // is intentionally skipped. `take_store` further returns `None` - // if the store was already drained through `Deref` (the - // documented footgun) or was empty with zero capacity from the - // start. In any of those cases, leave the slot untouched. + // if the store was already drained (edge case) or was empty with + // zero capacity from the start. In any of those cases, leave the + // slot untouched. if let Some(mut engine) = self.engine.take() && let Some(store) = engine.take_store() { @@ -611,19 +636,18 @@ impl Drop for PileupGuard<'_, U> { // r[impl pileup.position_iteration] impl PileupEngine { - /// Core iteration logic used by [`pileups`](Self::pileups). + /// Core iteration logic for [`pileups`](Self::pileups). /// - /// Returns the per-column data (pos, reference base, alignments) so the caller - /// can attach a store reference to build a [`PileupColumn`]. - fn advance(&mut self) -> Option<(Pos0, Base, Vec)> { + /// Clears `self.buf` and fills it with the alignments for the next + /// non-empty column. Returns `(pos, reference_base)` so the caller + /// can build a [`PileupColumn`] borrowing the buffer. + fn advance(&mut self) -> Option<(Pos0, Base)> { loop { if self.current_pos > self.region_end { return None; } let pos = self.current_pos; - // current_pos <= region_end (checked above), and region_end < u32::MAX (niche), - // so current_pos + 1 always fits. #[allow( clippy::unwrap_in_result, reason = "infallible: current_pos <= region_end < u32::MAX - 1" @@ -635,23 +659,30 @@ impl PileupEngine { .trace_err("BUG: current_pos + 1 overflowed despite being <= region_end")?; } - // Evict expired records. Iterate the compact end_pos vec (4-byte stride) - // and swap-remove from both vecs in lockstep. + // Evict expired records — stable retain, preserves insertion order. { - let mut i = 0; - while i < self.active_end_pos.len() { - debug_assert!(i < self.active.len()); + let mut write = 0; + let len = self.active_end_pos.len(); + for read in 0..len { + debug_assert!(read < self.active.len()); #[allow( clippy::indexing_slicing, - reason = "i < active_end_pos.len() == active.len()" + reason = "read < active_end_pos.len() == active.len()" )] - if self.active_end_pos[i] < pos { - self.active_end_pos.swap_remove(i); - self.active.swap_remove(i); - } else { - i = i.checked_add(1).trace_err("active set size exceeded usize::MAX")?; + if self.active_end_pos[read] >= pos { + if read != write { + self.active_end_pos[write] = self.active_end_pos[read]; + // swap(write, read) with write < read moves the survivor left + // without needing Clone on ActiveRecord. + self.active.swap(write, read); + } + write = write + .checked_add(1) + .trace_err("active set size exceeded usize::MAX")?; } } + self.active_end_pos.truncate(write); + self.active.truncate(write); } while self.next_entry < self.store.len() { @@ -721,7 +752,7 @@ impl PileupEngine { // r[impl pileup_indel.refskips_included] // r[related record_store.field_access] // r[impl perf.reuse_alignment_vec+2] - let mut alignments = Vec::with_capacity(self.active.len()); + self.buf.clear(); for active in &self.active { let Some(info) = active.cigar.pos_info_at(pos) else { continue }; @@ -742,7 +773,7 @@ impl PileupEngine { CigarPosInfo::RefSkip => PileupOp::RefSkip, }; - alignments.push(PileupAlignment { + self.buf.push(PileupAlignment { op, mapq: active.mapq, flags: active.flags, @@ -756,20 +787,20 @@ impl PileupEngine { // r[impl pileup.max_depth_per_position] if let Some(max) = self.max_depth { - alignments.truncate(max as usize); + self.buf.truncate(max as usize); } - if !alignments.is_empty() { + if !self.buf.is_empty() { self.columns_produced = self.columns_produced.saturating_add(1); #[expect( clippy::cast_possible_truncation, reason = "depth is bounded by max_depth (u32) or typical pileup sizes; saturates at u32::MAX for profiling" )] - let depth_u32 = alignments.len() as u32; + let depth_u32 = self.buf.len() as u32; self.max_active_depth = self.max_active_depth.max(depth_u32); let reference_base = self.ref_seq.as_ref().map_or(Base::Unknown, |r| r.base_at(pos)); - return Some((pos, reference_base, alignments)); + return Some((pos, reference_base)); } } } @@ -777,17 +808,17 @@ impl PileupEngine { // r[impl pileup.lending_iterator] /// Advance to the next pileup column. /// - /// Returns `Some(PileupColumn<'_, U>)` borrowing the record store. Call - /// this in a `while let` loop. The column is valid until the next call to - /// `pileups` on the same engine. + /// Returns `Some(PileupColumn<'_, U>)` borrowing the engine's record store + /// and alignment buffer. Call this in a `while let` loop. The column is + /// valid until the next call to `pileups` on the same engine. /// - /// This is a lending iterator — the returned column holds a borrow of the - /// engine's store, so it cannot be collected into a `Vec` or held across - /// subsequent `pileups` calls. Extract primitive data (pos, depth, etc.) - /// if you need to retain it. + /// This is a lending iterator — the returned column borrows from the engine, + /// so it cannot be collected into a `Vec` or held across subsequent + /// `pileups` calls. Extract primitive data (pos, depth, etc.) if you need + /// to retain it. pub fn pileups(&mut self) -> Option> { - let (pos, reference_base, alignments) = self.advance()?; - Some(PileupColumn { pos, reference_base, alignments, store: &self.store }) + let (pos, reference_base) = self.advance()?; + Some(PileupColumn { pos, reference_base, alignments: &self.buf, store: &self.store }) } /// Remaining positions in the current region — lower-bound estimate for @@ -845,4 +876,167 @@ mod tests { let ref_seq = RefSeq::new(Rc::from([]), Pos0::new(100).unwrap()); assert_eq!(ref_seq.base_at(Pos0::new(100).unwrap()), Base::Unknown); } + + // r[verify pileup.internal_buf] + /// The internal buffer must retain capacity across columns so the + /// engine doesn't keep re-allocating. + #[test] + fn internal_buf_retains_capacity() { + use crate::bam::cigar::{CigarOp, CigarOpType}; + use seqair_types::{BamFlags, Base}; + + let mut store = RecordStore::new(); + store + .push_fields( + Pos0::new(100).unwrap(), + Pos0::new(104).unwrap(), + BamFlags::empty(), + 30, + 5, + 0, + b"read1", + &[CigarOp::new(CigarOpType::Match, 5)], + &[Base::A; 5], + &[40; 5], + &[], + 0, + -1, + 0, + 0, + &mut (), + ) + .unwrap(); + + let mut engine = PileupEngine::new(store, Pos0::new(100).unwrap(), Pos0::new(104).unwrap()); + + // First column — buf grows to accommodate alignments. + let col1 = engine.pileups().unwrap(); + assert!(col1.depth() > 0, "should have at least one alignment"); + drop(col1); + let cap_after_first = engine.buf.capacity(); + assert!(cap_after_first > 0, "buffer should have allocated"); + + // Second column — same buffer, capacity must be retained. + assert!(engine.pileups().is_some()); + assert_eq!(engine.buf.capacity(), cap_after_first, "buffer capacity retained across calls"); + } + + // r[verify pileup.eviction_preserves_order] + /// After eviction of expired records, survivors must remain in insertion order. + /// With the old `swap_remove` eviction, removing the shortest-lived record would + /// swap in the last record, scrambling the order (e.g. [0,1,2,3,4] → [4,1,2,3]). + /// Stable retain keeps them in order ([1,2,3,4]). + #[test] + fn eviction_preserves_insertion_order() { + use crate::bam::cigar::{CigarOp, CigarOpType}; + use seqair_types::{BamFlags, Base}; + + let mut store = RecordStore::new(); + + // 5 records at pos=0 with spans 10, 20, 30, 40, 50. + // Record 0 (span 10) covers [0,9], record 1 (span 20) covers [0,19], etc. + // At pos=10, record 0 is evicted. Old swap_remove would have moved record 4 + // into slot 0; stable retain keeps 1,2,3,4 in insertion order. + for (i, span) in [10u32, 20, 30, 40, 50].iter().enumerate() { + let end = span.saturating_sub(1); + store + .push_fields( + Pos0::new(0).unwrap(), + Pos0::new(end).unwrap(), + BamFlags::empty(), + 30, + *span, + 0, + format!("read{i}").as_bytes(), + &[CigarOp::new(CigarOpType::Match, *span)], + &vec![Base::A; *span as usize], + &vec![40u8; *span as usize], + &[], + 0, + -1, + 0, + 0, + &mut (), + ) + .unwrap(); + } + + let mut engine = PileupEngine::new(store, Pos0::new(0).unwrap(), Pos0::new(49).unwrap()); + + let mut columns = Vec::new(); + while let Some(col) = engine.pileups() { + let indices: Vec = col.alignments().map(|a| a.record_idx()).collect(); + columns.push((col.pos(), indices)); + } + + // Pre-eviction: pos 5 — all 5 records in insertion order + let at_5 = columns.iter().find(|(p, _)| *p == Pos0::new(5).unwrap()).unwrap(); + assert_eq!(at_5.1, vec![0, 1, 2, 3, 4]); + + // After first eviction (record 0 expired at pos=10): pos 15 — [1,2,3,4] + let at_15 = columns.iter().find(|(p, _)| *p == Pos0::new(15).unwrap()).unwrap(); + assert_eq!(at_15.1, vec![1, 2, 3, 4]); + + // After second eviction (record 1 expired at pos=20): pos 25 — [2,3,4] + let at_25 = columns.iter().find(|(p, _)| *p == Pos0::new(25).unwrap()).unwrap(); + assert_eq!(at_25.1, vec![2, 3, 4]); + + // After third eviction (record 2 expired at pos=30): pos 35 — [3,4] + let at_35 = columns.iter().find(|(p, _)| *p == Pos0::new(35).unwrap()).unwrap(); + assert_eq!(at_35.1, vec![3, 4]); + + // After fourth eviction (record 3 expired at pos=40): pos 45 — [4] + let at_45 = columns.iter().find(|(p, _)| *p == Pos0::new(45).unwrap()).unwrap(); + assert_eq!(at_45.1, vec![4]); + } + + // r[verify pileup.max_depth_order] + /// max_depth truncation must keep the first N alignments in insertion order, + /// not arbitrary ones. After the stable-retain fix, the insertion order is + /// preserved through eviction, so truncation drops predictable (last N) entries. + #[test] + fn max_depth_truncates_in_insertion_order() { + use crate::bam::cigar::{CigarOp, CigarOpType}; + use seqair_types::{BamFlags, Base}; + + let mut store = RecordStore::new(); + + // 5 records at pos=0, all spanning the full 0..50 range. + for i in 0..5u32 { + store + .push_fields( + Pos0::new(0).unwrap(), + Pos0::new(49).unwrap(), + BamFlags::empty(), + 30, + 50, + 0, + format!("read{i}").as_bytes(), + &[CigarOp::new(CigarOpType::Match, 50)], + &vec![Base::A; 50], + &vec![40u8; 50], + &[], + 0, + -1, + 0, + 0, + &mut (), + ) + .unwrap(); + } + + let mut engine = PileupEngine::new(store, Pos0::new(0).unwrap(), Pos0::new(49).unwrap()); + engine.set_max_depth(3); + + // At every position, only the first 3 records (indices 0,1,2) should be kept. + while let Some(col) = engine.pileups() { + let indices: Vec = col.alignments().map(|a| a.record_idx()).collect(); + assert_eq!( + indices, + vec![0, 1, 2], + "max_depth=3 should keep first 3 records in insertion order at pos {:?}", + col.pos() + ); + } + } } diff --git a/crates/seqair/src/bam/reader.rs b/crates/seqair/src/bam/reader.rs index 21f441c6..08a0a070 100644 --- a/crates/seqair/src/bam/reader.rs +++ b/crates/seqair/src/bam/reader.rs @@ -106,12 +106,6 @@ pub struct IndexedBamReader { /// large sequential reads that don't benefit from `BufReader`). bulk_reader: R, shared: Arc, - /// When `true`, `fetch_into` keeps placed-unmapped reads (flag 0x4 with a - /// valid tid) so the caller's `filter_raw` / `filter` can decide. Default - /// `false` preserves the legacy behavior (reader drops them silently). - /// Fully unmapped reads (tid = -1) are filtered separately by the BAI - /// chunk lookup and never reach this branch regardless. - keep_unmapped: bool, } impl std::fmt::Debug for IndexedBamReader { @@ -135,7 +129,6 @@ impl IndexedBamReader { Ok(IndexedBamReader { bulk_reader: bulk_file, shared: Arc::new(BamShared { index, header, bam_path: path.to_path_buf() }), - keep_unmapped: false, }) } @@ -148,11 +141,7 @@ impl IndexedBamReader { let bulk_file = File::open(&self.shared.bam_path) .map_err(|source| BamError::Open { path: self.shared.bam_path.clone(), source })?; - Ok(IndexedBamReader { - bulk_reader: bulk_file, - shared: Arc::clone(&self.shared), - keep_unmapped: self.keep_unmapped, - }) + Ok(IndexedBamReader { bulk_reader: bulk_file, shared: Arc::clone(&self.shared) }) } } @@ -167,7 +156,6 @@ impl IndexedBamReader>> { Ok(IndexedBamReader { bulk_reader: std::io::Cursor::new(bam_data), shared: Arc::new(BamShared { index, header, bam_path: PathBuf::from("") }), - keep_unmapped: false, }) } } @@ -182,25 +170,6 @@ impl IndexedBamReader { &self.shared.header } - // r[impl bam.reader.unmapped_skipped+2] - /// Keep placed-unmapped reads (flag 0x4 with a valid tid) in the fetch - /// stream so the customize layer's `filter_raw` / `filter` decides their - /// fate. Default `false` mirrors htslib's pileup behavior. Useful for - /// downstream tools (e.g. perbase `only-depth -x`) that want - /// htslib-`view`-equivalent semantics where placed-unmapped reads still - /// reach user code. - /// - /// Builder-style: returns `self` for chaining after `open` / `fork`. - pub fn keep_unmapped(mut self, keep: bool) -> Self { - self.keep_unmapped = keep; - self - } - - /// Inspect the current `keep_unmapped` setting. - pub fn keeps_unmapped(&self) -> bool { - self.keep_unmapped - } - // r[impl bam.reader.fetch_into+2] // r[impl bam.reader.overlap_filter] // r[impl bam.reader.sorted_order+2] @@ -243,7 +212,6 @@ impl IndexedBamReader { let tid_i32 = validate_tid(tid)?; let mut skipped_tid: u32 = 0; - let mut skipped_unmapped: u32 = 0; let mut skipped_out_of_range: u32 = 0; let mut accepted: u32 = 0; let mut kept_count: usize = 0; @@ -282,7 +250,7 @@ impl IndexedBamReader { return Err(BamError::TruncatedRecord { offset: current_voff.0 }); } - // r[impl bam.reader.unmapped_skipped+2] + // r[impl bam.reader.unmapped_skipped] debug_assert!(raw.len() >= 32, "raw record too short: {}", raw.len()); #[allow(clippy::indexing_slicing, reason = "raw.len() >= 32 checked above")] let rec_tid = i32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]); @@ -298,12 +266,12 @@ impl IndexedBamReader { continue; } - if rec_flags.is_unmapped() && !self.keep_unmapped { - skipped_unmapped = skipped_unmapped.saturating_add(1); - continue; - } - - let rec_end = compute_end_pos_from_raw(raw).unwrap_or(rec_pos); + // r[impl record_store.end_pos_htslib] + let rec_end = if rec_flags.is_unmapped() { + rec_pos + } else { + compute_end_pos_from_raw(raw).unwrap_or(rec_pos) + }; if rec_pos > end || rec_end < start { skipped_out_of_range = skipped_out_of_range.saturating_add(1); continue; diff --git a/crates/seqair/src/bam/record_store.rs b/crates/seqair/src/bam/record_store.rs index 62f0ed3c..7f077867 100644 --- a/crates/seqair/src/bam/record_store.rs +++ b/crates/seqair/src/bam/record_store.rs @@ -21,7 +21,12 @@ use seqair_types::{BamFlags, Base, BaseQuality, Pos0}; pub struct SlimRecord { /// 0-based leftmost aligned reference position. pub pos: Pos0, - /// 0-based exclusive end (`pos` + reference-consuming CIGAR length). + /// 0-based exclusive end, htslib-compatible. + /// + /// For mapped reads: computed from CIGAR (`pos` + reference-consuming op + /// lengths). For unmapped reads: equals [`pos`](Self::pos) — htslib ignores + /// the CIGAR and reports `end = start + 1`, which in 0-based exclusive + /// semantics is `pos`. pub end_pos: Pos0, // r[impl flags.field_type] /// BAM flag bits (paired, unmapped, reverse strand, …). @@ -359,7 +364,7 @@ impl RecordStore { let packed_seq_slice = &raw[h.cigar_end..h.seq_end]; if !customize.filter_raw(&FilterRawFields { pos: h.pos, - end_pos: h.pos, // not yet computed from CIGAR + end_pos: None, flags: h.flags, mapq: h.mapq, n_cigar_ops: h.n_cigar_ops, @@ -373,25 +378,27 @@ impl RecordStore { qname: qname_stripped, qual_bytes: qual_slice, aux_bytes: aux_slice, - raw_cigar_bytes: Some(cigar_bytes), - cigar_ops: None, - packed_seq: Some(packed_seq_slice), - bases: None, + cigar: cigar_bytes, + seq: Sequence::Packed(packed_seq_slice), }) { return Ok(None); } } - // --- Write into cigar slab (typed; bulk memcpy on LE) --- + // --- Write into cigar slab --- // r[impl record_store.checked_offsets] let cigar_off = u32::try_from(self.cigar.len()).map_err(|_| DecodeError::SlabOverflow)?; CigarOp::extend_from_bam_bytes(&mut self.cigar, cigar_bytes); let cigar_start = cigar_off as usize; #[allow(clippy::indexing_slicing, reason = "just appended; offsets within slab bounds")] let cigar_ops = &self.cigar[cigar_start..]; - - let end_pos = cigar::compute_end_pos(h.pos, cigar_ops) - .ok_or(DecodeError::InvalidPosition { value: h.pos.as_i32() })?; + // r[impl record_store.end_pos_htslib] + let end_pos = if h.flags.is_unmapped() { + h.pos + } else { + cigar::compute_end_pos(h.pos, cigar_ops) + .ok_or(DecodeError::InvalidPosition { value: h.pos.as_i32() })? + }; let (matching_bases, indel_bases) = cigar::calc_matches_indels(cigar_ops); // --- Write into name slab --- @@ -548,7 +555,7 @@ impl RecordStore { // r[impl record_store.filter_raw] if !customize.filter_raw(&FilterRawFields { pos, - end_pos, + end_pos: Some(end_pos), flags, mapq, n_cigar_ops, @@ -562,10 +569,8 @@ impl RecordStore { qname, qual_bytes: qual, aux_bytes: aux, - raw_cigar_bytes: None, - cigar_ops: Some(cigar_ops), - packed_seq: None, - bases: Some(bases), + cigar: CigarOp::ops_as_bytes(cigar_ops), + seq: Sequence::Bases(bases), }) { return Ok(None); } @@ -642,6 +647,21 @@ impl RecordStore { } } +// r[impl record_store.filter_raw_fields] +/// The form in which the CIGAR data is available in [`FilterRawFields`]. +/// +/// In the `push_raw` path (BAM), the CIGAR is available as raw BAM bytes +/// In `push_raw` (BAM), the sequence is in 4-bit packed BAM format. +/// In `push_fields` (SAM/CRAM), it has already been decoded into typed +/// [`Base`] values. +#[derive(Debug, Clone, Copy)] +pub enum Sequence<'a> { + /// 4-bit packed BAM sequence bytes (from `push_raw`). + Packed(&'a [u8]), + /// Decoded sequence bases (from `push_fields`). + Bases(&'a [Base]), +} + // r[impl record_store.filter_raw_fields] /// Fields available to [`CustomizeRecordStore::filter_raw`] before any slab /// data is written — the raw slices and parsed header fields from the incoming @@ -653,25 +673,31 @@ impl RecordStore { /// # `push_raw` vs `push_fields` /// /// From `push_raw` (BAM binary decode): -/// - `raw_cigar_bytes` is `Some` (the raw BAM CIGAR bytes). -/// - `packed_seq` is `Some` (4-bit packed sequence bytes). -/// - `cigar_ops` and `bases` are `None` (not yet decoded). -/// - `end_pos`, `matching_bases`, `indel_bases` are not yet computed (set to -/// `pos`, `0`, `0` respectively). +/// - [`seq`](Self::seq) is [`Sequence::Packed`] (4-bit packed sequence bytes). +/// - [`end_pos`](Self::end_pos) is `None` (not yet computed from CIGAR). +/// - [`matching_bases`](Self::matching_bases) and [`indel_bases`](Self::indel_bases) +/// are `0` (not yet computed from CIGAR). /// /// From `push_fields` (pre-parsed SAM/CRAM fields): -/// - `cigar_ops` is `Some` (decoded CIGAR ops). -/// - `bases` is `Some` (decoded sequence bases). -/// - `raw_cigar_bytes` and `packed_seq` are `None`. -/// - `end_pos`, `matching_bases`, `indel_bases` are the pre-computed values. +/// - [`seq`](Self::seq) is [`Sequence::Bases`] (decoded sequence bases). +/// - [`end_pos`](Self::end_pos) is `Some(_)` (pre-computed). +/// - [`matching_bases`](Self::matching_bases) and [`indel_bases`](Self::indel_bases) +/// are the pre-computed values. +/// +/// [`cigar`](Self::cigar) is always raw BAM CIGAR bytes (`&[u8]`). Use +/// [`CigarOp::slice_from_bam_bytes`] or [`CigarOp::extend_from_bam_bytes`] +/// for typed access. #[derive(Debug)] #[non_exhaustive] pub struct FilterRawFields<'a> { /// 0-based reference position. pub pos: Pos0, - /// Exclusive end position (computed from CIGAR; `pos` for `push_raw` since - /// CIGAR is not yet decoded). - pub end_pos: Pos0, + /// Exclusive end position, computed from CIGAR. + /// + /// `Some(_)` from `push_fields` (SAM/CRAM) where the CIGAR is pre-parsed. + /// `None` from `push_raw` (BAM) because the CIGAR hasn't been decoded yet + /// at this point — `filter_raw` runs before any slab work. + pub end_pos: Option, /// BAM flags. pub flags: BamFlags, /// Mapping quality (0..254). @@ -698,14 +724,13 @@ pub struct FilterRawFields<'a> { pub qual_bytes: &'a [u8], /// Raw auxiliary tag bytes (BAM binary format). pub aux_bytes: &'a [u8], - /// Raw BAM CIGAR bytes (`push_raw` path); `None` from `push_fields`. - pub raw_cigar_bytes: Option<&'a [u8]>, - /// Decoded CIGAR ops (`push_fields` path); `None` from `push_raw`. - pub cigar_ops: Option<&'a [CigarOp]>, - /// 4-bit packed sequence bytes (`push_raw` path); `None` from `push_fields`. - pub packed_seq: Option<&'a [u8]>, - /// Decoded sequence bases (`push_fields` path); `None` from `push_raw`. - pub bases: Option<&'a [Base]>, + /// Raw BAM CIGAR bytes. Use [`CigarOp::slice_from_bam_bytes`] for zero-cost + /// typed access on aligned little-endian input, or + /// [`CigarOp::extend_from_bam_bytes`] to copy into a `Vec`. + pub cigar: &'a [u8], + /// Read sequence. [`Sequence::Packed`] from `push_raw` (BAM), + /// [`Sequence::Bases`] from `push_fields` (SAM/CRAM). + pub seq: Sequence<'a>, } // r[impl record_store.customize.trait] /// Customize how records flow into a [`RecordStore`]: filter and compute @@ -733,6 +758,54 @@ pub struct FilterRawFields<'a> { /// (mapq, flags, tid, qname prefix) that can reject records without /// touching the slabs. /// +/// # `compute` before `filter` — wasted work +/// +/// `compute` runs *before* `filter`. If your `compute` is expensive (aux +/// parsing, base counting, read group extraction) and your `filter` might +/// reject many records, that work is wasted. When possible, push rejection +/// logic into `filter_raw` instead — it runs before any slab extension. +/// +/// For checks that require slab data (aux tags, CIGAR, sequence), you have +/// two options: +/// +/// 1. **Accept the waste.** If your `filter` rejects few records and your +/// `compute` needs the same slab data anyway, the waste is negligible. +/// 2. **Inline the check into `filter_raw`.** Parse the raw bytes directly +/// to reject early. This avoids slab writes AND `compute` for rejected +/// records. The second example below shows this pattern for read groups. +/// +/// # Example: pre-filter on read group +/// +/// A customizer that only keeps reads from a specific read group. Rather +/// than relying on `filter` (which would waste `compute` on every rejected +/// record), it scans the raw aux bytes in `filter_raw` — before any slab +/// work happens: +/// +/// ``` +/// use seqair::bam::record_store::{CustomizeRecordStore, FilterRawFields, RecordStore, SlimRecord}; +/// +/// #[derive(Clone)] +/// struct ReadGroupFilter { +/// /// Expected read group, e.g. `b"@RG\tID:mygroup"`. +/// target_rg: Vec, +/// } +/// +/// impl CustomizeRecordStore for ReadGroupFilter { +/// type Extra = (); +/// +/// /// Reject records whose RG tag doesn't match, before any slab extension. +/// fn filter_raw(&mut self, fields: &FilterRawFields<'_>) -> bool { +/// // Quick: if the aux bytes don't contain our target string at all, +/// // skip the record. No slab work done. +/// fields.aux_bytes +/// .windows(self.target_rg.len()) +/// .any(|w| w == self.target_rg.as_slice()) +/// } +/// +/// fn compute(&mut self, _: &SlimRecord, _: &RecordStore<()>) {} +/// } +/// ``` +/// /// The same `&mut E` instance is reused across regions (`Readers` holds one /// customize value for its whole lifetime). Implementors that want /// per-region state MUST reset it themselves between fetches via @@ -819,6 +892,68 @@ impl CustomizeRecordStore for () { fn compute(&mut self, _rec: &SlimRecord, _store: &RecordStore<()>) {} } +/// Drop unmapped reads at the earliest possible point. +/// +/// This is the simplest possible [`CustomizeRecordStore`] — it implements +/// only [`filter_raw`](CustomizeRecordStore::filter_raw) to reject unmapped +/// reads before any slab extension or base decode work happens. Because +/// [`filter_raw`](CustomizeRecordStore::filter_raw) runs before CIGAR is +/// decoded in the BAM path, it can only check fields that are always +/// available: `flags`, `mapq`, `tid`, `qname`, etc. +/// +/// Pass this to [`Readers::open_customized`](crate::reader::Readers::open_customized) +/// to get htslib-compatible behavior where unmapped reads never enter the record +/// store. The default [`Readers::open`](crate::reader::Readers::open) passes +/// `()` as the customizer, which keeps unmapped reads — the pileup engine will +/// exclude them from columns but they remain in the store for other uses. +/// +/// # Example +/// +/// ```no_run +/// use seqair::bam::record_store::RejectUnmapped; +/// use seqair::reader::Readers; +/// use std::path::Path; +/// +/// # fn main() -> Result<(), Box> { +/// let mut readers = Readers::open_customized( +/// Path::new("sample.bam"), +/// Path::new("reference.fa"), +/// RejectUnmapped, +/// )?; +/// # Ok(()) +/// # } +/// ``` +/// +/// # Source +/// +/// The entire implementation: +/// +/// ``` +/// # use seqair::bam::record_store::{CustomizeRecordStore, FilterRawFields, RecordStore, SlimRecord}; +/// #[derive(Clone, Default)] +/// pub struct RejectUnmapped; +/// +/// impl CustomizeRecordStore for RejectUnmapped { +/// type Extra = (); +/// fn filter_raw(&mut self, fields: &FilterRawFields<'_>) -> bool { +/// !fields.flags.is_unmapped() +/// } +/// fn compute(&mut self, _: &SlimRecord, _: &RecordStore<()>) {} +/// } +/// ``` +#[derive(Clone, Default)] +pub struct RejectUnmapped; + +impl CustomizeRecordStore for RejectUnmapped { + type Extra = (); + #[inline] + fn filter_raw(&mut self, fields: &FilterRawFields<'_>) -> bool { + !fields.flags.is_unmapped() + } + #[inline] + fn compute(&mut self, _rec: &SlimRecord, _store: &RecordStore<()>) {} +} + impl RecordStore { // --- Ordering --- @@ -1004,8 +1139,13 @@ impl RecordStore { let n_cigar_ops = u16::try_from(n_ops).map_err(|_| DecodeError::CigarOpCountOverflow { count: n_ops })?; - let end_pos = cigar::compute_end_pos(new_pos, new_cigar_ops) - .ok_or(DecodeError::InvalidPosition { value: new_pos.as_i32() })?; + // r[impl record_store.end_pos_htslib] + let end_pos = if rec.flags.is_unmapped() { + new_pos + } else { + cigar::compute_end_pos(new_pos, new_cigar_ops) + .ok_or(DecodeError::InvalidPosition { value: new_pos.as_i32() })? + }; let (matching_bases, indel_bases) = cigar::calc_matches_indels(new_cigar_ops); @@ -1518,6 +1658,155 @@ mod tests { assert_eq!(store.record(0).next_ref_id, 3); } + // r[verify record_store.end_pos_htslib] + #[test] + fn end_pos_mapped_uses_cigar() { + use seqair_types::{BamFlags, Base}; + let mut store = RecordStore::new(); + store + .push_fields( + Pos0::new(100).unwrap(), + Pos0::new(104).unwrap(), // 0-based inclusive, 5M: span = [100, 104] + BamFlags::empty(), // mapped + 30, + 5, + 0, + b"read1", + &[CigarOp::new(cigar::CigarOpType::Match, 5)], + &[Base::A; 5], + &[30; 5], + &[], + 0, + -1, + 0, + 0, + &mut (), + ) + .unwrap(); + let rec = store.record(0); + assert_eq!(rec.end_pos, Pos0::new(104).unwrap()); + } + + // r[verify record_store.end_pos_htslib] + #[test] + fn end_pos_unmapped_ignores_cigar() { + use seqair_types::bam_flags::consts::FLAG_UNMAPPED; + use seqair_types::{BamFlags, Base}; + let mut store = RecordStore::new(); + store + .push_fields( + Pos0::new(100).unwrap(), + Pos0::new(100).unwrap(), // htslib-compatible: end_pos = pos for unmapped + BamFlags::from(FLAG_UNMAPPED), // unmapped + 0, + 100, + 0, + b"read1", + &[CigarOp::new(cigar::CigarOpType::Match, 100)], + &[Base::A; 100], + &[0; 100], + &[], + 0, + -1, + 0, + 0, + &mut (), + ) + .unwrap(); + let rec = store.record(0); + assert!(rec.flags.is_unmapped()); + assert_eq!(rec.end_pos, Pos0::new(100).unwrap()); + } + + // r[verify record_store.end_pos_htslib] + /// Verify the CRAM unmapped-read code path semantics: unmapped reads pushed + /// via `push_fields` (the path CRAM uses, `cram/slice.rs:649-705`) have + /// `end_pos == pos`, `mapq == 0`, and correct flags/sequence data. + #[test] + fn push_fields_unmapped_cram_semantics() { + use seqair_types::bam_flags::consts::FLAG_UNMAPPED; + use seqair_types::{BamFlags, Base}; + + let mut store = RecordStore::new(); + let pos = Pos0::new(200).unwrap(); + let bases = &[Base::A, Base::C, Base::G, Base::T]; + let quals = &[30u8, 31, 32, 33]; + let aux = b"RGZ\x00read_group"; + store + .push_fields( + pos, + pos, // end_pos = pos (htslib-compatible, as CRAM does at slice.rs:676) + BamFlags::from(FLAG_UNMAPPED), + 0, // mapq = 0 (CRAM slice.rs:681) + 0, // matching_bases = 0 (slice.rs:682) + 0, // indel_bases = 0 (slice.rs:683) + b"cram_unmapped_read", + &[], // empty CIGAR (slice.rs:685) + bases, + quals, + aux, + 0, // tid + -1, // next_ref_id + 0, // next_pos + 0, // template_len + &mut (), + ) + .unwrap(); + + let rec = store.record(0); + assert!(rec.flags.is_unmapped(), "CRAM unmapped reads must have FLAG_UNMAPPED"); + assert_eq!(rec.mapq, 0, "CRAM unmapped reads must have MAPQ=0"); + assert_eq!(rec.end_pos, pos, "CRAM unmapped reads must have end_pos == pos"); + assert_eq!(rec.seq_len, 4, "CRAM unmapped reads retain decoded sequence"); + assert_eq!(rec.matching_bases, 0, "CRAM unmapped reads have zero matching_bases"); + assert_eq!(rec.indel_bases, 0, "CRAM unmapped reads have zero indel_bases"); + assert_eq!(rec.qname(&store).unwrap(), b"cram_unmapped_read"); + } + + // r[verify record_store.end_pos_htslib] + /// Verify that `compute_end_pos_from_raw` (used by the BAM reader's pre-push + /// overlap filter) agrees with the stored `end_pos` after `push_raw`, for + /// both mapped and unmapped reads. + #[test] + fn push_raw_end_pos_agrees_with_reader_prefilter() { + use seqair_types::bam_flags::consts::FLAG_UNMAPPED; + + // --- Mapped read --- + let mut mapped = make_raw_record(b"read1", 10, 1); // 1 CIGAR op (10M) + mapped[4..8].copy_from_slice(&100i32.to_le_bytes()); // pos = 100 + let reader_end = record::compute_end_pos_from_raw(&mapped).unwrap(); + let mut store = RecordStore::new(); + store.push_raw(&mapped, &mut ()).unwrap(); + assert_eq!( + store.record(0).end_pos, + reader_end, + "mapped: reader's compute_end_pos_from_raw must agree with push_raw's stored end_pos" + ); + + // --- Unmapped read --- + let mut unmapped = make_raw_record(b"read2", 10, 1); // 1 CIGAR op (10M) + unmapped[4..8].copy_from_slice(&100i32.to_le_bytes()); // pos = 100 + unmapped[14..16].copy_from_slice(&FLAG_UNMAPPED.to_le_bytes()); // flag 0x4 + let reader_end = record::compute_end_pos_from_raw(&unmapped).unwrap(); + // compute_end_pos_from_raw computes CIGAR-derived (doesn't check flags). + // After refactor, push_raw stores pos for unmapped reads. + let mut store2 = RecordStore::new(); + store2.push_raw(&unmapped, &mut ()).unwrap(); + assert!(store2.record(0).flags.is_unmapped()); + assert_eq!( + store2.record(0).end_pos, + Pos0::new(100).unwrap(), + "unmapped: push_raw stores pos (htslib-compatible), not CIGAR-derived end" + ); + // Verify they differ: reader's prefilter returns CIGAR-derived (which is fine + // for overlap checking — it's a loose upper bound), while store is exact. + assert_ne!( + store2.record(0).end_pos, + reader_end, + "reader prefilter end_pos (CIGAR-derived) differs from stored end_pos (pos) for unmapped reads" + ); + } + // ---- Model-based property tests for rollback ---- // // The rollback invariant: pushing a sequence of records with per-record diff --git a/crates/seqair/src/cram/slice.rs b/crates/seqair/src/cram/slice.rs index b37a208d..53dfe8cc 100644 --- a/crates/seqair/src/cram/slice.rs +++ b/crates/seqair/src/cram/slice.rs @@ -647,14 +647,58 @@ fn decode_record( } // r[impl cram.edge.unmapped_reads] - // Unmapped reads — skip for fetch_into (same as BAM/SAM) + // Unmapped read — push to store so filter_raw can decide. + // htslib compat: end_pos = pos (ignore CIGAR for unmapped reads). + // + // Apply the same foreign-tid and overlap checks as the mapped path + // so unmapped reads from other references in multi-ref slices or + // outside the query region don't pollute the store. + #[expect( + clippy::cast_possible_wrap, + reason = "tid comes from BAM header, capped at MAX_REFERENCES (1M), well within i32" + )] + let is_multi_ref_unmapped = is_multi_ref && record_ref_id != tid as i32 && record_ref_id != -1; + if is_multi_ref_unmapped || pos_0based > query_end { + return Ok(( + false, + SliceMateInfo { + pos: pos_0based.as_i64(), + end_pos: pos_0based.as_i64(), + mate_line, + store_idx: None, + bam_flags: raw_flags, + ref_id: record_ref_id, + }, + )); + } + + let qname: &[u8] = name_buf; + let end_pos = pos_0based; + let store_idx = store.push_fields( + pos_0based, + end_pos, + bam_flags, + 0, // mapq — unmapped reads have MAPQ=0 per spec + 0, // matching_bases + 0, // indel_bases + qname, + &[], // empty cigar — unmapped reads don't have features decoded + bases_buf, + qual_buf, + aux_buf, + record_ref_id, + next_ref_id_val, + next_pos_val, + template_len_val, + customize, + )?; Ok(( - false, + true, SliceMateInfo { - pos: -1, - end_pos: -1, - mate_line: -1, - store_idx: None, + pos: pos_0based.as_i64(), + end_pos: end_pos.as_i64(), + mate_line, + store_idx, bam_flags: raw_flags, ref_id: record_ref_id, }, diff --git a/crates/seqair/src/reader/readers.rs b/crates/seqair/src/reader/readers.rs index 58987eb1..ee35fbe8 100644 --- a/crates/seqair/src/reader/readers.rs +++ b/crates/seqair/src/reader/readers.rs @@ -192,9 +192,11 @@ impl Readers { tid: u32, start: Pos0, end: Pos0, - store: &mut RecordStore<()>, + store: &mut RecordStore, ) -> Result { - self.alignment.fetch_into(tid, start, end, store) + self.alignment + .fetch_into_customized(tid, start, end, store, &mut self.customize) + .map(|c| c.kept) } /// Access the customize value, e.g. to inspect any internal counters it carries. diff --git a/crates/seqair/src/sam/reader.rs b/crates/seqair/src/sam/reader.rs index a88b9831..28a73b4c 100644 --- a/crates/seqair/src/sam/reader.rs +++ b/crates/seqair/src/sam/reader.rs @@ -482,11 +482,6 @@ fn parse_sam_line( .ok_or_else(|| SamRecordError::InvalidFlag { value: flag_field.into() })?, ); - // Skip unmapped - if flags.is_unmapped() { - return Ok(None); - } - // r[impl sam.edge.rname_star] let rname = fields.get(2).copied().unwrap_or(b"*"); if rname == b"*" { diff --git a/crates/seqair/tests/bam_write_roundtrip.proptest-regressions b/crates/seqair/tests/bam_write_roundtrip.proptest-regressions new file mode 100644 index 00000000..cfb672b1 --- /dev/null +++ b/crates/seqair/tests/bam_write_roundtrip.proptest-regressions @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc 602ee9fc1af50d28393c25b01b7acd9c4833a6d3b46c2db7e2d4b7449f564993 # shrinks to mut inputs = [E2eInput { qname: [97, 97, 97, 97, 97, 97], bases: [A], quals: [30], pos: 0, mapq: 0 }] diff --git a/crates/seqair/tests/compare_bam_with_htslib.rs b/crates/seqair/tests/compare_bam_with_htslib.rs index 828bb6b8..711911ac 100644 --- a/crates/seqair/tests/compare_bam_with_htslib.rs +++ b/crates/seqair/tests/compare_bam_with_htslib.rs @@ -14,8 +14,10 @@ mod helpers; use rust_htslib::bam::{self, FetchDefinition, Read as _, record::Aux}; -use seqair::bam::Pos0; -use seqair::bam::aux::{AuxValue, find_tag as find_aux_tag}; +use seqair::bam::{ + Pos0, RejectUnmapped, + aux::{AuxValue, find_tag as find_aux_tag}, +}; use seqair_types::BaseQuality; use std::path::Path; @@ -84,12 +86,14 @@ fn all_contigs_record_count_matches() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); assert_eq!( @@ -116,12 +120,14 @@ fn all_contigs_record_fields_match() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); for (i, h) in hts.iter().enumerate() { @@ -204,12 +210,14 @@ fn all_contigs_pileup_positions_and_depth_match() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); let mut engine = seqair::bam::PileupEngine::new( @@ -255,12 +263,14 @@ fn all_contigs_pileup_qpos_and_flags_match() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); let mut engine = seqair::bam::PileupEngine::new( @@ -312,12 +322,14 @@ fn all_contigs_pileup_bases_match() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); let mut engine = seqair::bam::PileupEngine::new( @@ -411,12 +423,14 @@ fn all_contigs_aux_tags_match() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); assert_eq!( diff --git a/crates/seqair/tests/compare_bam_with_noodles.rs b/crates/seqair/tests/compare_bam_with_noodles.rs index 7ca2cb0b..f0aa5fda 100644 --- a/crates/seqair/tests/compare_bam_with_noodles.rs +++ b/crates/seqair/tests/compare_bam_with_noodles.rs @@ -14,7 +14,7 @@ )] use noodles::bam; use noodles::sam; -use seqair::bam::Pos0; +use seqair::bam::{Pos0, RejectUnmapped}; use seqair_types::BaseQuality; use std::path::Path; @@ -135,12 +135,14 @@ fn bam_record_count_matches_noodles() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); // seqair's index-based fetch may include records starting before @@ -169,12 +171,14 @@ fn bam_record_fields_match_noodles() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); // Filter seqair records to match noodles' sequential filter (pos >= start). @@ -206,12 +210,14 @@ fn bam_sequence_matches_noodles() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); let seqair_indices: Vec = (0..store.len() as u32) @@ -257,12 +263,14 @@ fn bam_quality_scores_match_noodles() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); let seqair_indices: Vec = (0..store.len() as u32) @@ -291,12 +299,14 @@ fn bam_cigar_matches_noodles() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(contig).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("fetch"); let seqair_indices: Vec = (0..store.len() as u32) diff --git a/crates/seqair/tests/compare_cram_parsing.rs b/crates/seqair/tests/compare_cram_parsing.rs index 93fc3a47..e6cea484 100644 --- a/crates/seqair/tests/compare_cram_parsing.rs +++ b/crates/seqair/tests/compare_cram_parsing.rs @@ -13,7 +13,7 @@ reason = "test code with known small values" )] -use seqair::bam::Pos0; +use seqair::bam::{Pos0, RejectUnmapped}; use std::path::Path; fn test_cram_path() -> &'static Path { @@ -241,12 +241,14 @@ fn cram_records_match_bam_records() { let end = 6_143_229u64; let cram_count = cram_reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut cram_store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .unwrap(); hts.fetch(FetchDefinition::Region(tid as i32, start as i64, end as i64)).unwrap(); diff --git a/crates/seqair/tests/compare_cram_with_htslib.rs b/crates/seqair/tests/compare_cram_with_htslib.rs index ad29b140..15a76c69 100644 --- a/crates/seqair/tests/compare_cram_with_htslib.rs +++ b/crates/seqair/tests/compare_cram_with_htslib.rs @@ -15,7 +15,7 @@ reason = "test code with known small values" )] use rust_htslib::bam::{self, FetchDefinition, Read as _}; -use seqair::bam::{Pos0, RecordStore}; +use seqair::bam::{Pos0, RecordStore, RejectUnmapped}; use seqair::reader::Readers; use seqair_types::BaseQuality; use std::path::Path; @@ -135,7 +135,8 @@ fn cram_records_match_htslib_for_chr19() { let hts_records = fetch_hts_records(contig, start, end); for &(version, cram_path_fn) in CRAM_VERSIONS { - let mut readers = Readers::open(cram_path_fn(), test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path_fn(), test_fasta_path(), RejectUnmapped).unwrap(); let mut store = RecordStore::new(); let tid = readers.header().tid(contig).unwrap(); let cram_count = readers @@ -189,7 +190,8 @@ fn cram_mate_fields_match_htslib() { let hts_records = fetch_hts_records(contig, start, end); for &(version, cram_path_fn) in CRAM_VERSIONS { - let mut readers = Readers::open(cram_path_fn(), test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path_fn(), test_fasta_path(), RejectUnmapped).unwrap(); let mut store = RecordStore::new(); let tid = readers.header().tid(contig).unwrap(); readers @@ -237,7 +239,8 @@ fn cram_end_pos_matches_htslib_inclusive_convention() { let hts_records = fetch_hts_records(contig, start, end); for &(version, cram_path_fn) in CRAM_VERSIONS { - let mut readers = Readers::open(cram_path_fn(), test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path_fn(), test_fasta_path(), RejectUnmapped).unwrap(); let mut store = RecordStore::new(); let tid = readers.header().tid(contig).unwrap(); readers @@ -289,7 +292,8 @@ fn cram_quality_scores_match_htslib() { let hts_quals = fetch_hts_quals(contig, start, end); for &(version, cram_path_fn) in CRAM_VERSIONS { - let mut readers = Readers::open(cram_path_fn(), test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path_fn(), test_fasta_path(), RejectUnmapped).unwrap(); let mut store = RecordStore::new(); let tid = readers.header().tid(contig).unwrap(); readers @@ -326,7 +330,8 @@ fn cram_sequences_match_htslib() { let hts_seqs = fetch_hts_seqs(contig, start, end); for &(version, cram_path_fn) in CRAM_VERSIONS { - let mut readers = Readers::open(cram_path_fn(), test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path_fn(), test_fasta_path(), RejectUnmapped).unwrap(); let mut store = RecordStore::new(); let tid = readers.header().tid(contig).unwrap(); readers @@ -373,7 +378,8 @@ fn cram_sequences_match_htslib() { #[test] fn cram_fork_produces_same_records() { for &(version, cram_path_fn) in CRAM_VERSIONS { - let readers = Readers::open(cram_path_fn(), test_fasta_path()).unwrap(); + let readers = + Readers::open_customized(cram_path_fn(), test_fasta_path(), RejectUnmapped).unwrap(); let mut fork1 = readers.fork().unwrap(); let mut fork2 = readers.fork().unwrap(); diff --git a/crates/seqair/tests/compare_cram_with_noodles.rs b/crates/seqair/tests/compare_cram_with_noodles.rs index ef18ed00..e46f07d9 100644 --- a/crates/seqair/tests/compare_cram_with_noodles.rs +++ b/crates/seqair/tests/compare_cram_with_noodles.rs @@ -20,7 +20,7 @@ use noodles::cram; use noodles::fasta; use noodles::sam; use noodles::sam::alignment::record::Sequence as _; -use seqair::bam::{Pos0, RecordStore}; +use seqair::bam::{Pos0, RecordStore, RejectUnmapped}; use seqair::reader::Readers; use seqair_types::BaseQuality; use std::path::Path; @@ -143,7 +143,8 @@ fn cram_chr19_count_matches_noodles() { .filter(|r| r.ref_id == Some(0) && r.pos >= start as i64 && r.pos <= end as i64) .count(); - let mut readers = Readers::open(cram_path, test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path, test_fasta_path(), RejectUnmapped).unwrap(); let chr19_tid = readers.header().tid("chr19").unwrap(); let mut store = RecordStore::new(); readers @@ -182,7 +183,8 @@ fn cram_chr19_records_match_noodles_field_by_field() { .filter(|r| r.ref_id == Some(0) && r.pos >= start && r.pos <= end) .collect(); - let mut readers = Readers::open(cram_path, test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path, test_fasta_path(), RejectUnmapped).unwrap(); let chr19_tid = readers.header().tid("chr19").unwrap(); let mut store = RecordStore::new(); readers @@ -268,7 +270,8 @@ fn cram_end_pos_matches_noodles_inclusive_convention() { .filter(|r| r.ref_id == Some(0) && r.pos >= start && r.pos <= end) .collect(); - let mut readers = Readers::open(cram_path, test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path, test_fasta_path(), RejectUnmapped).unwrap(); let chr19_tid = readers.header().tid("chr19").unwrap(); let mut store = RecordStore::new(); readers @@ -316,7 +319,8 @@ fn cram_chr19_sequences_match_noodles() { .filter(|r| r.ref_id == Some(0) && r.pos >= start && r.pos <= end) .collect(); - let mut readers = Readers::open(cram_path, test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path, test_fasta_path(), RejectUnmapped).unwrap(); let chr19_tid = readers.header().tid("chr19").unwrap(); let mut store = RecordStore::new(); readers @@ -374,7 +378,8 @@ fn cram_chr19_quality_scores_match_noodles() { .filter(|r| r.ref_id == Some(0) && r.pos >= start && r.pos <= end) .collect(); - let mut readers = Readers::open(cram_path, test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path, test_fasta_path(), RejectUnmapped).unwrap(); let chr19_tid = readers.header().tid("chr19").unwrap(); let mut store = RecordStore::new(); readers @@ -418,7 +423,8 @@ fn cram_chr19_qnames_match_noodles() { .filter(|r| r.ref_id == Some(0) && r.pos >= start && r.pos <= end) .collect(); - let mut readers = Readers::open(cram_path, test_fasta_path()).unwrap(); + let mut readers = + Readers::open_customized(cram_path, test_fasta_path(), RejectUnmapped).unwrap(); let chr19_tid = readers.header().tid("chr19").unwrap(); let mut store = RecordStore::new(); readers diff --git a/crates/seqair/tests/compare_records.rs b/crates/seqair/tests/compare_records.rs index 3efe62fd..6705e310 100644 --- a/crates/seqair/tests/compare_records.rs +++ b/crates/seqair/tests/compare_records.rs @@ -14,7 +14,7 @@ )] use rust_htslib::bam::{self, Read as _}; -use seqair::bam::Pos0; +use seqair::bam::{Pos0, RejectUnmapped}; use seqair_types::BaseQuality; use std::path::Path; @@ -93,12 +93,14 @@ fn record_count_matches() { let mut store = seqair::bam::RecordStore::new(); let tid = reader.header().tid(TEST_REGION).expect("tid"); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(TEST_START as u32).unwrap(), Pos0::new(TEST_END as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("seqair fetch"); assert_eq!( @@ -110,95 +112,105 @@ fn record_count_matches() { ); } -// r[verify bam.reader.unmapped_skipped+2] +// r[verify bam.reader.unmapped_skipped] /// Region with both mapped and placed-unmapped reads (chr19:6104000-6106000 /// has 4 unmapped + 625 mapped per `samtools view -c -f 4`). const UNMAPPED_REGION: &str = "chr19"; const UNMAPPED_START: u64 = 6_104_000; const UNMAPPED_END: u64 = 6_106_000; -fn fetch_count(keep_unmapped: bool) -> usize { +/// Fetch records, optionally rejecting unmapped via filter_raw. +fn fetch_count(reject_unmapped: bool) -> usize { let bam_path = test_bam_path(); - let reader = seqair::bam::IndexedBamReader::open(bam_path).expect("open"); - let mut reader = reader.keep_unmapped(keep_unmapped); - assert_eq!(reader.keeps_unmapped(), keep_unmapped, "keep_unmapped flag round-trips"); - + let mut reader = seqair::bam::IndexedBamReader::open(bam_path).expect("open"); let tid = reader.header().tid(UNMAPPED_REGION).expect("tid"); let mut store = seqair::bam::RecordStore::new(); - reader - .fetch_into( - tid, - Pos0::new(UNMAPPED_START as u32).unwrap(), - Pos0::new(UNMAPPED_END as u32).unwrap(), - &mut store, - ) - .expect("fetch"); + if reject_unmapped { + reader + .fetch_into_customized( + tid, + Pos0::new(UNMAPPED_START as u32).unwrap(), + Pos0::new(UNMAPPED_END as u32).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) + .expect("fetch"); + } else { + reader + .fetch_into( + tid, + Pos0::new(UNMAPPED_START as u32).unwrap(), + Pos0::new(UNMAPPED_END as u32).unwrap(), + &mut store, + ) + .expect("fetch"); + } store.len() } #[test] -fn keep_unmapped_default_drops_placed_unmapped_reads() { - // Default behavior: unmapped reads filtered out. Compare against - // htslib's view -F 4 over the same region. +fn default_includes_unmapped_reads_in_store() { let bam_path = test_bam_path(); let mut hts = bam::IndexedReader::from_path(bam_path).expect("htslib open"); let tid = hts.header().tid(UNMAPPED_REGION.as_bytes()).expect("tid"); hts.fetch((tid, UNMAPPED_START as i64, UNMAPPED_END as i64)).expect("fetch"); - let mut hts_mapped = 0usize; - let mut hts_unmapped = 0usize; + let mut hts_total = 0usize; let mut record = bam::Record::new(); while hts.read(&mut record) == Some(Ok(())) { - if record.flags() & 0x4 != 0 { - hts_unmapped += 1; - } else { - hts_mapped += 1; - } + hts_total += 1; } - assert!(hts_unmapped > 0, "test region must contain placed-unmapped reads"); - - let seqair_default = fetch_count(false); - assert_eq!( - seqair_default, hts_mapped, - "default keep_unmapped=false should match htslib mapped-only count" - ); + let seqair_total = fetch_count(false); + assert_eq!(seqair_total, hts_total, "default includes all reads matching htslib full fetch"); } #[test] -fn keep_unmapped_true_includes_placed_unmapped_reads() { +fn filter_raw_can_reject_unmapped_reads() { let bam_path = test_bam_path(); let mut hts = bam::IndexedReader::from_path(bam_path).expect("htslib open"); let tid = hts.header().tid(UNMAPPED_REGION.as_bytes()).expect("tid"); hts.fetch((tid, UNMAPPED_START as i64, UNMAPPED_END as i64)).expect("fetch"); - let mut hts_total = 0usize; + let mut hts_mapped = 0usize; + let mut hts_unmapped = 0usize; let mut record = bam::Record::new(); while hts.read(&mut record) == Some(Ok(())) { - hts_total += 1; + if record.flags() & 0x4 != 0 { + hts_unmapped += 1; + } else { + hts_mapped += 1; + } } - - let seqair_with_unmapped = fetch_count(true); - assert_eq!( - seqair_with_unmapped, hts_total, - "keep_unmapped=true should match htslib's full fetch count" - ); - - // And it should be strictly more than the default — proves the flag flips behavior. - let seqair_default = fetch_count(false); - assert!( - seqair_with_unmapped > seqair_default, - "keep_unmapped=true ({seqair_with_unmapped}) must yield more than default ({seqair_default})" - ); + assert!(hts_unmapped > 0); + let seqair_reject = fetch_count(true); + assert_eq!(seqair_reject, hts_mapped, "filter_raw reject matches htslib mapped-only count"); + assert!(fetch_count(false) > seqair_reject, "unfiltered includes more than filtered"); } #[test] -fn keep_unmapped_propagates_through_fork() { +fn fork_produces_identical_results() { let bam_path = test_bam_path(); - let parent = seqair::bam::IndexedBamReader::open(bam_path).expect("open").keep_unmapped(true); - let child = parent.fork().expect("fork"); - assert!(child.keeps_unmapped(), "fork must inherit keep_unmapped=true"); - - let parent_off = seqair::bam::IndexedBamReader::open(bam_path).expect("open"); - let child_off = parent_off.fork().expect("fork"); - assert!(!child_off.keeps_unmapped(), "fork must inherit keep_unmapped=false"); + let mut parent = seqair::bam::IndexedBamReader::open(bam_path).expect("open"); + let mut child = parent.fork().expect("fork"); + let tid = parent.header().tid(UNMAPPED_REGION).expect("tid"); + let mut ps = seqair::bam::RecordStore::new(); + let mut cs = seqair::bam::RecordStore::new(); + parent + .fetch_into( + tid, + Pos0::new(UNMAPPED_START as u32).unwrap(), + Pos0::new(UNMAPPED_END as u32).unwrap(), + &mut ps, + ) + .expect("parent"); + child + .fetch_into( + tid, + Pos0::new(UNMAPPED_START as u32).unwrap(), + Pos0::new(UNMAPPED_END as u32).unwrap(), + &mut cs, + ) + .expect("child"); + assert_eq!(ps.len(), cs.len(), "fork produces identical results"); } // r[verify bam.record.decode] diff --git a/crates/seqair/tests/compare_sam_with_htslib.rs b/crates/seqair/tests/compare_sam_with_htslib.rs index 4c169b4f..bec89d99 100644 --- a/crates/seqair/tests/compare_sam_with_htslib.rs +++ b/crates/seqair/tests/compare_sam_with_htslib.rs @@ -14,7 +14,7 @@ reason = "test code with known small values" )] use rust_htslib::bam::{self, FetchDefinition, Read as _}; -use seqair::bam::{Pos0, RecordStore}; +use seqair::bam::{Pos0, RecordStore, RejectUnmapped}; use seqair::sam::reader::IndexedSamReader; use seqair_types::BaseQuality; use std::path::Path; @@ -115,12 +115,14 @@ fn sam_record_count_matches_htslib() { let tid = reader.header().tid(contig).expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("rio fetch"); assert_eq!( @@ -149,12 +151,14 @@ fn sam_record_fields_match_htslib() { let tid = reader.header().tid(contig).expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into( + .fetch_into_customized( tid, Pos0::new(start as u32).unwrap(), Pos0::new(end as u32).unwrap(), &mut store, + &mut RejectUnmapped, ) + .map(|c| c.kept) .expect("rio fetch"); for (i, h) in hts.iter().enumerate() { @@ -197,7 +201,14 @@ fn sam_aux_tags_present() { let tid = reader.header().tid("chr19").expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(6_105_700).unwrap(), Pos0::new(6_105_800).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(6_105_700).unwrap(), + Pos0::new(6_105_800).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .expect("fetch"); // Every record in the test data should have aux tags (at least RG) @@ -227,7 +238,14 @@ fn sam_aux_rg_tag_matches_htslib() { let tid = reader.header().tid("chr19").expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(6_105_700).unwrap(), Pos0::new(6_105_800).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(6_105_700).unwrap(), + Pos0::new(6_105_800).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .expect("fetch"); assert_eq!(store.len(), hts.len()); diff --git a/crates/seqair/tests/htslib_edge_cases.rs b/crates/seqair/tests/htslib_edge_cases.rs index 4a396579..69151072 100644 --- a/crates/seqair/tests/htslib_edge_cases.rs +++ b/crates/seqair/tests/htslib_edge_cases.rs @@ -17,7 +17,7 @@ use noodles::bam; use noodles::sam; -use seqair::bam::{Pos0, RecordStore}; +use seqair::bam::{Pos0, RecordStore, RejectUnmapped}; use seqair::reader::IndexedReader; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; @@ -97,7 +97,14 @@ fn dos_line_endings_in_sam() { let tid = reader.header().tid("chr1").expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(1000).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(0).unwrap(), + Pos0::new(1000).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .expect("fetch"); assert_eq!(store.len(), 2, "should parse 2 records from DOS SAM"); @@ -121,7 +128,14 @@ fn dos_line_endings_via_bam() { let tid = reader.header().tid("CHROMOSOME_I").expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(1_009_800).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(0).unwrap(), + Pos0::new(1_009_800).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .expect("fetch"); assert!(!store.is_empty(), "should have records on CHROMOSOME_I"); @@ -151,7 +165,14 @@ fn dos_line_endings_via_bam() { if let Some(tid) = reader.header().tid(name) { store.clear(); reader - .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(5000).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(0).unwrap(), + Pos0::new(5000).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .unwrap(); total += store.len(); } @@ -182,7 +203,14 @@ fn colons_in_contig_names() { let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(len).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(0).unwrap(), + Pos0::new(len).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .unwrap_or_else(|e| panic!("fetch {contig}: {e}")); let qnames: Vec<&[u8]> = (0..store.len() as u32).map(|i| store.qname(i)).collect(); @@ -212,7 +240,14 @@ fn sequence_less_mapped_reads() { let tid = reader.header().tid("c1").expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(10).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(0).unwrap(), + Pos0::new(10).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .expect("fetch"); assert_eq!(store.len(), noodles_records.len(), "mapped record count mismatch"); @@ -261,7 +296,14 @@ fn seq_qual_presence_combos() { let tid = reader.header().tid("c1").expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(10).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(0).unwrap(), + Pos0::new(10).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .expect("fetch"); assert_eq!(store.len(), noodles_mapped.len(), "mapped record count mismatch"); @@ -330,7 +372,14 @@ fn supplementary_alignments_included() { let tid = reader.header().tid("CHROMOSOME_I").expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(1_009_800).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(0).unwrap(), + Pos0::new(1_009_800).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .expect("fetch"); // Check that supplementary alignments (flag 2048) are present @@ -360,7 +409,14 @@ fn secondary_alignment_without_sequence() { let tid = reader.header().tid("CHROMOSOME_V").expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(5000).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(0).unwrap(), + Pos0::new(5000).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .expect("fetch"); // Find the secondary alignment (flag 256) @@ -387,7 +443,14 @@ fn padding_cigar_operations() { let tid = reader.header().tid("c1").expect("tid"); let mut store = RecordStore::new(); reader - .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(10).unwrap(), &mut store) + .fetch_into_customized( + tid, + Pos0::new(0).unwrap(), + Pos0::new(10).unwrap(), + &mut store, + &mut RejectUnmapped, + ) + .map(|c| c.kept) .expect("fetch"); // Verify against noodles diff --git a/crates/seqair/tests/pileup_vs_samtools.rs b/crates/seqair/tests/pileup_vs_samtools.rs index af066332..62b6a499 100644 --- a/crates/seqair/tests/pileup_vs_samtools.rs +++ b/crates/seqair/tests/pileup_vs_samtools.rs @@ -112,15 +112,14 @@ fn seqair_pileup(bam_path: &Path, contig: &str, start: u32, end: u32) -> Vec n_del += 1, - // ComplexIndel counts as both a deletion and an insertion - // (htslib: is_del=true + indel>0). N→I also sets is_refskip. seqair::bam::PileupOp::ComplexIndel { is_refskip, .. } => { n_del += 1; n_ins += 1; @@ -128,8 +127,6 @@ fn seqair_pileup(bam_path: &Path, contig: &str, start: u32, end: u32) -> Vec { n_del += 1; n_refskip += 1; @@ -139,13 +136,14 @@ fn seqair_pileup(bam_path: &Path, contig: &str, start: u32, end: u32) -> Vec 0); assert!(matches!(readers.alignment(), &IndexedReader::Bam(_))); } @@ -219,7 +220,7 @@ fn readers_open_bam() { fn readers_open_sam() { let dir = tempfile::tempdir().unwrap(); let sam_gz = create_sam_gz(dir.path()); - let readers = Readers::open(&sam_gz, test_fasta_path()).unwrap(); + let readers = Readers::open_customized(&sam_gz, test_fasta_path(), RejectUnmapped).unwrap(); assert!(matches!(readers.alignment(), &IndexedReader::Sam(_))); } @@ -228,7 +229,8 @@ fn readers_open_sam() { // r[verify unified.fork_cram] #[test] fn readers_open_cram() { - let readers = Readers::open(test_cram_path(), test_fasta_path()).unwrap(); + let readers = + Readers::open_customized(test_cram_path(), test_fasta_path(), RejectUnmapped).unwrap(); assert!(matches!(readers.alignment(), &IndexedReader::Cram(_))); assert!(readers.header().target_count() > 0); } @@ -236,7 +238,8 @@ fn readers_open_cram() { // r[verify unified.readers_fork] #[test] fn readers_fork_cram() { - let readers = Readers::open(test_cram_path(), test_fasta_path()).unwrap(); + let readers = + Readers::open_customized(test_cram_path(), test_fasta_path(), RejectUnmapped).unwrap(); let mut forked = readers.fork().unwrap(); let tid = forked.header().tid("chr19").unwrap(); let mut store = RecordStore::new(); @@ -249,8 +252,10 @@ fn readers_fork_cram() { // r[verify unified.fetch_equivalence] #[test] fn bam_and_cram_produce_same_records() { - let mut bam = Readers::open(test_bam_path(), test_fasta_path()).unwrap(); - let mut cram = Readers::open(test_cram_path(), test_fasta_path()).unwrap(); + let mut bam = + Readers::open_customized(test_bam_path(), test_fasta_path(), RejectUnmapped).unwrap(); + let mut cram = + Readers::open_customized(test_cram_path(), test_fasta_path(), RejectUnmapped).unwrap(); for &(contig, start, end) in CONTIGS { let bam_tid = bam.header().tid(contig).unwrap(); @@ -304,9 +309,11 @@ fn all_three_formats_produce_same_records() { let dir = tempfile::tempdir().unwrap(); let sam_gz = create_sam_gz(dir.path()); - let mut bam = Readers::open(test_bam_path(), test_fasta_path()).unwrap(); - let mut sam = Readers::open(&sam_gz, test_fasta_path()).unwrap(); - let mut cram = Readers::open(test_cram_path(), test_fasta_path()).unwrap(); + let mut bam = + Readers::open_customized(test_bam_path(), test_fasta_path(), RejectUnmapped).unwrap(); + let mut sam = Readers::open_customized(&sam_gz, test_fasta_path(), RejectUnmapped).unwrap(); + let mut cram = + Readers::open_customized(test_cram_path(), test_fasta_path(), RejectUnmapped).unwrap(); let contig = "chr19"; let start = 6_105_700u64; diff --git a/docs/spec/2-bam-1-reader.md b/docs/spec/2-bam-1-reader.md index a8167351..9c3d88a7 100644 --- a/docs/spec/2-bam-1-reader.md +++ b/docs/spec/2-bam-1-reader.md @@ -30,10 +30,8 @@ Records in the store MUST be in coordinate-sorted order (by pos, then by end_pos ## Edge cases -r[bam.reader.unmapped_skipped+2] -Unmapped reads (flag 0x4) MUST be excluded from the store during `fetch_into` by default. htslib's pileup engine rejects them in `bam_plp_push`, so including them by default would inflate depth counts. - -The reader MUST expose a `keep_unmapped(bool)` builder-style setter (and a `keeps_unmapped()` inspector) so downstream tools that need htslib-`view`-equivalent semantics can opt into receiving placed-unmapped reads (flag 0x4 with a valid tid). When `keep_unmapped` is `true`, placed-unmapped reads MUST flow through the customize layer (`filter_raw` / `filter`) like any other record, letting the caller decide. The default MUST remain `false` to preserve the legacy pileup-correct behavior. `fork()` MUST propagate the flag so each forked reader observes the same setting. +r[bam.reader.unmapped_skipped] +Unmapped reads (flag 0x4) MUST flow through to the customize layer (`filter_raw` / `filter`) like any other record. The reader MUST NOT silently drop them — `filter_raw` is the single gate. The pileup engine (`r[pileup.unmapped_excluded]`) is responsible for excluding unmapped reads from pileup columns. Callers that want the legacy htslib-pileup behavior (no unmapped reads in store) can reject them in `filter_raw`. r[bam.reader.secondary_supplementary_included+2] Secondary (0x100) and supplementary (0x800) reads MUST be included in the store by default. The caller's pileup filter is responsible for excluding them if desired, matching htslib's behavior. diff --git a/docs/spec/2-cram-1-reader.md b/docs/spec/2-cram-1-reader.md index bebb16f1..70e4a3e7 100644 --- a/docs/spec/2-cram-1-reader.md +++ b/docs/spec/2-cram-1-reader.md @@ -493,7 +493,7 @@ r[cram.index.multi_ref_slices] Multi-ref slices produce multiple index entries (one per reference they span). A query for one reference may hit a multi-ref slice that also contains records from other references. The reader MUST filter records by reference ID after decoding. r[cram.index.unmapped] -Unmapped records (ref ID = -1) have start=0, span=0. They are only returned if explicitly queried. For `fetch_into`, unmapped records MUST be skipped (same as BAM). +Unmapped records (ref ID = -1) have start=0, span=0. They are only returned if explicitly queried. For `fetch_into`, unmapped records MUST flow through to the `filter_raw` customizer; the pileup engine is responsible for excluding them per `r[pileup.unmapped_excluded]`. r[cram.index.crai_per_slice] A CRAI entry's `slice_offset` field is the same value the container header records as the slice's `landmark` (byte offset from the start of the container data block to the slice header). When iterating containers during fetch, the reader SHOULD use the CRAI-listed slice_offsets to skip landmarks for slices the index says do not overlap the query. This is a strict optimisation — `decode_slice` already filters records by overlap, so omitting the per-slice filter is correct but wastes work in multi-slice and multi-ref containers. Single-slice / single-ref containers behave identically either way. diff --git a/docs/spec/2-sam-1-reader.md b/docs/spec/2-sam-1-reader.md index 62f5a319..6c7da358 100644 --- a/docs/spec/2-sam-1-reader.md +++ b/docs/spec/2-sam-1-reader.md @@ -60,7 +60,7 @@ The full header text (all `@` lines joined with newlines) MUST be stored in `Bam > The parser MUST validate each field and produce clear error messages for malformed records: non-integer POS, FLAG > 65535, negative MAPQ, etc. Real-world SAM files from non-standard tools may contain hand-edited or corrupted records. r[sam.record.coordinate_conversion] -SAM POS is 1-based; the internal representation is 0-based (matching BAM). The parser MUST subtract 1 from POS. A POS of 0 in SAM means unmapped — after subtraction this becomes -1 (as i64). Unmapped records are filtered by FLAG 0x4 before reaching this point, so -1 positions should not appear in the RecordStore. +SAM POS is 1-based; the internal representation is 0-based (matching BAM). The parser MUST subtract 1 from POS. A POS of 0 in SAM means unmapped — after subtraction this becomes -1 (as i64). All records (including unmapped) flow through to the `filter_raw` customizer; the pileup engine is responsible for excluding unmapped reads per `r[pileup.unmapped_excluded]`. r[sam.record.cigar_parse] The CIGAR string MUST be parsed into a `Vec`. Each `CigarOp` wraps the BAM packed `u32` (`(length << 4) | op_code`), with op_codes M=0, I=1, D=2, N=3, S=4, H=5, P=6, =7, X=8. Unknown op characters MUST produce a typed parse error (this is stricter than BAM ingest, which tolerates reserved op codes via `CigarOpType::Unknown` per `r[io.typed_cigar_ops]`, because in SAM the lexical character is the only signal). The string `*` means CIGAR unavailable (0 operations, and end_pos = pos). @@ -108,8 +108,7 @@ SAM aux integer values (type `i`) are serialized into the smallest BAM integer t > 4. Split text into lines (on `\n`). > 5. Parse each alignment line. > 6. Filter: skip lines where RNAME's tid doesn't match, or where the record doesn't overlap `[start, end]`. -> 7. Skip unmapped reads (FLAG 0x4). -> 8. Push passing records into the RecordStore. +> 7. Push passing records into the RecordStore. r[sam.reader.overlap_filter+2] The overlap filter uses half-open intervals (0-based). `end_pos` MUST be computed from the parsed CIGAR. When CIGAR is `*` (unavailable), `end_pos = pos` (point record). @@ -151,7 +150,7 @@ r[sam.edge.empty_lines] Blank lines (empty or whitespace-only) between alignment records MUST be silently skipped. r[sam.edge.rname_star] -RNAME of `*` means unmapped. Combined with FLAG 0x4, these records MUST be filtered out by `fetch_into` (same as BAM's unmapped filtering). RNAME `*` without FLAG 0x4 is malformed but SHOULD be treated as unmapped. +RNAME of `*` means unmapped. These records flow through to the `filter_raw` customizer; the pileup engine is responsible for excluding them per `r[pileup.unmapped_excluded]`. RNAME `*` without FLAG 0x4 is malformed but SHOULD be treated as unmapped. r[sam.edge.line_spanning_blocks] A single SAM alignment line may span multiple BGZF blocks. The reader MUST buffer partial lines across block boundaries and only parse complete lines (terminated by `\n`). The parser MUST also strip `\r` at line boundaries to handle `\r\n` (Windows-style) line endings that may appear in files from mixed-platform pipelines.