@@ -3,6 +3,7 @@ use anyhow::Result;
33use linked_hash_set:: LinkedHashSet ;
44use parquet:: data_type:: AsBytes ;
55use rust_htslib:: bam:: record:: Aux ;
6+ use rust_htslib:: htslib:: qaddr_t;
67
78// Import various standard library collections.
89use std:: collections:: { HashMap , HashSet } ;
@@ -173,10 +174,15 @@ pub fn extract_aligned_bam_reads(
173174 let rg_sm_map = get_rg_to_sm_mapping ( bam) ;
174175
175176 let mut bmap = HashMap :: new ( ) ;
177+ let mut read_spans: HashMap < String , ( u64 , u64 ) > = HashMap :: new ( ) ; // Track read alignment spans
178+ let mut read_coordinates: HashMap < String , ( u64 , u64 ) > = HashMap :: new ( ) ; // Track read coordinates
179+ let mut read_sequence_dict: HashMap < String , String > = HashMap :: new ( ) ;
176180
177181 let _ = bam. fetch ( ( ( * chr) . as_bytes ( ) , * start, * stop) ) ;
182+
178183 for p in bam. pileup ( ) {
179184 let pileup = p. unwrap ( ) ;
185+ let mut readnames = HashSet :: new ( ) ;
180186
181187 if * start <= ( pileup. pos ( ) as u64 ) && ( pileup. pos ( ) as u64 ) < * stop {
182188 for ( i, alignment) in pileup. alignments ( ) . enumerate ( ) . filter ( |( _, a) | {
@@ -190,12 +196,28 @@ pub fn extract_aligned_bam_reads(
190196 } )
191197 . unwrap_or ( false )
192198 } ) {
199+
200+
193201 let qname = String :: from_utf8_lossy ( alignment. record ( ) . qname ( ) ) . into_owned ( ) ;
202+
203+ // skip the alignment from the same read
204+ if readnames. contains ( & qname) {
205+ continue ;
206+ }
207+ readnames. insert ( qname. clone ( ) ) ;
208+
209+
194210 let sm = match get_sm_name_from_rg ( & alignment. record ( ) , & rg_sm_map) {
195211 Ok ( a) => a,
196212 Err ( _) => String :: from ( "unknown" ) ,
197213 } ;
198214
215+ let read_seq = String :: from_utf8_lossy ( & alignment. record ( ) . seq ( ) . as_bytes ( ) ) . into_owned ( ) ;
216+
217+ if !read_sequence_dict. contains_key ( & qname) {
218+ read_sequence_dict. insert ( qname. clone ( ) , read_seq) ;
219+ }
220+
199221 let is_secondary = alignment. record ( ) . is_secondary ( ) ;
200222 let is_supplementary = alignment. record ( ) . is_supplementary ( ) ;
201223 let seq_name = format ! ( "{qname}|{name}|{sm}|{i}|{is_secondary}|{is_supplementary}" ) ;
@@ -204,30 +226,83 @@ pub fn extract_aligned_bam_reads(
204226 bmap. insert ( seq_name. clone ( ) , ( String :: new ( ) , Vec :: new ( ) ) ) ;
205227 }
206228
207- if !alignment. is_del ( ) && !alignment. is_refskip ( ) {
208- let a = alignment. record ( ) . seq ( ) [ alignment. qpos ( ) . unwrap ( ) ] ;
209- let q = alignment. record ( ) . qual ( ) [ alignment. qpos ( ) . unwrap ( ) ] ;
210-
211- bmap. get_mut ( & seq_name) . unwrap ( ) . 0 . push ( a as char ) ;
212- bmap. get_mut ( & seq_name) . unwrap ( ) . 1 . push ( q + 33 as u8 ) ;
213- }
229+ // Handle different alignment types
230+ match alignment. indel ( ) {
231+ bam:: pileup:: Indel :: Ins ( len) => {
232+
233+ // For insertions, add one reference base followed by the insertion bases
234+ if let Some ( pos1) = alignment. qpos ( ) {
235+ // Then add the insertion bases
236+ let pos2 = pos1 + ( len as usize ) + 1 ;
237+ for pos in pos1..pos2 {
238+ let a = alignment. record ( ) . seq ( ) [ pos] ;
239+ let q = alignment. record ( ) . qual ( ) [ pos] ;
240+ let valid_q = q. min ( 40 ) ;
241+
242+ // Track coordinates in the reconstructed sequence
243+ let read_start = read_coordinates. entry ( seq_name. clone ( ) ) . or_insert ( ( u64:: MAX , u64:: MIN ) ) ;
244+ read_start. 0 = read_start. 0 . min ( pos as u64 ) ;
245+ read_start. 1 = read_start. 1 . max ( pos as u64 ) ;
246+
247+ bmap. get_mut ( & seq_name) . unwrap ( ) . 0 . push ( a as char ) ;
248+ bmap. get_mut ( & seq_name) . unwrap ( ) . 1 . push ( valid_q + 33 ) ;
249+ }
250+ }
251+ }
252+ bam:: pileup:: Indel :: Del ( _) => {
253+ // For deletions, add the first base of the deletion
254+ if let Some ( qpos) = alignment. qpos ( ) {
255+ let a = alignment. record ( ) . seq ( ) [ qpos] ;
256+ let q = alignment. record ( ) . qual ( ) [ qpos] ;
257+ let valid_q = q. min ( 40 ) ;
258+
259+
260+ // Track coordinates in the reconstructed sequence
261+ let read_start = read_coordinates. entry ( seq_name. clone ( ) ) . or_insert ( ( u64:: MAX , u64:: MIN ) ) ;
262+ read_start. 0 = read_start. 0 . min ( qpos as u64 ) ;
263+ read_start. 1 = read_start. 1 . max ( qpos as u64 ) ;
214264
215- if let bam:: pileup:: Indel :: Ins ( len) = alignment. indel ( ) {
216- if let Some ( pos1) = alignment. qpos ( ) {
217- let pos2 = pos1 + ( len as usize ) ;
218- for pos in pos1..pos2 {
219- let a = alignment. record ( ) . seq ( ) [ pos] ;
220- let q = alignment. record ( ) . qual ( ) [ pos] ;
265+ bmap. get_mut ( & seq_name) . unwrap ( ) . 0 . push ( a as char ) ;
266+ bmap. get_mut ( & seq_name) . unwrap ( ) . 1 . push ( valid_q+33 ) ;
267+ }
268+ }
269+ bam:: pileup:: Indel :: None => {
270+ // For matches/mismatches, add the base
271+
272+ if let Some ( qpos) = alignment. qpos ( ) {
273+ let a = alignment. record ( ) . seq ( ) [ qpos] ;
274+ let q = alignment. record ( ) . qual ( ) [ qpos] ;
275+ let valid_q = q. min ( 40 ) ;
276+
277+ // Track coordinates in the reconstructed sequence
278+ let read_start = read_coordinates. entry ( seq_name. clone ( ) ) . or_insert ( ( u64:: MAX , u64:: MIN ) ) ;
279+ read_start. 0 = read_start. 0 . min ( qpos as u64 ) ;
280+ read_start. 1 = read_start. 1 . max ( qpos as u64 ) ;
221281
222282 bmap. get_mut ( & seq_name) . unwrap ( ) . 0 . push ( a as char ) ;
223- bmap. get_mut ( & seq_name) . unwrap ( ) . 1 . push ( q + 33 as u8 ) ;
283+ bmap. get_mut ( & seq_name) . unwrap ( ) . 1 . push ( valid_q + 33 ) ;
224284 }
285+
225286 }
226287 }
227288 }
228289 }
229290 }
230291
292+ //sanity check: ensure that the reconstructed sequences match the original read sequences
293+ for ( seq_name, ( start, end) ) in read_coordinates. iter ( ) {
294+ let qname = seq_name. split ( '|' ) . next ( ) . unwrap ( ) ;
295+ if let Some ( original_seq) = read_sequence_dict. get ( qname) {
296+ if let Some ( reconstructed_seq) = bmap. get ( seq_name) {
297+ let reconstructed_seq_str: String = reconstructed_seq. 0 . clone ( ) ;
298+ let original_subseq = & original_seq[ * start as usize ..=* end as usize ] ;
299+ if original_subseq != reconstructed_seq_str {
300+ crate :: elog!( "Warning: Reconstructed sequence does not match original for read {}, {}, {}" , qname, original_subseq, reconstructed_seq_str) ;
301+ }
302+ }
303+ }
304+ }
305+
231306 let records = bmap
232307 . iter ( )
233308 . map ( |kv| fastq:: Record :: with_attrs ( kv. 0 . as_str ( ) , None , kv. 1 . 0 . as_bytes ( ) , kv. 1 . 1 . as_bytes ( ) ) )
0 commit comments