Skip to content

Commit c6f60c7

Browse files
committed
fix hidive fetch
1 parent c34fe0b commit c6f60c7

File tree

2 files changed

+91
-16
lines changed

2 files changed

+91
-16
lines changed

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/skydive/src/stage.rs

Lines changed: 89 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use anyhow::Result;
33
use linked_hash_set::LinkedHashSet;
44
use parquet::data_type::AsBytes;
55
use rust_htslib::bam::record::Aux;
6+
use rust_htslib::htslib::qaddr_t;
67

78
// Import various standard library collections.
89
use std::collections::{HashMap, HashSet};
@@ -173,10 +174,15 @@ pub fn extract_aligned_bam_reads(
173174
let rg_sm_map = get_rg_to_sm_mapping(bam);
174175

175176
let mut bmap = HashMap::new();
177+
let mut read_spans: HashMap<String, (u64, u64)> = HashMap::new(); // Track read alignment spans
178+
let mut read_coordinates: HashMap<String, (u64, u64)> = HashMap::new(); // Track read coordinates
179+
let mut read_sequence_dict: HashMap<String, String> = HashMap::new();
176180

177181
let _ = bam.fetch(((*chr).as_bytes(), *start, *stop));
182+
178183
for p in bam.pileup() {
179184
let pileup = p.unwrap();
185+
let mut readnames = HashSet::new();
180186

181187
if *start <= (pileup.pos() as u64) && (pileup.pos() as u64) < *stop {
182188
for (i, alignment) in pileup.alignments().enumerate().filter(|(_, a)| {
@@ -190,12 +196,28 @@ pub fn extract_aligned_bam_reads(
190196
})
191197
.unwrap_or(false)
192198
}) {
199+
200+
193201
let qname = String::from_utf8_lossy(alignment.record().qname()).into_owned();
202+
203+
// skip the alignment from the same read
204+
if readnames.contains(&qname) {
205+
continue;
206+
}
207+
readnames.insert(qname.clone());
208+
209+
194210
let sm = match get_sm_name_from_rg(&alignment.record(), &rg_sm_map) {
195211
Ok(a) => a,
196212
Err(_) => String::from("unknown"),
197213
};
198214

215+
let read_seq = String::from_utf8_lossy(&alignment.record().seq().as_bytes()).into_owned();
216+
217+
if !read_sequence_dict.contains_key(&qname) {
218+
read_sequence_dict.insert(qname.clone(), read_seq);
219+
}
220+
199221
let is_secondary = alignment.record().is_secondary();
200222
let is_supplementary = alignment.record().is_supplementary();
201223
let seq_name = format!("{qname}|{name}|{sm}|{i}|{is_secondary}|{is_supplementary}");
@@ -204,30 +226,83 @@ pub fn extract_aligned_bam_reads(
204226
bmap.insert(seq_name.clone(), (String::new(), Vec::new()));
205227
}
206228

207-
if !alignment.is_del() && !alignment.is_refskip() {
208-
let a = alignment.record().seq()[alignment.qpos().unwrap()];
209-
let q = alignment.record().qual()[alignment.qpos().unwrap()];
210-
211-
bmap.get_mut(&seq_name).unwrap().0.push(a as char);
212-
bmap.get_mut(&seq_name).unwrap().1.push(q + 33 as u8);
213-
}
229+
// Handle different alignment types
230+
match alignment.indel() {
231+
bam::pileup::Indel::Ins(len) => {
232+
233+
// For insertions, add one reference base followed by the insertion bases
234+
if let Some(pos1) = alignment.qpos() {
235+
// Then add the insertion bases
236+
let pos2 = pos1 + (len as usize) + 1;
237+
for pos in pos1..pos2 {
238+
let a = alignment.record().seq()[pos];
239+
let q = alignment.record().qual()[pos];
240+
let valid_q = q.min(40);
241+
242+
// Track coordinates in the reconstructed sequence
243+
let read_start = read_coordinates.entry(seq_name.clone()).or_insert((u64::MAX, u64::MIN));
244+
read_start.0 = read_start.0.min(pos as u64);
245+
read_start.1 = read_start.1.max(pos as u64);
246+
247+
bmap.get_mut(&seq_name).unwrap().0.push(a as char);
248+
bmap.get_mut(&seq_name).unwrap().1.push(valid_q + 33);
249+
}
250+
}
251+
}
252+
bam::pileup::Indel::Del(_) => {
253+
// For deletions, add the first base of the deletion
254+
if let Some(qpos) = alignment.qpos() {
255+
let a = alignment.record().seq()[qpos];
256+
let q = alignment.record().qual()[qpos];
257+
let valid_q = q.min(40);
258+
259+
260+
// Track coordinates in the reconstructed sequence
261+
let read_start = read_coordinates.entry(seq_name.clone()).or_insert((u64::MAX, u64::MIN));
262+
read_start.0 = read_start.0.min(qpos as u64);
263+
read_start.1 = read_start.1.max(qpos as u64);
214264

215-
if let bam::pileup::Indel::Ins(len) = alignment.indel() {
216-
if let Some(pos1) = alignment.qpos() {
217-
let pos2 = pos1 + (len as usize);
218-
for pos in pos1..pos2 {
219-
let a = alignment.record().seq()[pos];
220-
let q = alignment.record().qual()[pos];
265+
bmap.get_mut(&seq_name).unwrap().0.push(a as char);
266+
bmap.get_mut(&seq_name).unwrap().1.push(valid_q+33);
267+
}
268+
}
269+
bam::pileup::Indel::None => {
270+
// For matches/mismatches, add the base
271+
272+
if let Some(qpos) = alignment.qpos() {
273+
let a = alignment.record().seq()[qpos];
274+
let q = alignment.record().qual()[qpos];
275+
let valid_q = q.min(40);
276+
277+
// Track coordinates in the reconstructed sequence
278+
let read_start = read_coordinates.entry(seq_name.clone()).or_insert((u64::MAX, u64::MIN));
279+
read_start.0 = read_start.0.min(qpos as u64);
280+
read_start.1 = read_start.1.max(qpos as u64);
221281

222282
bmap.get_mut(&seq_name).unwrap().0.push(a as char);
223-
bmap.get_mut(&seq_name).unwrap().1.push(q + 33 as u8);
283+
bmap.get_mut(&seq_name).unwrap().1.push(valid_q + 33);
224284
}
285+
225286
}
226287
}
227288
}
228289
}
229290
}
230291

292+
//sanity check: ensure that the reconstructed sequences match the original read sequences
293+
for (seq_name, (start, end)) in read_coordinates.iter() {
294+
let qname = seq_name.split('|').next().unwrap();
295+
if let Some(original_seq) = read_sequence_dict.get(qname) {
296+
if let Some(reconstructed_seq) = bmap.get(seq_name) {
297+
let reconstructed_seq_str: String = reconstructed_seq.0.clone();
298+
let original_subseq = &original_seq[*start as usize..=*end as usize];
299+
if original_subseq != reconstructed_seq_str {
300+
crate::elog!("Warning: Reconstructed sequence does not match original for read {}, {}, {}", qname, original_subseq, reconstructed_seq_str);
301+
}
302+
}
303+
}
304+
}
305+
231306
let records = bmap
232307
.iter()
233308
.map(|kv| fastq::Record::with_attrs(kv.0.as_str(), None, kv.1.0.as_bytes(), kv.1.1.as_bytes()))

0 commit comments

Comments
 (0)