Skip to content

Commit 2185b35

Browse files
authored
Merge pull request #18 from pfizer-opensource/debug/default_value_error
Fixed bug that still introduced zeros instead of the default_value introduced in v0.1.5. Thanks @mukamel-lab for pointing this out.
2 parents 2871cb2 + 3af8136 commit 2185b35

19 files changed

+868
-201
lines changed

CHANGELOG.md

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99
## [Unreleased]
1010
...
1111

12+
## [v0.2.0]
13+
### Fixed
14+
- fixed a bug in the intervals to values cuda kernel that
15+
introduced zeros in places where there should be
16+
"default_value" (see release v0.1.5).
17+
### Added
18+
- custom_position_sampler argument to bigwig_loader.dataset.BigWigDataset
19+
and bigwig_loader.pytorch.PytorchBigWigDataset to optionally overwrite the
20+
default random sampling of genomic coordinates from "regions of interest."
21+
- custom_track_sampler argument to bigwig_loader.dataset.BigWigDataset
22+
and bigwig_loader.pytorch.PytorchBigWigDataset to optionally use a different
23+
track sampling strategy.
24+
1225
## [v0.1.5]
1326
### Added
1427
- set a default value different from 0.0
@@ -47,7 +60,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
4760
### Added
4861
- release to pypi
4962

50-
[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.5...HEAD
63+
[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.2.0...HEAD
64+
[v0.1.6]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.5...v0.2.0
5165
[v0.1.5]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...v0.1.5
5266
[v0.1.4]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.3...v0.1.4
5367
[v0.1.3]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.2...v0.1.3

bigwig_loader/dataset.py

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from pathlib import Path
33
from typing import Any
44
from typing import Callable
5+
from typing import Iterable
56
from typing import Iterator
67
from typing import Literal
78
from typing import Optional
@@ -78,6 +79,12 @@ class BigWigDataset:
7879
GPU. More threads means that more IO can take place while the GPU is busy doing
7980
calculations (decompressing or neural network training for example). More threads
8081
also means a higher GPU memory usage. Default: 4
82+
custom_position_sampler: if set, this sampler will be used instead of the default
83+
position sampler (which samples randomly and uniform from regions of interest)
84+
This should be an iterable of tuples (chromosome, center).
85+
custom_track_sampler: if specified, this sampler will be used to sample tracks. When not
86+
specified, each batch simply contains all tracks, or a randomly sellected subset of
87+
tracks in case sub_sample_tracks is set. Should be Iterable batches of track indices.
8188
return_batch_objects: if True, the batches will be returned as instances of
8289
bigwig_loader.batch.Batch
8390
"""
@@ -107,6 +114,8 @@ def __init__(
107114
repeat_same_positions: bool = False,
108115
sub_sample_tracks: Optional[int] = None,
109116
n_threads: int = 4,
117+
custom_position_sampler: Optional[Iterable[tuple[str, int]]] = None,
118+
custom_track_sampler: Optional[Iterable[list[int]]] = None,
110119
return_batch_objects: bool = False,
111120
):
112121
super().__init__()
@@ -152,32 +161,34 @@ def __init__(
152161
self._sub_sample_tracks = sub_sample_tracks
153162
self._n_threads = n_threads
154163
self._return_batch_objects = return_batch_objects
155-
156-
def _create_dataloader(self) -> StreamedDataloader:
157-
position_sampler = RandomPositionSampler(
164+
self._position_sampler = custom_position_sampler or RandomPositionSampler(
158165
regions_of_interest=self.regions_of_interest,
159166
buffer_size=self._position_sampler_buffer_size,
160167
repeat_same=self._repeat_same_positions,
161168
)
169+
if custom_track_sampler is not None:
170+
self._track_sampler: Optional[Iterable[list[int]]] = custom_track_sampler
171+
elif sub_sample_tracks is not None:
172+
self._track_sampler = TrackSampler(
173+
total_number_of_tracks=len(self.bigwig_collection),
174+
sample_size=sub_sample_tracks,
175+
)
176+
else:
177+
self._track_sampler = None
162178

179+
def _create_dataloader(self) -> StreamedDataloader:
163180
sequence_sampler = GenomicSequenceSampler(
164181
reference_genome_path=self.reference_genome_path,
165182
sequence_length=self.sequence_length,
166-
position_sampler=position_sampler,
183+
position_sampler=self._position_sampler,
167184
maximum_unknown_bases_fraction=self.maximum_unknown_bases_fraction,
168185
)
169-
track_sampler = None
170-
if self._sub_sample_tracks is not None:
171-
track_sampler = TrackSampler(
172-
total_number_of_tracks=len(self.bigwig_collection),
173-
sample_size=self._sub_sample_tracks,
174-
)
175186

176187
query_batch_generator = QueryBatchGenerator(
177188
genomic_location_sampler=sequence_sampler,
178189
center_bin_to_predict=self.center_bin_to_predict,
179190
batch_size=self.super_batch_size,
180-
track_sampler=track_sampler,
191+
track_sampler=self._track_sampler,
181192
)
182193

183194
return StreamedDataloader(

bigwig_loader/download_example_data.py

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,55 @@ def download_example_data() -> None:
1919

2020
def get_reference_genome(reference_genome_path: Path = config.reference_genome) -> Path:
2121
compressed_file = reference_genome_path.with_suffix(".fasta.gz")
22-
if reference_genome_path.exists():
23-
return reference_genome_path
24-
elif compressed_file.exists():
25-
# subprocess.run(["bgzip", "-d", compressed_file])
26-
unzip_gz_file(compressed_file, reference_genome_path)
27-
else:
28-
LOGGER.info("Need reference genome for tests. Downloading it from ENCODE.")
29-
url = "https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz"
30-
urllib.request.urlretrieve(url, compressed_file)
22+
if compressed_file.exists() and not reference_genome_path.exists():
3123
# subprocess.run(["bgzip", "-d", compressed_file])
3224
unzip_gz_file(compressed_file, reference_genome_path)
25+
26+
if (
27+
reference_genome_path.exists()
28+
and checksum_md5_for_path(reference_genome_path)
29+
!= config.reference_genome_checksum
30+
):
31+
LOGGER.info(
32+
f"Reference genome checksum mismatch, downloading again from {reference_genome_path}"
33+
)
34+
_download_genome(
35+
url=config.reference_genome_url,
36+
compressed_file_path=compressed_file,
37+
uncompressed_file_path=reference_genome_path,
38+
md5_checksum=config.reference_genome_checksum,
39+
)
40+
41+
if not reference_genome_path.exists():
42+
LOGGER.info(
43+
f"Reference genome not found, downloading from {config.reference_genome_url}"
44+
)
45+
_download_genome(
46+
url=config.reference_genome_url,
47+
compressed_file_path=compressed_file,
48+
uncompressed_file_path=reference_genome_path,
49+
md5_checksum=config.reference_genome_checksum,
50+
)
3351
return reference_genome_path
3452

3553

54+
def _download_genome(
55+
url: str,
56+
compressed_file_path: Path,
57+
uncompressed_file_path: Path,
58+
md5_checksum: str,
59+
) -> Path:
60+
urllib.request.urlretrieve(url, compressed_file_path)
61+
# subprocess.run(["bgzip", "-d", compressed_file])
62+
unzip_gz_file(compressed_file_path, uncompressed_file_path)
63+
this_checksum = checksum_md5_for_path(uncompressed_file_path)
64+
if this_checksum != md5_checksum:
65+
raise RuntimeError(
66+
f"{uncompressed_file_path} has incorrect checksum: {this_checksum} vs. {md5_checksum}"
67+
)
68+
return uncompressed_file_path
69+
70+
3671
def unzip_gz_file(compressed_file_path: Path, output_file_path: Path) -> Path:
3772
with gzip.open(compressed_file_path, "rb") as gz_file:
3873
with open(output_file_path, "wb") as output_file:
@@ -52,6 +87,13 @@ def unzip_gz_file(compressed_file_path: Path, output_file_path: Path) -> Path:
5287
}
5388

5489

90+
def checksum_md5_for_path(path: Path, chunk_size: int = 10 * 1024 * 1024) -> str:
91+
"""return the md5sum"""
92+
with path.open(mode="rb") as f:
93+
checksum = checksum_md5(f, chunk_size=chunk_size)
94+
return checksum
95+
96+
5597
def checksum_md5(f: BinaryIO, *, chunk_size: int = 10 * 1024 * 1024) -> str:
5698
"""return the md5sum"""
5799
m = hashlib.md5(b"", usedforsecurity=False)
@@ -68,7 +110,7 @@ def get_example_bigwigs_files(bigwig_dir: Path = config.bigwig_dir) -> Path:
68110
file = bigwig_dir / fn
69111
if not file.exists():
70112
urllib.request.urlretrieve(url, file)
71-
with file.open(mode="rb") as f:
72-
if checksum_md5(f) != md5:
73-
raise RuntimeError(f"{fn} has incorrect checksum!")
113+
checksum = checksum_md5_for_path(file)
114+
if checksum != md5:
115+
raise RuntimeError(f"{fn} has incorrect checksum: {checksum} vs. {md5}")
74116
return bigwig_dir

bigwig_loader/intervals_to_values.py

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
import math
3+
from math import isnan
34
from pathlib import Path
45

56
import cupy as cp
@@ -86,11 +87,15 @@ def intervals_to_values(
8687
)
8788

8889
if out is None:
90+
logging.debug(f"Creating new out tensor with default value {default_value}")
91+
8992
out = cp.full(
9093
(found_starts.shape[0], len(query_starts), sequence_length // window_size),
9194
default_value,
9295
dtype=cp.float32,
9396
)
97+
logging.debug(out)
98+
9499
else:
95100
logging.debug(f"Setting default value in output tensor to {default_value}")
96101
out.fill(default_value)
@@ -120,6 +125,7 @@ def intervals_to_values(
120125
array_start = cp.ascontiguousarray(array_start)
121126
array_end = cp.ascontiguousarray(array_end)
122127
array_value = cp.ascontiguousarray(array_value)
128+
default_value_isnan = isnan(default_value)
123129

124130
cuda_kernel(
125131
(grid_size,),
@@ -137,6 +143,8 @@ def intervals_to_values(
137143
sequence_length,
138144
max_number_intervals,
139145
window_size,
146+
cp.float32(default_value),
147+
default_value_isnan,
140148
out,
141149
),
142150
)
@@ -167,8 +175,10 @@ def kernel_in_python_with_window(
167175
int,
168176
int,
169177
int,
170-
cp.ndarray,
171178
int,
179+
float,
180+
bool,
181+
cp.ndarray,
172182
],
173183
) -> cp.ndarray:
174184
"""Equivalent in python to cuda_kernel_with_window. Just for debugging."""
@@ -186,6 +196,8 @@ def kernel_in_python_with_window(
186196
sequence_length,
187197
max_number_intervals,
188198
window_size,
199+
default_value,
200+
default_value_isnan,
189201
out,
190202
) = args
191203

@@ -214,7 +226,7 @@ def kernel_in_python_with_window(
214226
print("reduced_dim")
215227
print(reduced_dim)
216228

217-
out_vector = [0.0] * reduced_dim * batch_size * num_tracks
229+
out_vector = [default_value] * reduced_dim * batch_size * num_tracks
218230

219231
for thread in range(n_threads):
220232
batch_index = thread % batch_size
@@ -235,7 +247,8 @@ def kernel_in_python_with_window(
235247

236248
cursor = found_start_index
237249
window_index = 0
238-
summation = 0
250+
summation = 0.0
251+
valid_count = 0
239252

240253
# cursor moves through the rows of the bigwig file
241254
# window_index moves through the sequence
@@ -261,19 +274,31 @@ def kernel_in_python_with_window(
261274
print("start index", start_index)
262275

263276
if start_index >= window_end:
264-
print("CONTINUE")
265-
out_vector[i * reduced_dim + window_index] = summation / window_size
266-
summation = 0
277+
if default_value_isnan:
278+
if valid_count > 0:
279+
out_vector[i * reduced_dim + window_index] = (
280+
summation / valid_count
281+
)
282+
else:
283+
out_vector[i * reduced_dim + window_index] = default_value
284+
else:
285+
summation = summation + (window_size - valid_count) * default_value
286+
out_vector[i * reduced_dim + window_index] = summation / window_size
287+
summation = 0.0
288+
valid_count = 0
267289
window_index += 1
290+
print("CONTINUE")
268291
continue
269292

270293
number = min(window_end, end_index) - max(window_start, start_index)
271294

272-
print(
273-
f"Add {number} x {track_values[cursor]} = {number * track_values[cursor]} to summation"
274-
)
275-
summation += number * track_values[cursor]
276-
print(f"Summation = {summation}")
295+
if number > 0:
296+
print(
297+
f"Add {number} x {track_values[cursor]} = {number * track_values[cursor]} to summation"
298+
)
299+
summation += number * track_values[cursor]
300+
print(f"Summation = {summation}")
301+
valid_count += number
277302

278303
print("end_index", "window_end")
279304
print(end_index, window_end)
@@ -288,8 +313,19 @@ def kernel_in_python_with_window(
288313
print(
289314
"cursor + 1 >= found_end_index \t\t calculate average, reset summation and move to next window"
290315
)
291-
out_vector[i * reduced_dim + window_index] = summation / window_size
292-
summation = 0
316+
# out_vector[i * reduced_dim + window_index] = summation / window_size
317+
if default_value_isnan:
318+
if valid_count > 0:
319+
out_vector[i * reduced_dim + window_index] = (
320+
summation / valid_count
321+
)
322+
else:
323+
out_vector[i * reduced_dim + window_index] = default_value
324+
else:
325+
summation = summation + (window_size - valid_count) * default_value
326+
out_vector[i * reduced_dim + window_index] = summation / window_size
327+
summation = 0.0
328+
valid_count = 0
293329
window_index += 1
294330
# move cursor
295331
if end_index < window_end:

bigwig_loader/pytorch.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from pathlib import Path
22
from typing import Any
33
from typing import Callable
4+
from typing import Iterable
45
from typing import Iterator
56
from typing import Literal
67
from typing import Optional
@@ -165,6 +166,12 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
165166
also means a higher GPU memory usage. Default: 4
166167
return_batch_objects: if True, the batches will be returned as instances of
167168
bigwig_loader.pytorch.PytorchBatch
169+
custom_position_sampler: if set, this sampler will be used instead of the default
170+
position sampler (which samples randomly and uniform from regions of interest)
171+
This should be an iterable of tuples (chromosome, center).
172+
custom_track_sampler: if specified, this sampler will be used to sample tracks. When not
173+
specified, each batch simply contains all tracks, or a randomly sellected subset of
174+
tracks in case sub_sample_tracks is set. Should be Iterable batches of track indices.
168175
"""
169176

170177
def __init__(
@@ -192,6 +199,8 @@ def __init__(
192199
repeat_same_positions: bool = False,
193200
sub_sample_tracks: Optional[int] = None,
194201
n_threads: int = 4,
202+
custom_position_sampler: Optional[Iterable[tuple[str, int]]] = None,
203+
custom_track_sampler: Optional[Iterable[list[int]]] = None,
195204
return_batch_objects: bool = False,
196205
):
197206
super().__init__()
@@ -217,6 +226,8 @@ def __init__(
217226
repeat_same_positions=repeat_same_positions,
218227
sub_sample_tracks=sub_sample_tracks,
219228
n_threads=n_threads,
229+
custom_position_sampler=custom_position_sampler,
230+
custom_track_sampler=custom_track_sampler,
220231
return_batch_objects=True,
221232
)
222233
self._return_batch_objects = return_batch_objects

bigwig_loader/sampler/genome_sampler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from pathlib import Path
22
from typing import Any
33
from typing import Callable
4+
from typing import Iterable
45
from typing import Iterator
56
from typing import Literal
67
from typing import Optional
@@ -21,7 +22,7 @@ def __init__(
2122
self,
2223
reference_genome_path: Path,
2324
sequence_length: int,
24-
position_sampler: Iterator[tuple[str, int]],
25+
position_sampler: Iterable[tuple[str, int]],
2526
maximum_unknown_bases_fraction: float = 0.1,
2627
):
2728
self.reference_genome_path = reference_genome_path

0 commit comments

Comments
 (0)