Commit 2871cb2

Merge pull request #17 from pfizer-opensource/feature/zero_is_not_nan
Adding feature to use custom "default_value" as filler for intervals …
2 parents b026d84 + 499b766 commit 2871cb2

9 files changed: +112 -19 lines changed
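
In short: every position in a returned batch that no bigwig interval covers is now filled with a configurable default_value instead of a hard-coded 0.0. A minimal usage sketch (not part of the diff below; the directory path is a placeholder, and BigWigCollection is assumed to accept a directory of bigwig files, as elsewhere in this repo):

import cupy as cp
from bigwig_loader.collection import BigWigCollection

collection = BigWigCollection("/data/bigwig_tracks")  # placeholder path

# With default_value=cp.nan, "no interval here" becomes NaN in the output,
# distinguishable from a genuine 0.0 stored in the file.
batch = collection.get_batch(
    ["chr1", "chr2"],
    [0, 1_000_000],
    [1_000, 1_001_000],
    default_value=cp.nan,
)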

CHANGELOG.md

Lines changed: 7 additions & 2 deletions
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ...

+## [v0.1.5]
+### Added
+- set a default value different from 0.0
+
 ## [v0.1.4]
 ### Fixed
 - Updated README.md with better install instructions. Making release so
@@ -43,8 +47,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - release to pypi

-[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...HEAD
-[v0.1.3]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...v0.1.4
+[Unreleased]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.5...HEAD
+[v0.1.5]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.4...v0.1.5
+[v0.1.4]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.3...v0.1.4
 [v0.1.3]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.2...v0.1.3
 [v0.1.2]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.1...v0.1.2
 [v0.1.1]: https://github.com/pfizer-opensource/bigwig-loader/compare/v0.1.0...v0.1.1

bigwig_loader/batch_processor.py

Lines changed: 2 additions & 1 deletion
@@ -104,6 +104,7 @@ def get_batch(
         end: Union[Sequence[int], npt.NDArray[np.int64]],
         window_size: int = 1,
         scaling_factors_cupy: Optional[cp.ndarray] = None,
+        default_value: float = 0.0,
         out: Optional[cp.ndarray] = None,
     ) -> cp.ndarray:
         (
@@ -137,10 +138,10 @@ def get_batch(
             query_starts=abs_start,
             query_ends=abs_end,
             window_size=window_size,
+            default_value=default_value,
             out=out,
         )
         batch = cp.transpose(out, (1, 0, 2))
         if scaling_factors_cupy is not None:
             batch *= scaling_factors_cupy
-
         return batch
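
Worth noting: get_batch applies the scaling factors after the fill, which still behaves correctly for a NaN filler because NaN propagates through multiplication. A standalone check of that property:

import cupy as cp

out = cp.full((2, 4), cp.nan, dtype=cp.float32)  # buffer seeded with the filler
out[0] = 1.0                                     # pretend track 0 received real data
out *= 2.0                                       # stand-in for scaling_factors_cupy

print(out)  # track 0 scaled to 2.0; track 1 stays NaN (IEEE 754: nan * x == nan)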

bigwig_loader/collection.py

Lines changed: 4 additions & 1 deletion
@@ -140,6 +140,7 @@ def get_batch(
         start: Union[Sequence[int], npt.NDArray[np.int64]],
         end: Union[Sequence[int], npt.NDArray[np.int64]],
         window_size: int = 1,
+        default_value: float = 0.0,
         out: Optional[cp.ndarray] = None,
     ) -> cp.ndarray:
         return self.batch_processor.get_batch(
@@ -148,6 +149,7 @@ def get_batch(
             end=end,
             window_size=window_size,
             scaling_factors_cupy=self.scaling_factors_cupy,
+            default_value=default_value,
             out=out,
         )

@@ -171,7 +173,8 @@ def make_positions_global(

         """
         offsets = np.array(
-            [self.chromosome_offset_dict[chrom] for chrom in chromosomes]
+            [self.chromosome_offset_dict[chrom] for chrom in chromosomes],
+            dtype=np.int64,
         )
         return positions + offsets

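The dtype=np.int64 pin in make_positions_global is a small correctness fix riding along with the feature: np.array over a list of Python ints otherwise picks a platform-dependent integer width (int32 on Windows), which is too narrow for global genome coordinates. A rough illustration with a made-up offset dict:

import numpy as np

chromosome_offset_dict = {"chr1": 0, "chr2": 248_956_422}  # illustrative offsets
positions = np.array([10, 20], dtype=np.int64)

offsets = np.array(
    [chromosome_offset_dict[chrom] for chrom in ["chr2", "chr2"]],
    dtype=np.int64,  # explicit, so the addition below never runs in int32
)
print(positions + offsets)  # [248956432 248956442]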

bigwig_loader/dataset.py

Lines changed: 12 additions & 5 deletions
@@ -3,6 +3,7 @@
 from typing import Any
 from typing import Callable
 from typing import Iterator
+from typing import Literal
 from typing import Optional
 from typing import Sequence
 from typing import Union
@@ -38,16 +39,16 @@ class BigWigDataset:
         reference_genome_path: path to fasta file containing the reference genome.
         sequence_length: number of base pairs in input sequence
         center_bin_to_predict: if given, only do prediction on a central window. Should be
-            smaller than or equal to sequence_length. If not given will be the same as
-            sequence_length.
+            smaller than or equal to sequence_length. If None, the whole sequence length
+            will be used. Default: None
         window_size: used to down sample the resolution of the target from sequence_length
         moving_average_window_size: window size for moving average on the target. Can
             help too smooth out the target. Default: 1, which means no smoothing. If
             used in combination with window_size, the target is first downsampled and
             then smoothed.
         batch_size: batch size
         super_batch_size: batch size that is used in the background to load data from
-            bigwig files. Should be larget than or equal to batch_size. If None, it will
+            bigwig files. Should be larger than or equal to batch_size. If None, it will
             be equal to batch_size.
         batches_per_epoch: because the length of an epoch is slightly arbitrary here,
             the number of batches can be set by hand. If not the number of batches per
@@ -61,6 +62,9 @@ class BigWigDataset:
             If None, no scaling is done. Keys can be (partial) file paths. See
             bigwig_loader.path.match_key_to_path for more information about how
             dict keys are mapped to paths.
+        default_value: value to use for intervals that are not present in the
+            bigwig file. Defaults to 0.0. Can be set to cp.nan to differentiate
+            between missing values listed as 0.0.
         first_n_files: Only use the first n files (handy for debugging on less tasks)
         position_sampler_buffer_size: number of intervals picked up front by the position sampler.
             When all intervals are used, new intervals are picked.
@@ -73,7 +77,7 @@ class BigWigDataset:
         n_threads: number of python threads / cuda streams to use for loading the data to
             GPU. More threads means that more IO can take place while the GPU is busy doing
             calculations (decompressing or neural network training for example). More threads
-            also means a higher GPU memory usage.
+            also means a higher GPU memory usage. Default: 4
         return_batch_objects: if True, the batches will be returned as instances of
             bigwig_loader.batch.Batch
     """
@@ -92,11 +96,12 @@ def __init__(
         batches_per_epoch: Optional[int] = None,
         maximum_unknown_bases_fraction: float = 0.1,
         sequence_encoder: Optional[
-            Union[Callable[[Sequence[str]], Any], str]
+            Union[Callable[[Sequence[str]], Any], Literal["onehot"]]
         ] = "onehot",
         file_extensions: Sequence[str] = (".bigWig", ".bw"),
         crawl: bool = True,
         scale: Optional[dict[Union[str | Path], Any]] = None,
+        default_value: float = 0.0,
         first_n_files: Optional[int] = None,
         position_sampler_buffer_size: int = 100000,
         repeat_same_positions: bool = False,
@@ -139,6 +144,7 @@ def __init__(
         self._first_n_files = first_n_files
         self._file_extensions = file_extensions
         self._crawl = crawl
+        self._default_value = default_value
         self._scale = scale
         self._position_sampler_buffer_size = position_sampler_buffer_size
         self._repeat_same_positions = repeat_same_positions
@@ -181,6 +187,7 @@ def _create_dataloader(self) -> StreamedDataloader:
             queue_size=self._n_threads + 1,
             slice_size=self.batch_size,
             window_size=self.window_size,
+            default_value=self._default_value,
         )

     def __iter__(
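
With the keyword threaded through to the dataloader, the NaN filler can be requested at construction time. A hedged sketch (paths are placeholders, and the regions_of_interest table with chrom/start/end columns is an assumption based on the project README):

import cupy as cp
import pandas as pd
from bigwig_loader.dataset import BigWigDataset

# Hypothetical sampling intervals; column names are assumed.
train_regions = pd.DataFrame(
    {"chrom": ["chr1", "chr2"], "start": [0, 0], "end": [248_000_000, 242_000_000]}
)

dataset = BigWigDataset(
    regions_of_interest=train_regions,
    collection="/data/bigwig_tracks",        # placeholder path
    reference_genome_path="/data/hg38.fa",   # placeholder path
    sequence_length=1000,
    batch_size=256,
    default_value=cp.nan,                    # new in this commit; previously always 0.0
)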

bigwig_loader/intervals_to_values.py

Lines changed: 8 additions & 5 deletions
@@ -8,8 +8,6 @@

 CUDA_KERNEL_DIR = Path(__file__).parent.parent / "cuda_kernels"

-_zero = cp.asarray(0.0, dtype=cp.float32).item()
-

 def get_cuda_kernel() -> str:
     with open(CUDA_KERNEL_DIR / "intervals_to_values.cu") as f:
@@ -31,6 +29,7 @@ def intervals_to_values(
     found_ends: cp.ndarray | None = None,
     sizes: cp.ndarray | None = None,
     window_size: int = 1,
+    default_value: float = 0.0,
     out: cp.ndarray | None = None,
 ) -> cp.ndarray:
     """
@@ -58,6 +57,8 @@ def intervals_to_values(
         sizes: number of elements in track_starts/track_ends/track_values for each track.
             Only needed when found_starts and found_ends are not given.
         window_size: size in basepairs to average over (default: 1)
+        default_value: value to use for regions where no data is specified (default: 0.0)
+        out: array of size n_tracks x batch_size x sequence_length to store the output.
     Returns:
         out: array of size n_tracks x batch_size x sequence_length

@@ -85,12 +86,15 @@ def intervals_to_values(
     )

     if out is None:
-        out = cp.zeros(
+        out = cp.full(
             (found_starts.shape[0], len(query_starts), sequence_length // window_size),
+            default_value,
             dtype=cp.float32,
         )
     else:
-        out *= _zero
+        logging.debug(f"Setting default value in output tensor to {default_value}")
+        out.fill(default_value)
+        logging.debug(out)

     max_number_intervals = min(
         sequence_length, (found_ends - found_starts).max().item()
@@ -136,7 +140,6 @@ def intervals_to_values(
             out,
         ),
     )
-
     return out

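This is the heart of the change. The old reset of a reused buffer, out *= _zero, cannot express a non-zero filler and would not even clear a buffer that already contains NaN (nan * 0 is still nan); cp.full and ndarray.fill sidestep both problems. In isolation:

import cupy as cp

default_value = float("nan")

# Fresh allocation, as in the `if out is None` branch.
out = cp.full((1, 2, 8), default_value, dtype=cp.float32)

# Reuse, as in the `else` branch: refill a caller-provided buffer in place.
reserved = cp.zeros((1, 2, 8), dtype=cp.float32)
reserved.fill(default_value)

# Whatever the CUDA kernel writes later overwrites the filler; every
# position it never touches keeps default_value.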

bigwig_loader/pytorch.py

Lines changed: 21 additions & 4 deletions
@@ -124,8 +124,8 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
         reference_genome_path: path to fasta file containing the reference genome.
         sequence_length: number of base pairs in input sequence
         center_bin_to_predict: if given, only do prediction on a central window. Should be
-            smaller than or equal to sequence_length. If not given will be the same as
-            sequence_length.
+            smaller than or equal to sequence_length. If None, the whole sequence length
+            will be used. Default: None
         window_size: used to down sample the resolution of the target from sequence_length
         moving_average_window_size: window size for moving average on the target. Can
             help too smooth out the target. Default: 1, which means no smoothing. If
@@ -141,8 +141,21 @@ class PytorchBigWigDataset(IterableDataset[BATCH_TYPE]):
         maximum_unknown_bases_fraction: maximum number of bases in an input sequence that
             is unknown.
         sequence_encoder: encoder to apply to the sequence. Default: bigwig_loader.util.onehot_sequences
-        position_samples_buffer_size: number of intervals picked up front by the position sampler.
+        file_extensions: load files with these extensions (default .bw and .bigWig)
+        crawl: whether to search in sub-directories for BigWig files
+        scale: Optional, dictionary with scaling factors for each BigWig file.
+            If None, no scaling is done. Keys can be (partial) file paths. See
+            bigwig_loader.path.match_key_to_path for more information about how
+            dict keys are mapped to paths.
+        default_value: value to use for intervals that are not present in the
+            bigwig file. Defaults to 0.0. Can be set to cp.nan to differentiate
+            between missing values listed as 0.0.
+        first_n_files: Only use the first n files (handy for debugging on less tasks)
+        position_sampler_buffer_size: number of intervals picked up front by the position sampler.
             When all intervals are used, new intervals are picked.
+        repeat_same_positions: if True the positions sampler does not draw a new random collection
+            of positions when the buffer runs out, but repeats the same samples. Can be used to
+            check whether network can overfit.
         sub_sample_tracks: int, if set a different random set of tracks is selected in each
             superbatch from the total number of tracks. The indices corresponding to those tracks
             are returned in the output.
@@ -160,7 +173,7 @@ def __init__(
         collection: Union[str, Sequence[str], Path, Sequence[Path], BigWigCollection],
         reference_genome_path: Path,
         sequence_length: int = 1000,
-        center_bin_to_predict: Optional[int] = 200,
+        center_bin_to_predict: Optional[int] = None,
         window_size: int = 1,
         moving_average_window_size: int = 1,
         batch_size: int = 256,
@@ -172,6 +185,8 @@ def __init__(
         ] = "onehot",
         file_extensions: Sequence[str] = (".bigWig", ".bw"),
         crawl: bool = True,
+        scale: Optional[dict[Union[str | Path], Any]] = None,
+        default_value: float = 0.0,
         first_n_files: Optional[int] = None,
         position_sampler_buffer_size: int = 100000,
         repeat_same_positions: bool = False,
@@ -195,6 +210,8 @@ def __init__(
             sequence_encoder=sequence_encoder,
             file_extensions=file_extensions,
             crawl=crawl,
+            scale=scale,
+            default_value=default_value,
             first_n_files=first_n_files,
             position_sampler_buffer_size=position_sampler_buffer_size,
             repeat_same_positions=repeat_same_positions,
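
Once batches can carry NaN to mean "no data", training code has to mask those positions before computing a loss, or a single NaN poisons the gradient. A hedged sketch of one way to do that on the PyTorch side (not part of this commit):

import torch

def masked_mse(prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    mask = ~torch.isnan(target)  # True where the bigwig actually had data
    return torch.mean((prediction[mask] - target[mask]) ** 2)

pred = torch.rand(2, 3)
tgt = torch.tensor([[1.0, float("nan"), 0.0], [float("nan"), 2.0, 3.0]])
print(masked_mse(pred, tgt))  # finite: the NaN-filled targets are excluded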

bigwig_loader/streamed_dataset.py

Lines changed: 4 additions & 1 deletion
@@ -108,6 +108,7 @@ def __init__(
         queue_size: int = 10,
         slice_size: int | None = None,
         window_size: int = 1,
+        default_value: float = 0.0,
     ):
         self.input_generator = input_generator
         self.collection = collection
@@ -127,6 +128,7 @@ def __init__(
         self.data_generator_thread: threading.Thread | None = None
         self._entered = False
         self._out = None
+        self._default_value = default_value

     def __enter__(self) -> "StreamedDataloader":
         self._entered = True
@@ -214,6 +216,7 @@ def _generate_batches(self) -> Generator[Batch, None, None]:
                 found_starts=found_starts[:, select],
                 found_ends=found_ends[:, select],
                 window_size=self.window_size,
+                default_value=self._default_value,
                 out=out,
             )

@@ -248,7 +251,7 @@ def _get_out_tensor(
         shape = (number_of_tracks, batch_size, sequence_length)

         if self._out is None or self._out.shape != shape:
-            self._out = cp.zeros(shape, dtype=cp.float32)
+            self._out = cp.full(shape, self._default_value, dtype=cp.float32)
         return self._out

     def _determine_slice_size(self, n_samples: int) -> int:
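
_get_out_tensor caches one output buffer and reallocates only when the requested shape changes; stale values cannot leak between batches because intervals_to_values refills the buffer with default_value on every call (the out.fill above). A toy version of the caching pattern:

import cupy as cp

class OutCache:
    """Minimal stand-in for StreamedDataloader's output-buffer reuse."""

    def __init__(self, default_value: float = 0.0) -> None:
        self._default_value = default_value
        self._out: cp.ndarray | None = None

    def get(self, shape: tuple[int, ...]) -> cp.ndarray:
        # Reallocate only on a shape change; otherwise hand back the same buffer.
        if self._out is None or self._out.shape != shape:
            self._out = cp.full(shape, self._default_value, dtype=cp.float32)
        return self._out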

tests/test_collection.py

Lines changed: 23 additions & 0 deletions
@@ -57,6 +57,29 @@ def test_get_batch(collection):
     assert batch.shape == (256, n_files, 1000)


+def test_get_batch_with_nans(collection):
+    """
+    Testing whether NaNs are returned when setting
+    the default_value to cp.nan. Giving it some
+    coordinates way beyond the total chromosome
+    length now because the bigwigs I am testing
+    on include intervals with value 0.0 instead
+    of just not listing it.
+    """
+    n_files = len(collection.bigwig_paths)
+
+    batch = collection.get_batch(
+        ["chr1", "chr20", "chr4"],
+        [0, 99998999, 99998999],
+        [1000, 99999999, 99999999],
+        default_value=cp.nan,
+    )
+
+    print(batch)
+    assert batch.shape == (3, n_files, 1000)
+    assert cp.any(cp.isnan(batch))
+
+
 def test_exclude_intervals(collection):
     intervals = collection.intervals(
         ["chr3", "chr4", "chr5"], exclude_chromosomes=["chr4"]

tests/test_intervals_to_values.py

Lines changed: 31 additions & 0 deletions
@@ -230,3 +230,34 @@ def test_get_values_from_intervals_batch_multiple_tracks() -> None:
     print(expected)
     print(values)
     assert (values == expected).all()
+
+
+def test_default_nan() -> None:
+    """Query end is exactly at end index before "gap"
+    Now instead of zeros, NaN values should be
+    used.
+    ."""
+    track_starts = cp.asarray([5, 10, 12, 18], dtype=cp.int32)
+    track_ends = cp.asarray([10, 12, 14, 20], dtype=cp.int32)
+    track_values = cp.asarray([20.0, 30.0, 40.0, 50.0], dtype=cp.dtype("f4"))
+    query_starts = cp.asarray([7, 9], dtype=cp.int32)
+    query_ends = cp.asarray([18, 20], dtype=cp.int32)
+    reserved = cp.zeros([2, 11], dtype=cp.dtype("<f4"))
+    values = intervals_to_values(
+        track_starts,
+        track_ends,
+        track_values,
+        query_starts,
+        query_ends,
+        default_value=cp.nan,
+        out=reserved,
+    )
+    expected = cp.asarray(
+        [
+            [20.0, 20.0, 20.0, 30.0, 30.0, 40.0, 40.0, cp.nan, cp.nan, cp.nan, cp.nan],
+            [20.0, 30.0, 30.0, 40.0, 40.0, cp.nan, cp.nan, cp.nan, cp.nan, 50.0, 50.0],
+        ]
+    )
+    print(expected)
+    print(values)
+    assert cp.allclose(values, expected, equal_nan=True)
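
One detail in the assertion deserves a note: the earlier tests compare with (values == expected).all(), but that can never succeed once NaN is involved, since IEEE 754 defines NaN as unequal to everything, itself included. Hence allclose with equal_nan=True:

import cupy as cp

a = cp.asarray([1.0, cp.nan], dtype=cp.float32)
print(bool((a == a).all()))                     # False: nan != nan
print(bool(cp.allclose(a, a, equal_nan=True)))  # True: NaNs compared as equal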
